1
2/*--------------------------------------------------------------------*/
3/*--- begin                                       guest_x86_toIR.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2011 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36/* Translates x86 code to IR. */
37
38/* TODO:
39
40   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
41   to ensure a 32-bit value is being written.
42
43   FUCOMI(P): what happens to A and S flags?  Currently are forced
44      to zero.
45
46   x87 FP Limitations:
47
48   * all arithmetic done at 64 bits
49
50   * no FP exceptions, except for handling stack over/underflow
51
52   * FP rounding mode observed only for float->int conversions
53     and int->float conversions which could lose accuracy, and
54     for float-to-float rounding.  For all other operations,
55     round-to-nearest is used, regardless.
56
57   * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
58     simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
59     even when it isn't.
60
61   * some of the FCOM cases could do with testing -- not convinced
62     that the args are the right way round.
63
64   * FSAVE does not re-initialise the FPU; it should do
65
66   * FINIT not only initialises the FPU environment, it also
67     zeroes all the FP registers.  It should leave the registers
68     unchanged.
69
70   SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
71   per Intel docs this bit has no meaning anyway.  Since PUSHF is the
72   only way to observe eflags[1], a proper fix would be to make that
73   bit be set by PUSHF.
74
75   The state of %eflags.AC (alignment check, bit 18) is recorded by
76   the simulation (viz, if you set it with popf then a pushf produces
77   the value you set it to), but it is otherwise ignored.  In
78   particular, setting it to 1 does NOT cause alignment checking to
79   happen.  Programs that set it to 1 and then rely on the resulting
80   SIGBUSs to inform them of misaligned accesses will not work.
81
82   Implementation of sysenter is necessarily partial.  sysenter is a
83   kind of system call entry.  When doing a sysenter, the return
84   address is not known -- that is something that is beyond Vex's
85   knowledge.  So the generated IR forces a return to the scheduler,
86   which can do what it likes to simulate the systenter, but it MUST
87   set this thread's guest_EIP field with the continuation address
88   before resuming execution.  If that doesn't happen, the thread will
89   jump to address zero, which is probably fatal.
90
91   This module uses global variables and so is not MT-safe (if that
92   should ever become relevant).
93
94   The delta values are 32-bit ints, not 64-bit ints.  That means
95   this module may not work right if run on a 64-bit host.  That should
96   be fixed properly, really -- if anyone ever wants to use Vex to
97   translate x86 code for execution on a 64-bit host.
98
99   casLE (implementation of lock-prefixed insns) and rep-prefixed
100   insns: the side-exit back to the start of the insn is done with
101   Ijk_Boring.  This is quite wrong, it should be done with
102   Ijk_NoRedir, since otherwise the side exit, which is intended to
103   restart the instruction for whatever reason, could go somewhere
104   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
105   no-redir jumps performance critical, at least for rep-prefixed
106   instructions, since all iterations thereof would involve such a
107   jump.  It's not such a big deal with casLE since the side exit is
108   only taken if the CAS fails, that is, the location is contended,
109   which is relatively unlikely.
110
111   XXXX: Nov 2009: handling of SWP on ARM suffers from the same
112   problem.
113
114   Note also, the test for CAS success vs failure is done using
115   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
116   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
117   shouldn't definedness-check these comparisons.  See
118   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
119   background/rationale.
120*/
121
122/* Performance holes:
123
124   - fcom ; fstsw %ax ; sahf
125     sahf does not update the O flag (sigh) and so O needs to
126     be computed.  This is done expensively; it would be better
127     to have a calculate_eflags_o helper.
128
129   - emwarns; some FP codes can generate huge numbers of these
130     if the fpucw is changed in an inner loop.  It would be
131     better for the guest state to have an emwarn-enable reg
132     which can be set zero or nonzero.  If it is zero, emwarns
133     are not flagged, and instead control just flows all the
134     way through bbs as usual.
135*/
136
137/* "Special" instructions.
138
139   This instruction decoder can decode three special instructions
140   which mean nothing natively (are no-ops as far as regs/mem are
141   concerned) but have meaning for supporting Valgrind.  A special
142   instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
143   C1C713 (in the standard interpretation, that means: roll $3, %edi;
144   roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
145   one of the following 3 are allowed (standard interpretation in
146   parentheses):
147
148      87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
149      87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
150      87D2 (xchgl %edx,%edx)   call-noredir *%EAX
151
152   Any other bytes following the 12-byte preamble are illegal and
153   constitute a failure in instruction decoding.  This all assumes
154   that the preamble will never occur except in specific code
155   fragments designed for Valgrind to catch.
156
157   No prefixes may precede a "Special" instruction.
158*/
159
160/* LOCK prefixed instructions.  These are translated using IR-level
161   CAS statements (IRCAS) and are believed to preserve atomicity, even
162   from the point of view of some other process racing against a
163   simulated one (presumably they communicate via a shared memory
164   segment).
165
166   Handlers which are aware of LOCK prefixes are:
167      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
168      dis_cmpxchg_G_E  (cmpxchg)
169      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
170      dis_Grp3         (not, neg)
171      dis_Grp4         (inc, dec)
172      dis_Grp5         (inc, dec)
173      dis_Grp8_Imm     (bts, btc, btr)
174      dis_bt_G_E       (bts, btc, btr)
175      dis_xadd_G_E     (xadd)
176*/
177
178
179#include "libvex_basictypes.h"
180#include "libvex_ir.h"
181#include "libvex.h"
182#include "libvex_guest_x86.h"
183
184#include "main_util.h"
185#include "main_globals.h"
186#include "guest_generic_bb_to_IR.h"
187#include "guest_generic_x87.h"
188#include "guest_x86_defs.h"
189
190
191/*------------------------------------------------------------*/
192/*--- Globals                                              ---*/
193/*------------------------------------------------------------*/
194
195/* These are set at the start of the translation of an insn, right
196   down in disInstr_X86, so that we don't have to pass them around
197   endlessly.  They are all constant during the translation of any
198   given insn. */
199
200/* We need to know this to do sub-register accesses correctly. */
201static Bool host_is_bigendian;
202
203/* Pointer to the guest code area (points to start of BB, not to the
204   insn being processed). */
205static UChar* guest_code;
206
207/* The guest address corresponding to guest_code[0]. */
208static Addr32 guest_EIP_bbstart;
209
210/* The guest address for the instruction currently being
211   translated. */
212static Addr32 guest_EIP_curr_instr;
213
214/* The IRSB* into which we're generating code. */
215static IRSB* irsb;
216
217
218/*------------------------------------------------------------*/
219/*--- Debugging output                                     ---*/
220/*------------------------------------------------------------*/
221
222#define DIP(format, args...)           \
223   if (vex_traceflags & VEX_TRACE_FE)  \
224      vex_printf(format, ## args)
225
226#define DIS(buf, format, args...)      \
227   if (vex_traceflags & VEX_TRACE_FE)  \
228      vex_sprintf(buf, format, ## args)
229
230
231/*------------------------------------------------------------*/
232/*--- Offsets of various parts of the x86 guest state.     ---*/
233/*------------------------------------------------------------*/
234
235#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
236#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
237#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
238#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
239#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
240#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
241#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
242#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
243
244#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
245
246#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
247#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
248#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
249#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
250
251#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
252#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
253#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
254#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
255#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
256#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
257#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
258#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
259
260#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
261#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
262#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
263#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
264#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
265#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
266#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
267#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
268
269#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
270#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
271#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
272#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
273#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
274#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
275#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
276#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
277#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
278
279#define OFFB_EMWARN    offsetof(VexGuestX86State,guest_EMWARN)
280
281#define OFFB_TISTART   offsetof(VexGuestX86State,guest_TISTART)
282#define OFFB_TILEN     offsetof(VexGuestX86State,guest_TILEN)
283#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
284
285#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
286
287
288/*------------------------------------------------------------*/
289/*--- Helper bits and pieces for deconstructing the        ---*/
290/*--- x86 insn stream.                                     ---*/
291/*------------------------------------------------------------*/
292
293/* This is the Intel register encoding -- integer regs. */
294#define R_EAX 0
295#define R_ECX 1
296#define R_EDX 2
297#define R_EBX 3
298#define R_ESP 4
299#define R_EBP 5
300#define R_ESI 6
301#define R_EDI 7
302
303#define R_AL (0+R_EAX)
304#define R_AH (4+R_EAX)
305
306/* This is the Intel register encoding -- segment regs. */
307#define R_ES 0
308#define R_CS 1
309#define R_SS 2
310#define R_DS 3
311#define R_FS 4
312#define R_GS 5
313
314
315/* Add a statement to the list held by "irbb". */
316static void stmt ( IRStmt* st )
317{
318   addStmtToIRSB( irsb, st );
319}
320
321/* Generate a new temporary of the given type. */
322static IRTemp newTemp ( IRType ty )
323{
324   vassert(isPlausibleIRType(ty));
325   return newIRTemp( irsb->tyenv, ty );
326}
327
328/* Various simple conversions */
329
330static UInt extend_s_8to32( UInt x )
331{
332   return (UInt)((((Int)x) << 24) >> 24);
333}
334
335static UInt extend_s_16to32 ( UInt x )
336{
337   return (UInt)((((Int)x) << 16) >> 16);
338}
339
340/* Fetch a byte from the guest insn stream. */
341static UChar getIByte ( Int delta )
342{
343   return guest_code[delta];
344}
345
346/* Extract the reg field from a modRM byte. */
347static Int gregOfRM ( UChar mod_reg_rm )
348{
349   return (Int)( (mod_reg_rm >> 3) & 7 );
350}
351
352/* Figure out whether the mod and rm parts of a modRM byte refer to a
353   register or memory.  If so, the byte will have the form 11XXXYYY,
354   where YYY is the register number. */
355static Bool epartIsReg ( UChar mod_reg_rm )
356{
357   return toBool(0xC0 == (mod_reg_rm & 0xC0));
358}
359
360/* ... and extract the register number ... */
361static Int eregOfRM ( UChar mod_reg_rm )
362{
363   return (Int)(mod_reg_rm & 0x7);
364}
365
366/* Get a 8/16/32-bit unsigned value out of the insn stream. */
367
368static UChar getUChar ( Int delta )
369{
370   UChar v = guest_code[delta+0];
371   return toUChar(v);
372}
373
374static UInt getUDisp16 ( Int delta )
375{
376   UInt v = guest_code[delta+1]; v <<= 8;
377   v |= guest_code[delta+0];
378   return v & 0xFFFF;
379}
380
381static UInt getUDisp32 ( Int delta )
382{
383   UInt v = guest_code[delta+3]; v <<= 8;
384   v |= guest_code[delta+2]; v <<= 8;
385   v |= guest_code[delta+1]; v <<= 8;
386   v |= guest_code[delta+0];
387   return v;
388}
389
390static UInt getUDisp ( Int size, Int delta )
391{
392   switch (size) {
393      case 4: return getUDisp32(delta);
394      case 2: return getUDisp16(delta);
395      case 1: return (UInt)getUChar(delta);
396      default: vpanic("getUDisp(x86)");
397   }
398   return 0; /*notreached*/
399}
400
401
402/* Get a byte value out of the insn stream and sign-extend to 32
403   bits. */
404static UInt getSDisp8 ( Int delta )
405{
406   return extend_s_8to32( (UInt) (guest_code[delta]) );
407}
408
409static UInt getSDisp16 ( Int delta0 )
410{
411   UChar* eip = (UChar*)(&guest_code[delta0]);
412   UInt d = *eip++;
413   d |= ((*eip++) << 8);
414   return extend_s_16to32(d);
415}
416
417static UInt getSDisp ( Int size, Int delta )
418{
419   switch (size) {
420      case 4: return getUDisp32(delta);
421      case 2: return getSDisp16(delta);
422      case 1: return getSDisp8(delta);
423      default: vpanic("getSDisp(x86)");
424  }
425  return 0; /*notreached*/
426}
427
428
429/*------------------------------------------------------------*/
430/*--- Helpers for constructing IR.                         ---*/
431/*------------------------------------------------------------*/
432
433/* Create a 1/2/4 byte read of an x86 integer registers.  For 16/8 bit
434   register references, we need to take the host endianness into
435   account.  Supplied value is 0 .. 7 and in the Intel instruction
436   encoding. */
437
438static IRType szToITy ( Int n )
439{
440   switch (n) {
441      case 1: return Ity_I8;
442      case 2: return Ity_I16;
443      case 4: return Ity_I32;
444      default: vpanic("szToITy(x86)");
445   }
446}
447
448/* On a little-endian host, less significant bits of the guest
449   registers are at lower addresses.  Therefore, if a reference to a
450   register low half has the safe guest state offset as a reference to
451   the full register.
452*/
453static Int integerGuestRegOffset ( Int sz, UInt archreg )
454{
455   vassert(archreg < 8);
456
457   /* Correct for little-endian host only. */
458   vassert(!host_is_bigendian);
459
460   if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
461      switch (archreg) {
462         case R_EAX: return OFFB_EAX;
463         case R_EBX: return OFFB_EBX;
464         case R_ECX: return OFFB_ECX;
465         case R_EDX: return OFFB_EDX;
466         case R_ESI: return OFFB_ESI;
467         case R_EDI: return OFFB_EDI;
468         case R_ESP: return OFFB_ESP;
469         case R_EBP: return OFFB_EBP;
470         default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
471      }
472   }
473
474   vassert(archreg >= 4 && archreg < 8 && sz == 1);
475   switch (archreg-4) {
476      case R_EAX: return 1+ OFFB_EAX;
477      case R_EBX: return 1+ OFFB_EBX;
478      case R_ECX: return 1+ OFFB_ECX;
479      case R_EDX: return 1+ OFFB_EDX;
480      default: vpanic("integerGuestRegOffset(x86,le)(1h)");
481   }
482
483   /* NOTREACHED */
484   vpanic("integerGuestRegOffset(x86,le)");
485}
486
487static Int segmentGuestRegOffset ( UInt sreg )
488{
489   switch (sreg) {
490      case R_ES: return OFFB_ES;
491      case R_CS: return OFFB_CS;
492      case R_SS: return OFFB_SS;
493      case R_DS: return OFFB_DS;
494      case R_FS: return OFFB_FS;
495      case R_GS: return OFFB_GS;
496      default: vpanic("segmentGuestRegOffset(x86)");
497   }
498}
499
500static Int xmmGuestRegOffset ( UInt xmmreg )
501{
502   switch (xmmreg) {
503      case 0: return OFFB_XMM0;
504      case 1: return OFFB_XMM1;
505      case 2: return OFFB_XMM2;
506      case 3: return OFFB_XMM3;
507      case 4: return OFFB_XMM4;
508      case 5: return OFFB_XMM5;
509      case 6: return OFFB_XMM6;
510      case 7: return OFFB_XMM7;
511      default: vpanic("xmmGuestRegOffset");
512   }
513}
514
515/* Lanes of vector registers are always numbered from zero being the
516   least significant lane (rightmost in the register).  */
517
518static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
519{
520   /* Correct for little-endian host only. */
521   vassert(!host_is_bigendian);
522   vassert(laneno >= 0 && laneno < 8);
523   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
524}
525
526static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
527{
528   /* Correct for little-endian host only. */
529   vassert(!host_is_bigendian);
530   vassert(laneno >= 0 && laneno < 4);
531   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
532}
533
534static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
535{
536   /* Correct for little-endian host only. */
537   vassert(!host_is_bigendian);
538   vassert(laneno >= 0 && laneno < 2);
539   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
540}
541
542static IRExpr* getIReg ( Int sz, UInt archreg )
543{
544   vassert(sz == 1 || sz == 2 || sz == 4);
545   vassert(archreg < 8);
546   return IRExpr_Get( integerGuestRegOffset(sz,archreg),
547                      szToITy(sz) );
548}
549
550/* Ditto, but write to a reg instead. */
551static void putIReg ( Int sz, UInt archreg, IRExpr* e )
552{
553   IRType ty = typeOfIRExpr(irsb->tyenv, e);
554   switch (sz) {
555      case 1: vassert(ty == Ity_I8); break;
556      case 2: vassert(ty == Ity_I16); break;
557      case 4: vassert(ty == Ity_I32); break;
558      default: vpanic("putIReg(x86)");
559   }
560   vassert(archreg < 8);
561   stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
562}
563
564static IRExpr* getSReg ( UInt sreg )
565{
566   return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
567}
568
569static void putSReg ( UInt sreg, IRExpr* e )
570{
571   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
572   stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
573}
574
575static IRExpr* getXMMReg ( UInt xmmreg )
576{
577   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
578}
579
580static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
581{
582   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
583}
584
585static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
586{
587   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
588}
589
590static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
591{
592   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
593}
594
595static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
596{
597   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
598}
599
600static void putXMMReg ( UInt xmmreg, IRExpr* e )
601{
602   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
603   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
604}
605
606static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
607{
608   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
609   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
610}
611
612static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
613{
614   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
615   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
616}
617
618static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
619{
620   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
621   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
622}
623
624static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
625{
626   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
627   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
628}
629
630static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
631{
632   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
633   stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
634}
635
636static void assign ( IRTemp dst, IRExpr* e )
637{
638   stmt( IRStmt_WrTmp(dst, e) );
639}
640
641static void storeLE ( IRExpr* addr, IRExpr* data )
642{
643   stmt( IRStmt_Store(Iend_LE, addr, data) );
644}
645
646static IRExpr* unop ( IROp op, IRExpr* a )
647{
648   return IRExpr_Unop(op, a);
649}
650
651static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
652{
653   return IRExpr_Binop(op, a1, a2);
654}
655
656static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
657{
658   return IRExpr_Triop(op, a1, a2, a3);
659}
660
661static IRExpr* mkexpr ( IRTemp tmp )
662{
663   return IRExpr_RdTmp(tmp);
664}
665
666static IRExpr* mkU8 ( UInt i )
667{
668   vassert(i < 256);
669   return IRExpr_Const(IRConst_U8( (UChar)i ));
670}
671
672static IRExpr* mkU16 ( UInt i )
673{
674   vassert(i < 65536);
675   return IRExpr_Const(IRConst_U16( (UShort)i ));
676}
677
678static IRExpr* mkU32 ( UInt i )
679{
680   return IRExpr_Const(IRConst_U32(i));
681}
682
683static IRExpr* mkU64 ( ULong i )
684{
685   return IRExpr_Const(IRConst_U64(i));
686}
687
688static IRExpr* mkU ( IRType ty, UInt i )
689{
690   if (ty == Ity_I8)  return mkU8(i);
691   if (ty == Ity_I16) return mkU16(i);
692   if (ty == Ity_I32) return mkU32(i);
693   /* If this panics, it usually means you passed a size (1,2,4)
694      value as the IRType, rather than a real IRType. */
695   vpanic("mkU(x86)");
696}
697
698static IRExpr* mkV128 ( UShort mask )
699{
700   return IRExpr_Const(IRConst_V128(mask));
701}
702
703static IRExpr* loadLE ( IRType ty, IRExpr* addr )
704{
705   return IRExpr_Load(Iend_LE, ty, addr);
706}
707
708static IROp mkSizedOp ( IRType ty, IROp op8 )
709{
710   Int adj;
711   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
712   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
713           || op8 == Iop_Mul8
714           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
715           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
716           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
717           || op8 == Iop_CasCmpNE8
718           || op8 == Iop_Not8);
719   adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
720   return adj + op8;
721}
722
723static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
724{
725   if (szSmall == 1 && szBig == 4) {
726      return signd ? Iop_8Sto32 : Iop_8Uto32;
727   }
728   if (szSmall == 1 && szBig == 2) {
729      return signd ? Iop_8Sto16 : Iop_8Uto16;
730   }
731   if (szSmall == 2 && szBig == 4) {
732      return signd ? Iop_16Sto32 : Iop_16Uto32;
733   }
734   vpanic("mkWidenOp(x86,guest)");
735}
736
737static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
738{
739   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
740   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
741   return unop(Iop_32to1,
742               binop(Iop_And32,
743                     unop(Iop_1Uto32,x),
744                     unop(Iop_1Uto32,y)));
745}
746
747/* Generate a compare-and-swap operation, operating on memory at
748   'addr'.  The expected value is 'expVal' and the new value is
749   'newVal'.  If the operation fails, then transfer control (with a
750   no-redir jump (XXX no -- see comment at top of this file)) to
751   'restart_point', which is presumably the address of the guest
752   instruction again -- retrying, essentially. */
753static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
754                    Addr32 restart_point )
755{
756   IRCAS* cas;
757   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
758   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
759   IRTemp oldTmp = newTemp(tyE);
760   IRTemp expTmp = newTemp(tyE);
761   vassert(tyE == tyN);
762   vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
763   assign(expTmp, expVal);
764   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
765                  NULL, mkexpr(expTmp), NULL, newVal );
766   stmt( IRStmt_CAS(cas) );
767   stmt( IRStmt_Exit(
768            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
769                   mkexpr(oldTmp), mkexpr(expTmp) ),
770            Ijk_Boring, /*Ijk_NoRedir*/
771            IRConst_U32( restart_point )
772         ));
773}
774
775
776/*------------------------------------------------------------*/
777/*--- Helpers for %eflags.                                 ---*/
778/*------------------------------------------------------------*/
779
780/* -------------- Evaluating the flags-thunk. -------------- */
781
782/* Build IR to calculate all the eflags from stored
783   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
784   Ity_I32. */
785static IRExpr* mk_x86g_calculate_eflags_all ( void )
786{
787   IRExpr** args
788      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
789                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
790                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
791                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
792   IRExpr* call
793      = mkIRExprCCall(
794           Ity_I32,
795           0/*regparm*/,
796           "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
797           args
798        );
799   /* Exclude OP and NDEP from definedness checking.  We're only
800      interested in DEP1 and DEP2. */
801   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
802   return call;
803}
804
805/* Build IR to calculate some particular condition from stored
806   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
807   Ity_Bit. */
808static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
809{
810   IRExpr** args
811      = mkIRExprVec_5( mkU32(cond),
812                       IRExpr_Get(OFFB_CC_OP,  Ity_I32),
813                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
814                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
815                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
816   IRExpr* call
817      = mkIRExprCCall(
818           Ity_I32,
819           0/*regparm*/,
820           "x86g_calculate_condition", &x86g_calculate_condition,
821           args
822        );
823   /* Exclude the requested condition, OP and NDEP from definedness
824      checking.  We're only interested in DEP1 and DEP2. */
825   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
826   return unop(Iop_32to1, call);
827}
828
829/* Build IR to calculate just the carry flag from stored
830   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
831static IRExpr* mk_x86g_calculate_eflags_c ( void )
832{
833   IRExpr** args
834      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
835                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
836                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
837                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
838   IRExpr* call
839      = mkIRExprCCall(
840           Ity_I32,
841           3/*regparm*/,
842           "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
843           args
844        );
845   /* Exclude OP and NDEP from definedness checking.  We're only
846      interested in DEP1 and DEP2. */
847   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
848   return call;
849}
850
851
852/* -------------- Building the flags-thunk. -------------- */
853
854/* The machinery in this section builds the flag-thunk following a
855   flag-setting operation.  Hence the various setFlags_* functions.
856*/
857
858static Bool isAddSub ( IROp op8 )
859{
860   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
861}
862
863static Bool isLogic ( IROp op8 )
864{
865   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
866}
867
868/* U-widen 8/16/32 bit int expr to 32. */
869static IRExpr* widenUto32 ( IRExpr* e )
870{
871   switch (typeOfIRExpr(irsb->tyenv,e)) {
872      case Ity_I32: return e;
873      case Ity_I16: return unop(Iop_16Uto32,e);
874      case Ity_I8:  return unop(Iop_8Uto32,e);
875      default: vpanic("widenUto32");
876   }
877}
878
879/* S-widen 8/16/32 bit int expr to 32. */
880static IRExpr* widenSto32 ( IRExpr* e )
881{
882   switch (typeOfIRExpr(irsb->tyenv,e)) {
883      case Ity_I32: return e;
884      case Ity_I16: return unop(Iop_16Sto32,e);
885      case Ity_I8:  return unop(Iop_8Sto32,e);
886      default: vpanic("widenSto32");
887   }
888}
889
890/* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
891   of these combinations make sense. */
892static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
893{
894   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
895   if (src_ty == dst_ty)
896      return e;
897   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
898      return unop(Iop_32to16, e);
899   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
900      return unop(Iop_32to8, e);
901
902   vex_printf("\nsrc, dst tys are: ");
903   ppIRType(src_ty);
904   vex_printf(", ");
905   ppIRType(dst_ty);
906   vex_printf("\n");
907   vpanic("narrowTo(x86)");
908}
909
910
911/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
912   auto-sized up to the real op. */
913
914static
915void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
916{
917   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
918
919   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
920
921   switch (op8) {
922      case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
923      case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
924      default:       ppIROp(op8);
925                     vpanic("setFlags_DEP1_DEP2(x86)");
926   }
927   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
928   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
929   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
930   /* Set NDEP even though it isn't used.  This makes redundant-PUT
931      elimination of previous stores to this field work better. */
932   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
933}
934
935
936/* Set the OP and DEP1 fields only, and write zero to DEP2. */
937
938static
939void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
940{
941   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
942
943   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
944
945   switch (op8) {
946      case Iop_Or8:
947      case Iop_And8:
948      case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
949      default:       ppIROp(op8);
950                     vpanic("setFlags_DEP1(x86)");
951   }
952   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
953   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
954   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
955   /* Set NDEP even though it isn't used.  This makes redundant-PUT
956      elimination of previous stores to this field work better. */
957   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
958}
959
960
961/* For shift operations, we put in the result and the undershifted
962   result.  Except if the shift amount is zero, the thunk is left
963   unchanged. */
964
965static void setFlags_DEP1_DEP2_shift ( IROp    op32,
966                                       IRTemp  res,
967                                       IRTemp  resUS,
968                                       IRType  ty,
969                                       IRTemp  guard )
970{
971   Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
972
973   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
974   vassert(guard);
975
976   /* Both kinds of right shifts are handled by the same thunk
977      operation. */
978   switch (op32) {
979      case Iop_Shr32:
980      case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
981      case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
982      default:        ppIROp(op32);
983                      vpanic("setFlags_DEP1_DEP2_shift(x86)");
984   }
985
986   /* DEP1 contains the result, DEP2 contains the undershifted value. */
987   stmt( IRStmt_Put( OFFB_CC_OP,
988                     IRExpr_Mux0X( mkexpr(guard),
989                                   IRExpr_Get(OFFB_CC_OP,Ity_I32),
990                                   mkU32(ccOp))) );
991   stmt( IRStmt_Put( OFFB_CC_DEP1,
992                     IRExpr_Mux0X( mkexpr(guard),
993                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
994                                   widenUto32(mkexpr(res)))) );
995   stmt( IRStmt_Put( OFFB_CC_DEP2,
996                     IRExpr_Mux0X( mkexpr(guard),
997                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
998                                   widenUto32(mkexpr(resUS)))) );
999   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1000      elimination of previous stores to this field work better. */
1001   stmt( IRStmt_Put( OFFB_CC_NDEP,
1002                     IRExpr_Mux0X( mkexpr(guard),
1003                                   IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
1004				   mkU32(0) )));
1005}
1006
1007
1008/* For the inc/dec case, we store in DEP1 the result value and in NDEP
1009   the former value of the carry flag, which unfortunately we have to
1010   compute. */
1011
1012static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
1013{
1014   Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
1015
1016   ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
1017   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
1018
1019   /* This has to come first, because calculating the C flag
1020      may require reading all four thunk fields. */
1021   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
1022   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
1023   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
1024   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
1025}
1026
1027
1028/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1029   two arguments. */
1030
1031static
1032void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
1033{
1034   switch (ty) {
1035      case Ity_I8:
1036         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
1037         break;
1038      case Ity_I16:
1039         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
1040         break;
1041      case Ity_I32:
1042         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
1043         break;
1044      default:
1045         vpanic("setFlags_MUL(x86)");
1046   }
1047   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
1048   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
1049   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1050      elimination of previous stores to this field work better. */
1051   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
1052}
1053
1054
1055/* -------------- Condition codes. -------------- */
1056
1057/* Condition codes, using the Intel encoding.  */
1058
1059static HChar* name_X86Condcode ( X86Condcode cond )
1060{
1061   switch (cond) {
1062      case X86CondO:      return "o";
1063      case X86CondNO:     return "no";
1064      case X86CondB:      return "b";
1065      case X86CondNB:     return "nb";
1066      case X86CondZ:      return "z";
1067      case X86CondNZ:     return "nz";
1068      case X86CondBE:     return "be";
1069      case X86CondNBE:    return "nbe";
1070      case X86CondS:      return "s";
1071      case X86CondNS:     return "ns";
1072      case X86CondP:      return "p";
1073      case X86CondNP:     return "np";
1074      case X86CondL:      return "l";
1075      case X86CondNL:     return "nl";
1076      case X86CondLE:     return "le";
1077      case X86CondNLE:    return "nle";
1078      case X86CondAlways: return "ALWAYS";
1079      default: vpanic("name_X86Condcode");
1080   }
1081}
1082
1083static
1084X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
1085                                      Bool*        needInvert )
1086{
1087   vassert(cond >= X86CondO && cond <= X86CondNLE);
1088   if (cond & 1) {
1089      *needInvert = True;
1090      return cond-1;
1091   } else {
1092      *needInvert = False;
1093      return cond;
1094   }
1095}
1096
1097
1098/* -------------- Helpers for ADD/SUB with carry. -------------- */
1099
1100/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1101   appropriately.
1102
1103   Optionally, generate a store for the 'tres' value.  This can either
1104   be a normal store, or it can be a cas-with-possible-failure style
1105   store:
1106
1107   if taddr is IRTemp_INVALID, then no store is generated.
1108
1109   if taddr is not IRTemp_INVALID, then a store (using taddr as
1110   the address) is generated:
1111
1112     if texpVal is IRTemp_INVALID then a normal store is
1113     generated, and restart_point must be zero (it is irrelevant).
1114
1115     if texpVal is not IRTemp_INVALID then a cas-style store is
1116     generated.  texpVal is the expected value, restart_point
1117     is the restart point if the store fails, and texpVal must
1118     have the same type as tres.
1119*/
1120static void helper_ADC ( Int sz,
1121                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1122                         /* info about optional store: */
1123                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1124{
1125   UInt    thunkOp;
1126   IRType  ty    = szToITy(sz);
1127   IRTemp  oldc  = newTemp(Ity_I32);
1128   IRTemp  oldcn = newTemp(ty);
1129   IROp    plus  = mkSizedOp(ty, Iop_Add8);
1130   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1131
1132   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1133   vassert(sz == 1 || sz == 2 || sz == 4);
1134   thunkOp = sz==4 ? X86G_CC_OP_ADCL
1135                   : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
1136
1137   /* oldc = old carry flag, 0 or 1 */
1138   assign( oldc,  binop(Iop_And32,
1139                        mk_x86g_calculate_eflags_c(),
1140                        mkU32(1)) );
1141
1142   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1143
1144   assign( tres, binop(plus,
1145                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
1146                       mkexpr(oldcn)) );
1147
1148   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1149      start of this function. */
1150   if (taddr != IRTemp_INVALID) {
1151      if (texpVal == IRTemp_INVALID) {
1152         vassert(restart_point == 0);
1153         storeLE( mkexpr(taddr), mkexpr(tres) );
1154      } else {
1155         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1156         /* .. and hence 'texpVal' has the same type as 'tres'. */
1157         casLE( mkexpr(taddr),
1158                mkexpr(texpVal), mkexpr(tres), restart_point );
1159      }
1160   }
1161
1162   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1163   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
1164   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1165                                                         mkexpr(oldcn)) )) );
1166   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1167}
1168
1169
1170/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
1171   appropriately.  As with helper_ADC, possibly generate a store of
1172   the result -- see comments on helper_ADC for details.
1173*/
1174static void helper_SBB ( Int sz,
1175                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1176                         /* info about optional store: */
1177                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1178{
1179   UInt    thunkOp;
1180   IRType  ty    = szToITy(sz);
1181   IRTemp  oldc  = newTemp(Ity_I32);
1182   IRTemp  oldcn = newTemp(ty);
1183   IROp    minus = mkSizedOp(ty, Iop_Sub8);
1184   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1185
1186   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1187   vassert(sz == 1 || sz == 2 || sz == 4);
1188   thunkOp = sz==4 ? X86G_CC_OP_SBBL
1189                   : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
1190
1191   /* oldc = old carry flag, 0 or 1 */
1192   assign( oldc, binop(Iop_And32,
1193                       mk_x86g_calculate_eflags_c(),
1194                       mkU32(1)) );
1195
1196   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1197
1198   assign( tres, binop(minus,
1199                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
1200                       mkexpr(oldcn)) );
1201
1202   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1203      start of this function. */
1204   if (taddr != IRTemp_INVALID) {
1205      if (texpVal == IRTemp_INVALID) {
1206         vassert(restart_point == 0);
1207         storeLE( mkexpr(taddr), mkexpr(tres) );
1208      } else {
1209         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1210         /* .. and hence 'texpVal' has the same type as 'tres'. */
1211         casLE( mkexpr(taddr),
1212                mkexpr(texpVal), mkexpr(tres), restart_point );
1213      }
1214   }
1215
1216   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1217   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
1218   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1219                                                         mkexpr(oldcn)) )) );
1220   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1221}
1222
1223
1224/* -------------- Helpers for disassembly printing. -------------- */
1225
1226static HChar* nameGrp1 ( Int opc_aux )
1227{
1228   static HChar* grp1_names[8]
1229     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
1230   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
1231   return grp1_names[opc_aux];
1232}
1233
1234static HChar* nameGrp2 ( Int opc_aux )
1235{
1236   static HChar* grp2_names[8]
1237     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
1238   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
1239   return grp2_names[opc_aux];
1240}
1241
1242static HChar* nameGrp4 ( Int opc_aux )
1243{
1244   static HChar* grp4_names[8]
1245     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
1246   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
1247   return grp4_names[opc_aux];
1248}
1249
1250static HChar* nameGrp5 ( Int opc_aux )
1251{
1252   static HChar* grp5_names[8]
1253     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
1254   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
1255   return grp5_names[opc_aux];
1256}
1257
1258static HChar* nameGrp8 ( Int opc_aux )
1259{
1260   static HChar* grp8_names[8]
1261     = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
1262   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
1263   return grp8_names[opc_aux];
1264}
1265
1266static HChar* nameIReg ( Int size, Int reg )
1267{
1268   static HChar* ireg32_names[8]
1269     = { "%eax", "%ecx", "%edx", "%ebx",
1270         "%esp", "%ebp", "%esi", "%edi" };
1271   static HChar* ireg16_names[8]
1272     = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
1273   static HChar* ireg8_names[8]
1274     = { "%al", "%cl", "%dl", "%bl",
1275         "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
1276   if (reg < 0 || reg > 7) goto bad;
1277   switch (size) {
1278      case 4: return ireg32_names[reg];
1279      case 2: return ireg16_names[reg];
1280      case 1: return ireg8_names[reg];
1281   }
1282  bad:
1283   vpanic("nameIReg(X86)");
1284   return NULL; /*notreached*/
1285}
1286
1287static HChar* nameSReg ( UInt sreg )
1288{
1289   switch (sreg) {
1290      case R_ES: return "%es";
1291      case R_CS: return "%cs";
1292      case R_SS: return "%ss";
1293      case R_DS: return "%ds";
1294      case R_FS: return "%fs";
1295      case R_GS: return "%gs";
1296      default: vpanic("nameSReg(x86)");
1297   }
1298}
1299
1300static HChar* nameMMXReg ( Int mmxreg )
1301{
1302   static HChar* mmx_names[8]
1303     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
1304   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
1305   return mmx_names[mmxreg];
1306}
1307
1308static HChar* nameXMMReg ( Int xmmreg )
1309{
1310   static HChar* xmm_names[8]
1311     = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
1312         "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
1313   if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
1314   return xmm_names[xmmreg];
1315}
1316
1317static HChar* nameMMXGran ( Int gran )
1318{
1319   switch (gran) {
1320      case 0: return "b";
1321      case 1: return "w";
1322      case 2: return "d";
1323      case 3: return "q";
1324      default: vpanic("nameMMXGran(x86,guest)");
1325   }
1326}
1327
1328static HChar nameISize ( Int size )
1329{
1330   switch (size) {
1331      case 4: return 'l';
1332      case 2: return 'w';
1333      case 1: return 'b';
1334      default: vpanic("nameISize(x86)");
1335   }
1336}
1337
1338
1339/*------------------------------------------------------------*/
1340/*--- JMP helpers                                          ---*/
1341/*------------------------------------------------------------*/
1342
1343static void jmp_lit( IRJumpKind kind, Addr32 d32 )
1344{
1345   irsb->next     = mkU32(d32);
1346   irsb->jumpkind = kind;
1347}
1348
1349static void jmp_treg( IRJumpKind kind, IRTemp t )
1350{
1351   irsb->next = mkexpr(t);
1352   irsb->jumpkind = kind;
1353}
1354
1355static
1356void jcc_01( X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
1357{
1358   Bool        invert;
1359   X86Condcode condPos;
1360   condPos = positiveIse_X86Condcode ( cond, &invert );
1361   if (invert) {
1362      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1363                         Ijk_Boring,
1364                         IRConst_U32(d32_false) ) );
1365      irsb->next     = mkU32(d32_true);
1366      irsb->jumpkind = Ijk_Boring;
1367   } else {
1368      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1369                         Ijk_Boring,
1370                         IRConst_U32(d32_true) ) );
1371      irsb->next     = mkU32(d32_false);
1372      irsb->jumpkind = Ijk_Boring;
1373   }
1374}
1375
1376
1377/*------------------------------------------------------------*/
1378/*--- Disassembling addressing modes                       ---*/
1379/*------------------------------------------------------------*/
1380
1381static
1382HChar* sorbTxt ( UChar sorb )
1383{
1384   switch (sorb) {
1385      case 0:    return ""; /* no override */
1386      case 0x3E: return "%ds";
1387      case 0x26: return "%es:";
1388      case 0x64: return "%fs:";
1389      case 0x65: return "%gs:";
1390      default: vpanic("sorbTxt(x86,guest)");
1391   }
1392}
1393
1394
1395/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
1396   linear address by adding any required segment override as indicated
1397   by sorb. */
1398static
1399IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
1400{
1401   Int    sreg;
1402   IRType hWordTy;
1403   IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
1404
1405   if (sorb == 0)
1406      /* the common case - no override */
1407      return virtual;
1408
1409   switch (sorb) {
1410      case 0x3E: sreg = R_DS; break;
1411      case 0x26: sreg = R_ES; break;
1412      case 0x64: sreg = R_FS; break;
1413      case 0x65: sreg = R_GS; break;
1414      default: vpanic("handleSegOverride(x86,guest)");
1415   }
1416
1417   hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
1418
1419   seg_selector = newTemp(Ity_I32);
1420   ldt_ptr      = newTemp(hWordTy);
1421   gdt_ptr      = newTemp(hWordTy);
1422   r64          = newTemp(Ity_I64);
1423
1424   assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
1425   assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
1426   assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
1427
1428   /*
1429   Call this to do the translation and limit checks:
1430   ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
1431                                 UInt seg_selector, UInt virtual_addr )
1432   */
1433   assign(
1434      r64,
1435      mkIRExprCCall(
1436         Ity_I64,
1437         0/*regparms*/,
1438         "x86g_use_seg_selector",
1439         &x86g_use_seg_selector,
1440         mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
1441                        mkexpr(seg_selector), virtual)
1442      )
1443   );
1444
1445   /* If the high 32 of the result are non-zero, there was a
1446      failure in address translation.  In which case, make a
1447      quick exit.
1448   */
1449   stmt(
1450      IRStmt_Exit(
1451         binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
1452         Ijk_MapFail,
1453         IRConst_U32( guest_EIP_curr_instr )
1454      )
1455   );
1456
1457   /* otherwise, here's the translated result. */
1458   return unop(Iop_64to32, mkexpr(r64));
1459}
1460
1461
1462/* Generate IR to calculate an address indicated by a ModRM and
1463   following SIB bytes.  The expression, and the number of bytes in
1464   the address mode, are returned.  Note that this fn should not be
1465   called if the R/M part of the address denotes a register instead of
1466   memory.  If print_codegen is true, text of the addressing mode is
1467   placed in buf.
1468
1469   The computed address is stored in a new tempreg, and the
1470   identity of the tempreg is returned.  */
1471
1472static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
1473{
1474   IRTemp tmp = newTemp(Ity_I32);
1475   assign( tmp, addr32 );
1476   return tmp;
1477}
1478
1479static
1480IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
1481{
1482   UChar mod_reg_rm = getIByte(delta);
1483   delta++;
1484
1485   buf[0] = (UChar)0;
1486
1487   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1488      jump table seems a bit excessive.
1489   */
1490   mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
1491   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1492                                            /* is now XX0XXYYY */
1493   mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
1494   switch (mod_reg_rm) {
1495
1496      /* (%eax) .. (%edi), not including (%esp) or (%ebp).
1497         --> GET %reg, t
1498      */
1499      case 0x00: case 0x01: case 0x02: case 0x03:
1500      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1501         { UChar rm = mod_reg_rm;
1502           DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
1503           *len = 1;
1504           return disAMode_copy2tmp(
1505                  handleSegOverride(sorb, getIReg(4,rm)));
1506         }
1507
1508      /* d8(%eax) ... d8(%edi), not including d8(%esp)
1509         --> GET %reg, t ; ADDL d8, t
1510      */
1511      case 0x08: case 0x09: case 0x0A: case 0x0B:
1512      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1513         { UChar rm = toUChar(mod_reg_rm & 7);
1514           UInt  d  = getSDisp8(delta);
1515           DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1516           *len = 2;
1517           return disAMode_copy2tmp(
1518                  handleSegOverride(sorb,
1519                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1520         }
1521
1522      /* d32(%eax) ... d32(%edi), not including d32(%esp)
1523         --> GET %reg, t ; ADDL d8, t
1524      */
1525      case 0x10: case 0x11: case 0x12: case 0x13:
1526      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1527         { UChar rm = toUChar(mod_reg_rm & 7);
1528           UInt  d  = getUDisp32(delta);
1529           DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1530           *len = 5;
1531           return disAMode_copy2tmp(
1532                  handleSegOverride(sorb,
1533                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1534         }
1535
1536      /* a register, %eax .. %edi.  This shouldn't happen. */
1537      case 0x18: case 0x19: case 0x1A: case 0x1B:
1538      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1539         vpanic("disAMode(x86): not an addr!");
1540
1541      /* a 32-bit literal address
1542         --> MOV d32, tmp
1543      */
1544      case 0x05:
1545         { UInt d = getUDisp32(delta);
1546           *len = 5;
1547           DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
1548           return disAMode_copy2tmp(
1549                     handleSegOverride(sorb, mkU32(d)));
1550         }
1551
1552      case 0x04: {
1553         /* SIB, with no displacement.  Special cases:
1554            -- %esp cannot act as an index value.
1555               If index_r indicates %esp, zero is used for the index.
1556            -- when mod is zero and base indicates EBP, base is instead
1557               a 32-bit literal.
1558            It's all madness, I tell you.  Extract %index, %base and
1559            scale from the SIB byte.  The value denoted is then:
1560               | %index == %ESP && %base == %EBP
1561               = d32 following SIB byte
1562               | %index == %ESP && %base != %EBP
1563               = %base
1564               | %index != %ESP && %base == %EBP
1565               = d32 following SIB byte + (%index << scale)
1566               | %index != %ESP && %base != %ESP
1567               = %base + (%index << scale)
1568
1569            What happens to the souls of CPU architects who dream up such
1570            horrendous schemes, do you suppose?
1571         */
1572         UChar sib     = getIByte(delta);
1573         UChar scale   = toUChar((sib >> 6) & 3);
1574         UChar index_r = toUChar((sib >> 3) & 7);
1575         UChar base_r  = toUChar(sib & 7);
1576         delta++;
1577
1578         if (index_r != R_ESP && base_r != R_EBP) {
1579            DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
1580                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1581            *len = 2;
1582            return
1583               disAMode_copy2tmp(
1584               handleSegOverride(sorb,
1585                  binop(Iop_Add32,
1586                        getIReg(4,base_r),
1587                        binop(Iop_Shl32, getIReg(4,index_r),
1588                              mkU8(scale)))));
1589         }
1590
1591         if (index_r != R_ESP && base_r == R_EBP) {
1592            UInt d = getUDisp32(delta);
1593            DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
1594                      nameIReg(4,index_r), 1<<scale);
1595            *len = 6;
1596            return
1597               disAMode_copy2tmp(
1598               handleSegOverride(sorb,
1599                  binop(Iop_Add32,
1600                        binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
1601                        mkU32(d))));
1602         }
1603
1604         if (index_r == R_ESP && base_r != R_EBP) {
1605            DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
1606            *len = 2;
1607            return disAMode_copy2tmp(
1608                   handleSegOverride(sorb, getIReg(4,base_r)));
1609         }
1610
1611         if (index_r == R_ESP && base_r == R_EBP) {
1612            UInt d = getUDisp32(delta);
1613            DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
1614            *len = 6;
1615            return disAMode_copy2tmp(
1616                   handleSegOverride(sorb, mkU32(d)));
1617         }
1618         /*NOTREACHED*/
1619         vassert(0);
1620      }
1621
1622      /* SIB, with 8-bit displacement.  Special cases:
1623         -- %esp cannot act as an index value.
1624            If index_r indicates %esp, zero is used for the index.
1625         Denoted value is:
1626            | %index == %ESP
1627            = d8 + %base
1628            | %index != %ESP
1629            = d8 + %base + (%index << scale)
1630      */
1631      case 0x0C: {
1632         UChar sib     = getIByte(delta);
1633         UChar scale   = toUChar((sib >> 6) & 3);
1634         UChar index_r = toUChar((sib >> 3) & 7);
1635         UChar base_r  = toUChar(sib & 7);
1636         UInt  d       = getSDisp8(delta+1);
1637
1638         if (index_r == R_ESP) {
1639            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1640                                   (Int)d, nameIReg(4,base_r));
1641            *len = 3;
1642            return disAMode_copy2tmp(
1643                   handleSegOverride(sorb,
1644                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1645         } else {
1646            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1647                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1648            *len = 3;
1649            return
1650                disAMode_copy2tmp(
1651                handleSegOverride(sorb,
1652                  binop(Iop_Add32,
1653                        binop(Iop_Add32,
1654                              getIReg(4,base_r),
1655                              binop(Iop_Shl32,
1656                                    getIReg(4,index_r), mkU8(scale))),
1657                        mkU32(d))));
1658         }
1659	 /*NOTREACHED*/
1660         vassert(0);
1661      }
1662
1663      /* SIB, with 32-bit displacement.  Special cases:
1664         -- %esp cannot act as an index value.
1665            If index_r indicates %esp, zero is used for the index.
1666         Denoted value is:
1667            | %index == %ESP
1668            = d32 + %base
1669            | %index != %ESP
1670            = d32 + %base + (%index << scale)
1671      */
1672      case 0x14: {
1673         UChar sib     = getIByte(delta);
1674         UChar scale   = toUChar((sib >> 6) & 3);
1675         UChar index_r = toUChar((sib >> 3) & 7);
1676         UChar base_r  = toUChar(sib & 7);
1677         UInt d        = getUDisp32(delta+1);
1678
1679         if (index_r == R_ESP) {
1680            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1681                                   (Int)d, nameIReg(4,base_r));
1682            *len = 6;
1683            return disAMode_copy2tmp(
1684                   handleSegOverride(sorb,
1685                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1686         } else {
1687            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1688                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1689            *len = 6;
1690            return
1691                disAMode_copy2tmp(
1692                handleSegOverride(sorb,
1693                  binop(Iop_Add32,
1694                        binop(Iop_Add32,
1695                              getIReg(4,base_r),
1696                              binop(Iop_Shl32,
1697                                    getIReg(4,index_r), mkU8(scale))),
1698                        mkU32(d))));
1699         }
1700	 /*NOTREACHED*/
1701         vassert(0);
1702      }
1703
1704      default:
1705         vpanic("disAMode(x86)");
1706         return 0; /*notreached*/
1707   }
1708}
1709
1710
1711/* Figure out the number of (insn-stream) bytes constituting the amode
1712   beginning at delta.  Is useful for getting hold of literals beyond
1713   the end of the amode before it has been disassembled.  */
1714
1715static UInt lengthAMode ( Int delta )
1716{
1717   UChar mod_reg_rm = getIByte(delta); delta++;
1718
1719   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1720      jump table seems a bit excessive.
1721   */
1722   mod_reg_rm &= 0xC7;               /* is now XX000YYY */
1723   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1724                                     /* is now XX0XXYYY */
1725   mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
1726   switch (mod_reg_rm) {
1727
1728      /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
1729      case 0x00: case 0x01: case 0x02: case 0x03:
1730      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1731         return 1;
1732
1733      /* d8(%eax) ... d8(%edi), not including d8(%esp). */
1734      case 0x08: case 0x09: case 0x0A: case 0x0B:
1735      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1736         return 2;
1737
1738      /* d32(%eax) ... d32(%edi), not including d32(%esp). */
1739      case 0x10: case 0x11: case 0x12: case 0x13:
1740      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1741         return 5;
1742
1743      /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
1744      case 0x18: case 0x19: case 0x1A: case 0x1B:
1745      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1746         return 1;
1747
1748      /* a 32-bit literal address. */
1749      case 0x05: return 5;
1750
1751      /* SIB, no displacement.  */
1752      case 0x04: {
1753         UChar sib    = getIByte(delta);
1754         UChar base_r = toUChar(sib & 7);
1755         if (base_r == R_EBP) return 6; else return 2;
1756      }
1757      /* SIB, with 8-bit displacement.  */
1758      case 0x0C: return 3;
1759
1760      /* SIB, with 32-bit displacement.  */
1761      case 0x14: return 6;
1762
1763      default:
1764         vpanic("lengthAMode");
1765         return 0; /*notreached*/
1766   }
1767}
1768
1769/*------------------------------------------------------------*/
1770/*--- Disassembling common idioms                          ---*/
1771/*------------------------------------------------------------*/
1772
1773/* Handle binary integer instructions of the form
1774      op E, G  meaning
1775      op reg-or-mem, reg
1776   Is passed the a ptr to the modRM byte, the actual operation, and the
1777   data size.  Returns the address advanced completely over this
1778   instruction.
1779
1780   E(src) is reg-or-mem
1781   G(dst) is reg.
1782
1783   If E is reg, -->    GET %G,  tmp
1784                       OP %E,   tmp
1785                       PUT tmp, %G
1786
1787   If E is mem and OP is not reversible,
1788                -->    (getAddr E) -> tmpa
1789                       LD (tmpa), tmpa
1790                       GET %G, tmp2
1791                       OP tmpa, tmp2
1792                       PUT tmp2, %G
1793
1794   If E is mem and OP is reversible
1795                -->    (getAddr E) -> tmpa
1796                       LD (tmpa), tmpa
1797                       OP %G, tmpa
1798                       PUT tmpa, %G
1799*/
1800static
1801UInt dis_op2_E_G ( UChar       sorb,
1802                   Bool        addSubCarry,
1803                   IROp        op8,
1804                   Bool        keep,
1805                   Int         size,
1806                   Int         delta0,
1807                   HChar*      t_x86opc )
1808{
1809   HChar   dis_buf[50];
1810   Int     len;
1811   IRType  ty   = szToITy(size);
1812   IRTemp  dst1 = newTemp(ty);
1813   IRTemp  src  = newTemp(ty);
1814   IRTemp  dst0 = newTemp(ty);
1815   UChar   rm   = getUChar(delta0);
1816   IRTemp  addr = IRTemp_INVALID;
1817
1818   /* addSubCarry == True indicates the intended operation is
1819      add-with-carry or subtract-with-borrow. */
1820   if (addSubCarry) {
1821      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1822      vassert(keep);
1823   }
1824
1825   if (epartIsReg(rm)) {
1826      /* Specially handle XOR reg,reg, because that doesn't really
1827         depend on reg, and doing the obvious thing potentially
1828         generates a spurious value check failure due to the bogus
1829         dependency.  Ditto SBB reg,reg. */
1830      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1831          && gregOfRM(rm) == eregOfRM(rm)) {
1832         putIReg(size, gregOfRM(rm), mkU(ty,0));
1833      }
1834      assign( dst0, getIReg(size,gregOfRM(rm)) );
1835      assign( src,  getIReg(size,eregOfRM(rm)) );
1836
1837      if (addSubCarry && op8 == Iop_Add8) {
1838         helper_ADC( size, dst1, dst0, src,
1839                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1840         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1841      } else
1842      if (addSubCarry && op8 == Iop_Sub8) {
1843         helper_SBB( size, dst1, dst0, src,
1844                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1845         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1846      } else {
1847         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1848         if (isAddSub(op8))
1849            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1850         else
1851            setFlags_DEP1(op8, dst1, ty);
1852         if (keep)
1853            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1854      }
1855
1856      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1857                          nameIReg(size,eregOfRM(rm)),
1858                          nameIReg(size,gregOfRM(rm)));
1859      return 1+delta0;
1860   } else {
1861      /* E refers to memory */
1862      addr = disAMode ( &len, sorb, delta0, dis_buf);
1863      assign( dst0, getIReg(size,gregOfRM(rm)) );
1864      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
1865
1866      if (addSubCarry && op8 == Iop_Add8) {
1867         helper_ADC( size, dst1, dst0, src,
1868                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1869         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1870      } else
1871      if (addSubCarry && op8 == Iop_Sub8) {
1872         helper_SBB( size, dst1, dst0, src,
1873                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1874         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1875      } else {
1876         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1877         if (isAddSub(op8))
1878            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1879         else
1880            setFlags_DEP1(op8, dst1, ty);
1881         if (keep)
1882            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1883      }
1884
1885      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1886                          dis_buf,nameIReg(size,gregOfRM(rm)));
1887      return len+delta0;
1888   }
1889}
1890
1891
1892
1893/* Handle binary integer instructions of the form
1894      op G, E  meaning
1895      op reg, reg-or-mem
1896   Is passed the a ptr to the modRM byte, the actual operation, and the
1897   data size.  Returns the address advanced completely over this
1898   instruction.
1899
1900   G(src) is reg.
1901   E(dst) is reg-or-mem
1902
1903   If E is reg, -->    GET %E,  tmp
1904                       OP %G,   tmp
1905                       PUT tmp, %E
1906
1907   If E is mem, -->    (getAddr E) -> tmpa
1908                       LD (tmpa), tmpv
1909                       OP %G, tmpv
1910                       ST tmpv, (tmpa)
1911*/
1912static
1913UInt dis_op2_G_E ( UChar       sorb,
1914                   Bool        locked,
1915                   Bool        addSubCarry,
1916                   IROp        op8,
1917                   Bool        keep,
1918                   Int         size,
1919                   Int         delta0,
1920                   HChar*      t_x86opc )
1921{
1922   HChar   dis_buf[50];
1923   Int     len;
1924   IRType  ty   = szToITy(size);
1925   IRTemp  dst1 = newTemp(ty);
1926   IRTemp  src  = newTemp(ty);
1927   IRTemp  dst0 = newTemp(ty);
1928   UChar   rm   = getIByte(delta0);
1929   IRTemp  addr = IRTemp_INVALID;
1930
1931   /* addSubCarry == True indicates the intended operation is
1932      add-with-carry or subtract-with-borrow. */
1933   if (addSubCarry) {
1934      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1935      vassert(keep);
1936   }
1937
1938   if (epartIsReg(rm)) {
1939      /* Specially handle XOR reg,reg, because that doesn't really
1940         depend on reg, and doing the obvious thing potentially
1941         generates a spurious value check failure due to the bogus
1942         dependency.  Ditto SBB reg,reg.*/
1943      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1944          && gregOfRM(rm) == eregOfRM(rm)) {
1945         putIReg(size, eregOfRM(rm), mkU(ty,0));
1946      }
1947      assign(dst0, getIReg(size,eregOfRM(rm)));
1948      assign(src,  getIReg(size,gregOfRM(rm)));
1949
1950      if (addSubCarry && op8 == Iop_Add8) {
1951         helper_ADC( size, dst1, dst0, src,
1952                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1953         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1954      } else
1955      if (addSubCarry && op8 == Iop_Sub8) {
1956         helper_SBB( size, dst1, dst0, src,
1957                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1958         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1959      } else {
1960         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
1961         if (isAddSub(op8))
1962            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1963         else
1964            setFlags_DEP1(op8, dst1, ty);
1965         if (keep)
1966            putIReg(size, eregOfRM(rm), mkexpr(dst1));
1967      }
1968
1969      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1970                          nameIReg(size,gregOfRM(rm)),
1971                          nameIReg(size,eregOfRM(rm)));
1972      return 1+delta0;
1973   }
1974
1975   /* E refers to memory */
1976   {
1977      addr = disAMode ( &len, sorb, delta0, dis_buf);
1978      assign(dst0, loadLE(ty,mkexpr(addr)));
1979      assign(src,  getIReg(size,gregOfRM(rm)));
1980
1981      if (addSubCarry && op8 == Iop_Add8) {
1982         if (locked) {
1983            /* cas-style store */
1984            helper_ADC( size, dst1, dst0, src,
1985                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
1986         } else {
1987            /* normal store */
1988            helper_ADC( size, dst1, dst0, src,
1989                        /*store*/addr, IRTemp_INVALID, 0 );
1990         }
1991      } else
1992      if (addSubCarry && op8 == Iop_Sub8) {
1993         if (locked) {
1994            /* cas-style store */
1995            helper_SBB( size, dst1, dst0, src,
1996                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
1997         } else {
1998            /* normal store */
1999            helper_SBB( size, dst1, dst0, src,
2000                        /*store*/addr, IRTemp_INVALID, 0 );
2001         }
2002      } else {
2003         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2004         if (keep) {
2005            if (locked) {
2006               if (0) vex_printf("locked case\n" );
2007               casLE( mkexpr(addr),
2008                      mkexpr(dst0)/*expval*/,
2009                      mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
2010            } else {
2011               if (0) vex_printf("nonlocked case\n");
2012               storeLE(mkexpr(addr), mkexpr(dst1));
2013            }
2014         }
2015         if (isAddSub(op8))
2016            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2017         else
2018            setFlags_DEP1(op8, dst1, ty);
2019      }
2020
2021      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2022                          nameIReg(size,gregOfRM(rm)), dis_buf);
2023      return len+delta0;
2024   }
2025}
2026
2027
2028/* Handle move instructions of the form
2029      mov E, G  meaning
2030      mov reg-or-mem, reg
2031   Is passed the a ptr to the modRM byte, and the data size.  Returns
2032   the address advanced completely over this instruction.
2033
2034   E(src) is reg-or-mem
2035   G(dst) is reg.
2036
2037   If E is reg, -->    GET %E,  tmpv
2038                       PUT tmpv, %G
2039
2040   If E is mem  -->    (getAddr E) -> tmpa
2041                       LD (tmpa), tmpb
2042                       PUT tmpb, %G
2043*/
2044static
2045UInt dis_mov_E_G ( UChar       sorb,
2046                   Int         size,
2047                   Int         delta0 )
2048{
2049   Int len;
2050   UChar rm = getIByte(delta0);
2051   HChar dis_buf[50];
2052
2053   if (epartIsReg(rm)) {
2054      putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
2055      DIP("mov%c %s,%s\n", nameISize(size),
2056                           nameIReg(size,eregOfRM(rm)),
2057                           nameIReg(size,gregOfRM(rm)));
2058      return 1+delta0;
2059   }
2060
2061   /* E refers to memory */
2062   {
2063      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
2064      putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
2065      DIP("mov%c %s,%s\n", nameISize(size),
2066                           dis_buf,nameIReg(size,gregOfRM(rm)));
2067      return delta0+len;
2068   }
2069}
2070
2071
2072/* Handle move instructions of the form
2073      mov G, E  meaning
2074      mov reg, reg-or-mem
2075   Is passed the a ptr to the modRM byte, and the data size.  Returns
2076   the address advanced completely over this instruction.
2077
2078   G(src) is reg.
2079   E(dst) is reg-or-mem
2080
2081   If E is reg, -->    GET %G,  tmp
2082                       PUT tmp, %E
2083
2084   If E is mem, -->    (getAddr E) -> tmpa
2085                       GET %G, tmpv
2086                       ST tmpv, (tmpa)
2087*/
2088static
2089UInt dis_mov_G_E ( UChar       sorb,
2090                   Int         size,
2091                   Int         delta0 )
2092{
2093   Int len;
2094   UChar rm = getIByte(delta0);
2095   HChar dis_buf[50];
2096
2097   if (epartIsReg(rm)) {
2098      putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
2099      DIP("mov%c %s,%s\n", nameISize(size),
2100                           nameIReg(size,gregOfRM(rm)),
2101                           nameIReg(size,eregOfRM(rm)));
2102      return 1+delta0;
2103   }
2104
2105   /* E refers to memory */
2106   {
2107      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
2108      storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
2109      DIP("mov%c %s,%s\n", nameISize(size),
2110                           nameIReg(size,gregOfRM(rm)), dis_buf);
2111      return len+delta0;
2112   }
2113}
2114
2115
2116/* op $immediate, AL/AX/EAX. */
2117static
2118UInt dis_op_imm_A ( Int    size,
2119                    Bool   carrying,
2120                    IROp   op8,
2121                    Bool   keep,
2122                    Int    delta,
2123                    HChar* t_x86opc )
2124{
2125   IRType ty   = szToITy(size);
2126   IRTemp dst0 = newTemp(ty);
2127   IRTemp src  = newTemp(ty);
2128   IRTemp dst1 = newTemp(ty);
2129   UInt lit    = getUDisp(size,delta);
2130   assign(dst0, getIReg(size,R_EAX));
2131   assign(src,  mkU(ty,lit));
2132
2133   if (isAddSub(op8) && !carrying) {
2134      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2135      setFlags_DEP1_DEP2(op8, dst0, src, ty);
2136   }
2137   else
2138   if (isLogic(op8)) {
2139      vassert(!carrying);
2140      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2141      setFlags_DEP1(op8, dst1, ty);
2142   }
2143   else
2144   if (op8 == Iop_Add8 && carrying) {
2145      helper_ADC( size, dst1, dst0, src,
2146                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2147   }
2148   else
2149   if (op8 == Iop_Sub8 && carrying) {
2150      helper_SBB( size, dst1, dst0, src,
2151                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2152   }
2153   else
2154      vpanic("dis_op_imm_A(x86,guest)");
2155
2156   if (keep)
2157      putIReg(size, R_EAX, mkexpr(dst1));
2158
2159   DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
2160                           lit, nameIReg(size,R_EAX));
2161   return delta+size;
2162}
2163
2164
2165/* Sign- and Zero-extending moves. */
2166static
2167UInt dis_movx_E_G ( UChar      sorb,
2168                    Int delta, Int szs, Int szd, Bool sign_extend )
2169{
2170   UChar rm = getIByte(delta);
2171   if (epartIsReg(rm)) {
2172      if (szd == szs) {
2173         // mutant case.  See #250799
2174         putIReg(szd, gregOfRM(rm),
2175                           getIReg(szs,eregOfRM(rm)));
2176      } else {
2177         // normal case
2178         putIReg(szd, gregOfRM(rm),
2179                      unop(mkWidenOp(szs,szd,sign_extend),
2180                           getIReg(szs,eregOfRM(rm))));
2181      }
2182      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2183                               nameISize(szs), nameISize(szd),
2184                               nameIReg(szs,eregOfRM(rm)),
2185                               nameIReg(szd,gregOfRM(rm)));
2186      return 1+delta;
2187   }
2188
2189   /* E refers to memory */
2190   {
2191      Int    len;
2192      HChar  dis_buf[50];
2193      IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
2194      if (szd == szs) {
2195         // mutant case.  See #250799
2196         putIReg(szd, gregOfRM(rm),
2197                           loadLE(szToITy(szs),mkexpr(addr)));
2198      } else {
2199         // normal case
2200         putIReg(szd, gregOfRM(rm),
2201                      unop(mkWidenOp(szs,szd,sign_extend),
2202                           loadLE(szToITy(szs),mkexpr(addr))));
2203      }
2204      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2205                               nameISize(szs), nameISize(szd),
2206                               dis_buf, nameIReg(szd,gregOfRM(rm)));
2207      return len+delta;
2208   }
2209}
2210
2211
2212/* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
2213   16 / 8 bit quantity in the given IRTemp.  */
2214static
2215void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
2216{
2217   IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
2218   IRTemp src64 = newTemp(Ity_I64);
2219   IRTemp dst64 = newTemp(Ity_I64);
2220   switch (sz) {
2221      case 4:
2222         assign( src64, binop(Iop_32HLto64,
2223                              getIReg(4,R_EDX), getIReg(4,R_EAX)) );
2224         assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
2225         putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
2226         putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
2227         break;
2228      case 2: {
2229         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2230         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2231         assign( src64, unop(widen3264,
2232                             binop(Iop_16HLto32,
2233                                   getIReg(2,R_EDX), getIReg(2,R_EAX))) );
2234         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
2235         putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
2236         putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
2237         break;
2238      }
2239      case 1: {
2240         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2241         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2242         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
2243         assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
2244         assign( dst64,
2245                 binop(op, mkexpr(src64),
2246                           unop(widen1632, unop(widen816, mkexpr(t)))) );
2247         putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
2248                           unop(Iop_64to32,mkexpr(dst64)))) );
2249         putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
2250                           unop(Iop_64HIto32,mkexpr(dst64)))) );
2251         break;
2252      }
2253      default: vpanic("codegen_div(x86)");
2254   }
2255}
2256
2257
2258static
2259UInt dis_Grp1 ( UChar sorb, Bool locked,
2260                Int delta, UChar modrm,
2261                Int am_sz, Int d_sz, Int sz, UInt d32 )
2262{
2263   Int     len;
2264   HChar   dis_buf[50];
2265   IRType  ty   = szToITy(sz);
2266   IRTemp  dst1 = newTemp(ty);
2267   IRTemp  src  = newTemp(ty);
2268   IRTemp  dst0 = newTemp(ty);
2269   IRTemp  addr = IRTemp_INVALID;
2270   IROp    op8  = Iop_INVALID;
2271   UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
2272
2273   switch (gregOfRM(modrm)) {
2274      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
2275      case 2: break;  // ADC
2276      case 3: break;  // SBB
2277      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
2278      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
2279      /*NOTREACHED*/
2280      default: vpanic("dis_Grp1: unhandled case");
2281   }
2282
2283   if (epartIsReg(modrm)) {
2284      vassert(am_sz == 1);
2285
2286      assign(dst0, getIReg(sz,eregOfRM(modrm)));
2287      assign(src,  mkU(ty,d32 & mask));
2288
2289      if (gregOfRM(modrm) == 2 /* ADC */) {
2290         helper_ADC( sz, dst1, dst0, src,
2291                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2292      } else
2293      if (gregOfRM(modrm) == 3 /* SBB */) {
2294         helper_SBB( sz, dst1, dst0, src,
2295                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2296      } else {
2297         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2298         if (isAddSub(op8))
2299            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2300         else
2301            setFlags_DEP1(op8, dst1, ty);
2302      }
2303
2304      if (gregOfRM(modrm) < 7)
2305         putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2306
2307      delta += (am_sz + d_sz);
2308      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
2309                              nameIReg(sz,eregOfRM(modrm)));
2310   } else {
2311      addr = disAMode ( &len, sorb, delta, dis_buf);
2312
2313      assign(dst0, loadLE(ty,mkexpr(addr)));
2314      assign(src, mkU(ty,d32 & mask));
2315
2316      if (gregOfRM(modrm) == 2 /* ADC */) {
2317         if (locked) {
2318            /* cas-style store */
2319            helper_ADC( sz, dst1, dst0, src,
2320                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2321         } else {
2322            /* normal store */
2323            helper_ADC( sz, dst1, dst0, src,
2324                        /*store*/addr, IRTemp_INVALID, 0 );
2325         }
2326      } else
2327      if (gregOfRM(modrm) == 3 /* SBB */) {
2328         if (locked) {
2329            /* cas-style store */
2330            helper_SBB( sz, dst1, dst0, src,
2331                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2332         } else {
2333            /* normal store */
2334            helper_SBB( sz, dst1, dst0, src,
2335                        /*store*/addr, IRTemp_INVALID, 0 );
2336         }
2337      } else {
2338         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2339         if (gregOfRM(modrm) < 7) {
2340            if (locked) {
2341               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
2342                                    mkexpr(dst1)/*newVal*/,
2343                                    guest_EIP_curr_instr );
2344            } else {
2345               storeLE(mkexpr(addr), mkexpr(dst1));
2346            }
2347         }
2348         if (isAddSub(op8))
2349            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2350         else
2351            setFlags_DEP1(op8, dst1, ty);
2352      }
2353
2354      delta += (len+d_sz);
2355      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
2356                              d32, dis_buf);
2357   }
2358   return delta;
2359}
2360
2361
2362/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
2363   expression. */
2364
2365static
2366UInt dis_Grp2 ( UChar sorb,
2367                Int delta, UChar modrm,
2368                Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
2369                HChar* shift_expr_txt, Bool* decode_OK )
2370{
2371   /* delta on entry points at the modrm byte. */
2372   HChar  dis_buf[50];
2373   Int    len;
2374   Bool   isShift, isRotate, isRotateC;
2375   IRType ty    = szToITy(sz);
2376   IRTemp dst0  = newTemp(ty);
2377   IRTemp dst1  = newTemp(ty);
2378   IRTemp addr  = IRTemp_INVALID;
2379
2380   *decode_OK = True;
2381
2382   vassert(sz == 1 || sz == 2 || sz == 4);
2383
2384   /* Put value to shift/rotate in dst0. */
2385   if (epartIsReg(modrm)) {
2386      assign(dst0, getIReg(sz, eregOfRM(modrm)));
2387      delta += (am_sz + d_sz);
2388   } else {
2389      addr = disAMode ( &len, sorb, delta, dis_buf);
2390      assign(dst0, loadLE(ty,mkexpr(addr)));
2391      delta += len + d_sz;
2392   }
2393
2394   isShift = False;
2395   switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
2396
2397   isRotate = False;
2398   switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
2399
2400   isRotateC = False;
2401   switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
2402
2403   if (!isShift && !isRotate && !isRotateC) {
2404      /*NOTREACHED*/
2405      vpanic("dis_Grp2(Reg): unhandled case(x86)");
2406   }
2407
2408   if (isRotateC) {
2409      /* call a helper; these insns are so ridiculous they do not
2410         deserve better */
2411      Bool     left = toBool(gregOfRM(modrm) == 2);
2412      IRTemp   r64  = newTemp(Ity_I64);
2413      IRExpr** args
2414         = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
2415                          widenUto32(shift_expr),   /* rotate amount */
2416                          widenUto32(mk_x86g_calculate_eflags_all()),
2417                          mkU32(sz) );
2418      assign( r64, mkIRExprCCall(
2419                      Ity_I64,
2420                      0/*regparm*/,
2421                      left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
2422                      left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
2423                      args
2424                   )
2425            );
2426      /* new eflags in hi half r64; new value in lo half r64 */
2427      assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
2428      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2429      stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
2430      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2431      /* Set NDEP even though it isn't used.  This makes redundant-PUT
2432         elimination of previous stores to this field work better. */
2433      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2434   }
2435
2436   if (isShift) {
2437
2438      IRTemp pre32     = newTemp(Ity_I32);
2439      IRTemp res32     = newTemp(Ity_I32);
2440      IRTemp res32ss   = newTemp(Ity_I32);
2441      IRTemp shift_amt = newTemp(Ity_I8);
2442      IROp   op32;
2443
2444      switch (gregOfRM(modrm)) {
2445         case 4: op32 = Iop_Shl32; break;
2446         case 5: op32 = Iop_Shr32; break;
2447         case 6: op32 = Iop_Shl32; break;
2448         case 7: op32 = Iop_Sar32; break;
2449         /*NOTREACHED*/
2450         default: vpanic("dis_Grp2:shift"); break;
2451      }
2452
2453      /* Widen the value to be shifted to 32 bits, do the shift, and
2454         narrow back down.  This seems surprisingly long-winded, but
2455         unfortunately the Intel semantics requires that 8/16-bit
2456         shifts give defined results for shift values all the way up
2457         to 31, and this seems the simplest way to do it.  It has the
2458         advantage that the only IR level shifts generated are of 32
2459         bit values, and the shift amount is guaranteed to be in the
2460         range 0 .. 31, thereby observing the IR semantics requiring
2461         all shift values to be in the range 0 .. 2^word_size-1. */
2462
2463      /* shift_amt = shift_expr & 31, regardless of operation size */
2464      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
2465
2466      /* suitably widen the value to be shifted to 32 bits. */
2467      assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
2468                                     : widenUto32(mkexpr(dst0)) );
2469
2470      /* res32 = pre32 `shift` shift_amt */
2471      assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
2472
2473      /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
2474      assign( res32ss,
2475              binop(op32,
2476                    mkexpr(pre32),
2477                    binop(Iop_And8,
2478                          binop(Iop_Sub8,
2479                                mkexpr(shift_amt), mkU8(1)),
2480                          mkU8(31))) );
2481
2482      /* Build the flags thunk. */
2483      setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
2484
2485      /* Narrow the result back down. */
2486      assign( dst1, narrowTo(ty, mkexpr(res32)) );
2487
2488   } /* if (isShift) */
2489
2490   else
2491   if (isRotate) {
2492      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
2493      Bool   left      = toBool(gregOfRM(modrm) == 0);
2494      IRTemp rot_amt   = newTemp(Ity_I8);
2495      IRTemp rot_amt32 = newTemp(Ity_I8);
2496      IRTemp oldFlags  = newTemp(Ity_I32);
2497
2498      /* rot_amt = shift_expr & mask */
2499      /* By masking the rotate amount thusly, the IR-level Shl/Shr
2500         expressions never shift beyond the word size and thus remain
2501         well defined. */
2502      assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
2503
2504      if (ty == Ity_I32)
2505         assign(rot_amt, mkexpr(rot_amt32));
2506      else
2507         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
2508
2509      if (left) {
2510
2511         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
2512         assign(dst1,
2513            binop( mkSizedOp(ty,Iop_Or8),
2514                   binop( mkSizedOp(ty,Iop_Shl8),
2515                          mkexpr(dst0),
2516                          mkexpr(rot_amt)
2517                   ),
2518                   binop( mkSizedOp(ty,Iop_Shr8),
2519                          mkexpr(dst0),
2520                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2521                   )
2522            )
2523         );
2524         ccOp += X86G_CC_OP_ROLB;
2525
2526      } else { /* right */
2527
2528         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
2529         assign(dst1,
2530            binop( mkSizedOp(ty,Iop_Or8),
2531                   binop( mkSizedOp(ty,Iop_Shr8),
2532                          mkexpr(dst0),
2533                          mkexpr(rot_amt)
2534                   ),
2535                   binop( mkSizedOp(ty,Iop_Shl8),
2536                          mkexpr(dst0),
2537                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2538                   )
2539            )
2540         );
2541         ccOp += X86G_CC_OP_RORB;
2542
2543      }
2544
2545      /* dst1 now holds the rotated value.  Build flag thunk.  We
2546         need the resulting value for this, and the previous flags.
2547         Except don't set it if the rotate count is zero. */
2548
2549      assign(oldFlags, mk_x86g_calculate_eflags_all());
2550
2551      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
2552      stmt( IRStmt_Put( OFFB_CC_OP,
2553                        IRExpr_Mux0X( mkexpr(rot_amt32),
2554                                      IRExpr_Get(OFFB_CC_OP,Ity_I32),
2555                                      mkU32(ccOp))) );
2556      stmt( IRStmt_Put( OFFB_CC_DEP1,
2557                        IRExpr_Mux0X( mkexpr(rot_amt32),
2558                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
2559                                      widenUto32(mkexpr(dst1)))) );
2560      stmt( IRStmt_Put( OFFB_CC_DEP2,
2561                        IRExpr_Mux0X( mkexpr(rot_amt32),
2562                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
2563                                      mkU32(0))) );
2564      stmt( IRStmt_Put( OFFB_CC_NDEP,
2565                        IRExpr_Mux0X( mkexpr(rot_amt32),
2566                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
2567                                      mkexpr(oldFlags))) );
2568   } /* if (isRotate) */
2569
2570   /* Save result, and finish up. */
2571   if (epartIsReg(modrm)) {
2572      putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2573      if (vex_traceflags & VEX_TRACE_FE) {
2574         vex_printf("%s%c ",
2575                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2576         if (shift_expr_txt)
2577            vex_printf("%s", shift_expr_txt);
2578         else
2579            ppIRExpr(shift_expr);
2580         vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
2581      }
2582   } else {
2583      storeLE(mkexpr(addr), mkexpr(dst1));
2584      if (vex_traceflags & VEX_TRACE_FE) {
2585         vex_printf("%s%c ",
2586                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2587         if (shift_expr_txt)
2588            vex_printf("%s", shift_expr_txt);
2589         else
2590            ppIRExpr(shift_expr);
2591         vex_printf(", %s\n", dis_buf);
2592      }
2593   }
2594   return delta;
2595}
2596
2597
2598/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
2599static
2600UInt dis_Grp8_Imm ( UChar sorb,
2601                    Bool locked,
2602                    Int delta, UChar modrm,
2603                    Int am_sz, Int sz, UInt src_val,
2604                    Bool* decode_OK )
2605{
2606   /* src_val denotes a d8.
2607      And delta on entry points at the modrm byte. */
2608
2609   IRType ty     = szToITy(sz);
2610   IRTemp t2     = newTemp(Ity_I32);
2611   IRTemp t2m    = newTemp(Ity_I32);
2612   IRTemp t_addr = IRTemp_INVALID;
2613   HChar  dis_buf[50];
2614   UInt   mask;
2615
2616   /* we're optimists :-) */
2617   *decode_OK = True;
2618
2619   /* Limit src_val -- the bit offset -- to something within a word.
2620      The Intel docs say that literal offsets larger than a word are
2621      masked in this way. */
2622   switch (sz) {
2623      case 2:  src_val &= 15; break;
2624      case 4:  src_val &= 31; break;
2625      default: *decode_OK = False; return delta;
2626   }
2627
2628   /* Invent a mask suitable for the operation. */
2629   switch (gregOfRM(modrm)) {
2630      case 4: /* BT */  mask = 0;               break;
2631      case 5: /* BTS */ mask = 1 << src_val;    break;
2632      case 6: /* BTR */ mask = ~(1 << src_val); break;
2633      case 7: /* BTC */ mask = 1 << src_val;    break;
2634         /* If this needs to be extended, probably simplest to make a
2635            new function to handle the other cases (0 .. 3).  The
2636            Intel docs do however not indicate any use for 0 .. 3, so
2637            we don't expect this to happen. */
2638      default: *decode_OK = False; return delta;
2639   }
2640
2641   /* Fetch the value to be tested and modified into t2, which is
2642      32-bits wide regardless of sz. */
2643   if (epartIsReg(modrm)) {
2644      vassert(am_sz == 1);
2645      assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
2646      delta += (am_sz + 1);
2647      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2648                              src_val, nameIReg(sz,eregOfRM(modrm)));
2649   } else {
2650      Int len;
2651      t_addr = disAMode ( &len, sorb, delta, dis_buf);
2652      delta  += (len+1);
2653      assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
2654      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2655                              src_val, dis_buf);
2656   }
2657
2658   /* Compute the new value into t2m, if non-BT. */
2659   switch (gregOfRM(modrm)) {
2660      case 4: /* BT */
2661         break;
2662      case 5: /* BTS */
2663         assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
2664         break;
2665      case 6: /* BTR */
2666         assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
2667         break;
2668      case 7: /* BTC */
2669         assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
2670         break;
2671      default:
2672         /*NOTREACHED*/ /*the previous switch guards this*/
2673         vassert(0);
2674   }
2675
2676   /* Write the result back, if non-BT.  If the CAS fails then we
2677      side-exit from the trace at this point, and so the flag state is
2678      not affected.  This is of course as required. */
2679   if (gregOfRM(modrm) != 4 /* BT */) {
2680      if (epartIsReg(modrm)) {
2681         putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
2682      } else {
2683         if (locked) {
2684            casLE( mkexpr(t_addr),
2685                   narrowTo(ty, mkexpr(t2))/*expd*/,
2686                   narrowTo(ty, mkexpr(t2m))/*new*/,
2687                   guest_EIP_curr_instr );
2688         } else {
2689            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
2690         }
2691      }
2692   }
2693
2694   /* Copy relevant bit from t2 into the carry flag. */
2695   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
2696   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2697   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2698   stmt( IRStmt_Put(
2699            OFFB_CC_DEP1,
2700            binop(Iop_And32,
2701                  binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
2702                  mkU32(1))
2703       ));
2704   /* Set NDEP even though it isn't used.  This makes redundant-PUT
2705      elimination of previous stores to this field work better. */
2706   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2707
2708   return delta;
2709}
2710
2711
2712/* Signed/unsigned widening multiply.  Generate IR to multiply the
2713   value in EAX/AX/AL by the given IRTemp, and park the result in
2714   EDX:EAX/DX:AX/AX.
2715*/
2716static void codegen_mulL_A_D ( Int sz, Bool syned,
2717                               IRTemp tmp, HChar* tmp_txt )
2718{
2719   IRType ty = szToITy(sz);
2720   IRTemp t1 = newTemp(ty);
2721
2722   assign( t1, getIReg(sz, R_EAX) );
2723
2724   switch (ty) {
2725      case Ity_I32: {
2726         IRTemp res64   = newTemp(Ity_I64);
2727         IRTemp resHi   = newTemp(Ity_I32);
2728         IRTemp resLo   = newTemp(Ity_I32);
2729         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
2730         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2731         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
2732         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2733         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
2734         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
2735         putIReg(4, R_EDX, mkexpr(resHi));
2736         putIReg(4, R_EAX, mkexpr(resLo));
2737         break;
2738      }
2739      case Ity_I16: {
2740         IRTemp res32   = newTemp(Ity_I32);
2741         IRTemp resHi   = newTemp(Ity_I16);
2742         IRTemp resLo   = newTemp(Ity_I16);
2743         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
2744         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2745         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
2746         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2747         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
2748         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
2749         putIReg(2, R_EDX, mkexpr(resHi));
2750         putIReg(2, R_EAX, mkexpr(resLo));
2751         break;
2752      }
2753      case Ity_I8: {
2754         IRTemp res16   = newTemp(Ity_I16);
2755         IRTemp resHi   = newTemp(Ity_I8);
2756         IRTemp resLo   = newTemp(Ity_I8);
2757         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
2758         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2759         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
2760         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2761         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
2762         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
2763         putIReg(2, R_EAX, mkexpr(res16));
2764         break;
2765      }
2766      default:
2767         vpanic("codegen_mulL_A_D(x86)");
2768   }
2769   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
2770}
2771
2772
2773/* Group 3 extended opcodes. */
2774static
2775UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
2776{
2777   UInt    d32;
2778   UChar   modrm;
2779   HChar   dis_buf[50];
2780   Int     len;
2781   IRTemp  addr;
2782   IRType  ty = szToITy(sz);
2783   IRTemp  t1 = newTemp(ty);
2784   IRTemp dst1, src, dst0;
2785
2786   *decode_OK = True; /* may change this later */
2787
2788   modrm = getIByte(delta);
2789
2790   if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
2791      /* LOCK prefix only allowed with not and neg subopcodes */
2792      *decode_OK = False;
2793      return delta;
2794   }
2795
2796   if (epartIsReg(modrm)) {
2797      switch (gregOfRM(modrm)) {
2798         case 0: { /* TEST */
2799            delta++; d32 = getUDisp(sz, delta); delta += sz;
2800            dst1 = newTemp(ty);
2801            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2802                               getIReg(sz,eregOfRM(modrm)),
2803                               mkU(ty,d32)));
2804            setFlags_DEP1( Iop_And8, dst1, ty );
2805            DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
2806                                      nameIReg(sz, eregOfRM(modrm)));
2807            break;
2808         }
2809         case 1: /* UNDEFINED */
2810           /* The Intel docs imply this insn is undefined and binutils
2811              agrees.  Unfortunately Core 2 will run it (with who
2812              knows what result?)  sandpile.org reckons it's an alias
2813              for case 0.  We play safe. */
2814           *decode_OK = False;
2815           break;
2816         case 2: /* NOT */
2817            delta++;
2818            putIReg(sz, eregOfRM(modrm),
2819                        unop(mkSizedOp(ty,Iop_Not8),
2820                             getIReg(sz, eregOfRM(modrm))));
2821            DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2822            break;
2823         case 3: /* NEG */
2824            delta++;
2825            dst0 = newTemp(ty);
2826            src  = newTemp(ty);
2827            dst1 = newTemp(ty);
2828            assign(dst0, mkU(ty,0));
2829            assign(src,  getIReg(sz,eregOfRM(modrm)));
2830            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
2831            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2832            putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2833            DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2834            break;
2835         case 4: /* MUL (unsigned widening) */
2836            delta++;
2837            src = newTemp(ty);
2838            assign(src, getIReg(sz,eregOfRM(modrm)));
2839            codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
2840            break;
2841         case 5: /* IMUL (signed widening) */
2842            delta++;
2843            src = newTemp(ty);
2844            assign(src, getIReg(sz,eregOfRM(modrm)));
2845            codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
2846            break;
2847         case 6: /* DIV */
2848            delta++;
2849            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2850            codegen_div ( sz, t1, False );
2851            DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2852            break;
2853         case 7: /* IDIV */
2854            delta++;
2855            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2856            codegen_div ( sz, t1, True );
2857            DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2858            break;
2859         default:
2860            /* This can't happen - gregOfRM should return 0 .. 7 only */
2861            vpanic("Grp3(x86)");
2862      }
2863   } else {
2864      addr = disAMode ( &len, sorb, delta, dis_buf );
2865      t1   = newTemp(ty);
2866      delta += len;
2867      assign(t1, loadLE(ty,mkexpr(addr)));
2868      switch (gregOfRM(modrm)) {
2869         case 0: { /* TEST */
2870            d32 = getUDisp(sz, delta); delta += sz;
2871            dst1 = newTemp(ty);
2872            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2873                               mkexpr(t1), mkU(ty,d32)));
2874            setFlags_DEP1( Iop_And8, dst1, ty );
2875            DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
2876            break;
2877         }
2878         case 1: /* UNDEFINED */
2879           /* See comment above on R case */
2880           *decode_OK = False;
2881           break;
2882         case 2: /* NOT */
2883            dst1 = newTemp(ty);
2884            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
2885            if (locked) {
2886               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2887                                    guest_EIP_curr_instr );
2888            } else {
2889               storeLE( mkexpr(addr), mkexpr(dst1) );
2890            }
2891            DIP("not%c %s\n", nameISize(sz), dis_buf);
2892            break;
2893         case 3: /* NEG */
2894            dst0 = newTemp(ty);
2895            src  = newTemp(ty);
2896            dst1 = newTemp(ty);
2897            assign(dst0, mkU(ty,0));
2898            assign(src,  mkexpr(t1));
2899            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
2900                               mkexpr(dst0), mkexpr(src)));
2901            if (locked) {
2902               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2903                                    guest_EIP_curr_instr );
2904            } else {
2905               storeLE( mkexpr(addr), mkexpr(dst1) );
2906            }
2907            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2908            DIP("neg%c %s\n", nameISize(sz), dis_buf);
2909            break;
2910         case 4: /* MUL */
2911            codegen_mulL_A_D ( sz, False, t1, dis_buf );
2912            break;
2913         case 5: /* IMUL */
2914            codegen_mulL_A_D ( sz, True, t1, dis_buf );
2915            break;
2916         case 6: /* DIV */
2917            codegen_div ( sz, t1, False );
2918            DIP("div%c %s\n", nameISize(sz), dis_buf);
2919            break;
2920         case 7: /* IDIV */
2921            codegen_div ( sz, t1, True );
2922            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
2923            break;
2924         default:
2925            /* This can't happen - gregOfRM should return 0 .. 7 only */
2926            vpanic("Grp3(x86)");
2927      }
2928   }
2929   return delta;
2930}
2931
2932
2933/* Group 4 extended opcodes. */
2934static
2935UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
2936{
2937   Int   alen;
2938   UChar modrm;
2939   HChar dis_buf[50];
2940   IRType ty = Ity_I8;
2941   IRTemp t1 = newTemp(ty);
2942   IRTemp t2 = newTemp(ty);
2943
2944   *decode_OK = True;
2945
2946   modrm = getIByte(delta);
2947
2948   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
2949      /* LOCK prefix only allowed with inc and dec subopcodes */
2950      *decode_OK = False;
2951      return delta;
2952   }
2953
2954   if (epartIsReg(modrm)) {
2955      assign(t1, getIReg(1, eregOfRM(modrm)));
2956      switch (gregOfRM(modrm)) {
2957         case 0: /* INC */
2958            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
2959            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2960            setFlags_INC_DEC( True, t2, ty );
2961            break;
2962         case 1: /* DEC */
2963            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
2964            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2965            setFlags_INC_DEC( False, t2, ty );
2966            break;
2967         default:
2968            *decode_OK = False;
2969            return delta;
2970      }
2971      delta++;
2972      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
2973                      nameIReg(1, eregOfRM(modrm)));
2974   } else {
2975      IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
2976      assign( t1, loadLE(ty, mkexpr(addr)) );
2977      switch (gregOfRM(modrm)) {
2978         case 0: /* INC */
2979            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
2980            if (locked) {
2981               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
2982                      guest_EIP_curr_instr );
2983            } else {
2984               storeLE( mkexpr(addr), mkexpr(t2) );
2985            }
2986            setFlags_INC_DEC( True, t2, ty );
2987            break;
2988         case 1: /* DEC */
2989            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
2990            if (locked) {
2991               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
2992                      guest_EIP_curr_instr );
2993            } else {
2994               storeLE( mkexpr(addr), mkexpr(t2) );
2995            }
2996            setFlags_INC_DEC( False, t2, ty );
2997            break;
2998         default:
2999            *decode_OK = False;
3000            return delta;
3001      }
3002      delta += alen;
3003      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
3004   }
3005   return delta;
3006}
3007
3008
3009/* Group 5 extended opcodes. */
3010static
3011UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
3012                DisResult* dres, Bool* decode_OK )
3013{
3014   Int     len;
3015   UChar   modrm;
3016   HChar   dis_buf[50];
3017   IRTemp  addr = IRTemp_INVALID;
3018   IRType  ty = szToITy(sz);
3019   IRTemp  t1 = newTemp(ty);
3020   IRTemp  t2 = IRTemp_INVALID;
3021
3022   *decode_OK = True;
3023
3024   modrm = getIByte(delta);
3025
3026   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
3027      /* LOCK prefix only allowed with inc and dec subopcodes */
3028      *decode_OK = False;
3029      return delta;
3030   }
3031
3032   if (epartIsReg(modrm)) {
3033      assign(t1, getIReg(sz,eregOfRM(modrm)));
3034      switch (gregOfRM(modrm)) {
3035         case 0: /* INC */
3036            vassert(sz == 2 || sz == 4);
3037            t2 = newTemp(ty);
3038            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3039                             mkexpr(t1), mkU(ty,1)));
3040            setFlags_INC_DEC( True, t2, ty );
3041            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3042            break;
3043         case 1: /* DEC */
3044            vassert(sz == 2 || sz == 4);
3045            t2 = newTemp(ty);
3046            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3047                             mkexpr(t1), mkU(ty,1)));
3048            setFlags_INC_DEC( False, t2, ty );
3049            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3050            break;
3051         case 2: /* call Ev */
3052            vassert(sz == 4);
3053            t2 = newTemp(Ity_I32);
3054            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3055            putIReg(4, R_ESP, mkexpr(t2));
3056            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
3057            jmp_treg(Ijk_Call,t1);
3058            dres->whatNext = Dis_StopHere;
3059            break;
3060         case 4: /* jmp Ev */
3061            vassert(sz == 4);
3062            jmp_treg(Ijk_Boring,t1);
3063            dres->whatNext = Dis_StopHere;
3064            break;
3065         case 6: /* PUSH Ev */
3066            vassert(sz == 4 || sz == 2);
3067            t2 = newTemp(Ity_I32);
3068            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3069            putIReg(4, R_ESP, mkexpr(t2) );
3070            storeLE( mkexpr(t2), mkexpr(t1) );
3071            break;
3072         default:
3073            *decode_OK = False;
3074            return delta;
3075      }
3076      delta++;
3077      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3078                       nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3079   } else {
3080      addr = disAMode ( &len, sorb, delta, dis_buf );
3081      assign(t1, loadLE(ty,mkexpr(addr)));
3082      switch (gregOfRM(modrm)) {
3083         case 0: /* INC */
3084            t2 = newTemp(ty);
3085            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3086                             mkexpr(t1), mkU(ty,1)));
3087            if (locked) {
3088               casLE( mkexpr(addr),
3089                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3090            } else {
3091               storeLE(mkexpr(addr),mkexpr(t2));
3092            }
3093            setFlags_INC_DEC( True, t2, ty );
3094            break;
3095         case 1: /* DEC */
3096            t2 = newTemp(ty);
3097            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3098                             mkexpr(t1), mkU(ty,1)));
3099            if (locked) {
3100               casLE( mkexpr(addr),
3101                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3102            } else {
3103               storeLE(mkexpr(addr),mkexpr(t2));
3104            }
3105            setFlags_INC_DEC( False, t2, ty );
3106            break;
3107         case 2: /* call Ev */
3108            vassert(sz == 4);
3109            t2 = newTemp(Ity_I32);
3110            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3111            putIReg(4, R_ESP, mkexpr(t2));
3112            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
3113            jmp_treg(Ijk_Call,t1);
3114            dres->whatNext = Dis_StopHere;
3115            break;
3116         case 4: /* JMP Ev */
3117            vassert(sz == 4);
3118            jmp_treg(Ijk_Boring,t1);
3119            dres->whatNext = Dis_StopHere;
3120            break;
3121         case 6: /* PUSH Ev */
3122            vassert(sz == 4 || sz == 2);
3123            t2 = newTemp(Ity_I32);
3124            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3125            putIReg(4, R_ESP, mkexpr(t2) );
3126            storeLE( mkexpr(t2), mkexpr(t1) );
3127            break;
3128         default:
3129            *decode_OK = False;
3130            return delta;
3131      }
3132      delta += len;
3133      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3134                       nameISize(sz), dis_buf);
3135   }
3136   return delta;
3137}
3138
3139
3140/*------------------------------------------------------------*/
3141/*--- Disassembling string ops (including REP prefixes)    ---*/
3142/*------------------------------------------------------------*/
3143
3144/* Code shared by all the string ops */
3145static
3146void dis_string_op_increment(Int sz, Int t_inc)
3147{
3148   if (sz == 4 || sz == 2) {
3149      assign( t_inc,
3150              binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
3151                               mkU8(sz/2) ) );
3152   } else {
3153      assign( t_inc,
3154              IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
3155   }
3156}
3157
3158static
3159void dis_string_op( void (*dis_OP)( Int, IRTemp ),
3160                    Int sz, HChar* name, UChar sorb )
3161{
3162   IRTemp t_inc = newTemp(Ity_I32);
3163   vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
3164   dis_string_op_increment(sz, t_inc);
3165   dis_OP( sz, t_inc );
3166   DIP("%s%c\n", name, nameISize(sz));
3167}
3168
3169static
3170void dis_MOVS ( Int sz, IRTemp t_inc )
3171{
3172   IRType ty = szToITy(sz);
3173   IRTemp td = newTemp(Ity_I32);   /* EDI */
3174   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3175
3176   assign( td, getIReg(4, R_EDI) );
3177   assign( ts, getIReg(4, R_ESI) );
3178
3179   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
3180
3181   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3182   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3183}
3184
3185static
3186void dis_LODS ( Int sz, IRTemp t_inc )
3187{
3188   IRType ty = szToITy(sz);
3189   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3190
3191   assign( ts, getIReg(4, R_ESI) );
3192
3193   putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
3194
3195   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3196}
3197
3198static
3199void dis_STOS ( Int sz, IRTemp t_inc )
3200{
3201   IRType ty = szToITy(sz);
3202   IRTemp ta = newTemp(ty);        /* EAX */
3203   IRTemp td = newTemp(Ity_I32);   /* EDI */
3204
3205   assign( ta, getIReg(sz, R_EAX) );
3206   assign( td, getIReg(4, R_EDI) );
3207
3208   storeLE( mkexpr(td), mkexpr(ta) );
3209
3210   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3211}
3212
3213static
3214void dis_CMPS ( Int sz, IRTemp t_inc )
3215{
3216   IRType ty  = szToITy(sz);
3217   IRTemp tdv = newTemp(ty);      /* (EDI) */
3218   IRTemp tsv = newTemp(ty);      /* (ESI) */
3219   IRTemp td  = newTemp(Ity_I32); /*  EDI  */
3220   IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
3221
3222   assign( td, getIReg(4, R_EDI) );
3223   assign( ts, getIReg(4, R_ESI) );
3224
3225   assign( tdv, loadLE(ty,mkexpr(td)) );
3226   assign( tsv, loadLE(ty,mkexpr(ts)) );
3227
3228   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
3229
3230   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3231   putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3232}
3233
3234static
3235void dis_SCAS ( Int sz, IRTemp t_inc )
3236{
3237   IRType ty  = szToITy(sz);
3238   IRTemp ta  = newTemp(ty);       /*  EAX  */
3239   IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
3240   IRTemp tdv = newTemp(ty);       /* (EDI) */
3241
3242   assign( ta, getIReg(sz, R_EAX) );
3243   assign( td, getIReg(4, R_EDI) );
3244
3245   assign( tdv, loadLE(ty,mkexpr(td)) );
3246   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
3247
3248   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3249}
3250
3251
3252/* Wrap the appropriate string op inside a REP/REPE/REPNE.
3253   We assume the insn is the last one in the basic block, and so emit a jump
3254   to the next insn, rather than just falling through. */
3255static
3256void dis_REP_op ( X86Condcode cond,
3257                  void (*dis_OP)(Int, IRTemp),
3258                  Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
3259{
3260   IRTemp t_inc = newTemp(Ity_I32);
3261   IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
3262
3263   assign( tc, getIReg(4,R_ECX) );
3264
3265   stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
3266                      Ijk_Boring,
3267                      IRConst_U32(eip_next) ) );
3268
3269   putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
3270
3271   dis_string_op_increment(sz, t_inc);
3272   dis_OP (sz, t_inc);
3273
3274   if (cond == X86CondAlways) {
3275      jmp_lit(Ijk_Boring,eip);
3276   } else {
3277      stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
3278                         Ijk_Boring,
3279                         IRConst_U32(eip) ) );
3280      jmp_lit(Ijk_Boring,eip_next);
3281   }
3282   DIP("%s%c\n", name, nameISize(sz));
3283}
3284
3285
3286/*------------------------------------------------------------*/
3287/*--- Arithmetic, etc.                                     ---*/
3288/*------------------------------------------------------------*/
3289
3290/* IMUL E, G.  Supplied eip points to the modR/M byte. */
3291static
3292UInt dis_mul_E_G ( UChar       sorb,
3293                   Int         size,
3294                   Int         delta0 )
3295{
3296   Int    alen;
3297   HChar  dis_buf[50];
3298   UChar  rm = getIByte(delta0);
3299   IRType ty = szToITy(size);
3300   IRTemp te = newTemp(ty);
3301   IRTemp tg = newTemp(ty);
3302   IRTemp resLo = newTemp(ty);
3303
3304   assign( tg, getIReg(size, gregOfRM(rm)) );
3305   if (epartIsReg(rm)) {
3306      assign( te, getIReg(size, eregOfRM(rm)) );
3307   } else {
3308      IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
3309      assign( te, loadLE(ty,mkexpr(addr)) );
3310   }
3311
3312   setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
3313
3314   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
3315
3316   putIReg(size, gregOfRM(rm), mkexpr(resLo) );
3317
3318   if (epartIsReg(rm)) {
3319      DIP("imul%c %s, %s\n", nameISize(size),
3320                             nameIReg(size,eregOfRM(rm)),
3321                             nameIReg(size,gregOfRM(rm)));
3322      return 1+delta0;
3323   } else {
3324      DIP("imul%c %s, %s\n", nameISize(size),
3325                             dis_buf, nameIReg(size,gregOfRM(rm)));
3326      return alen+delta0;
3327   }
3328}
3329
3330
3331/* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
3332static
3333UInt dis_imul_I_E_G ( UChar       sorb,
3334                      Int         size,
3335                      Int         delta,
3336                      Int         litsize )
3337{
3338   Int    d32, alen;
3339   HChar  dis_buf[50];
3340   UChar  rm = getIByte(delta);
3341   IRType ty = szToITy(size);
3342   IRTemp te = newTemp(ty);
3343   IRTemp tl = newTemp(ty);
3344   IRTemp resLo = newTemp(ty);
3345
3346   vassert(size == 1 || size == 2 || size == 4);
3347
3348   if (epartIsReg(rm)) {
3349      assign(te, getIReg(size, eregOfRM(rm)));
3350      delta++;
3351   } else {
3352      IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
3353      assign(te, loadLE(ty, mkexpr(addr)));
3354      delta += alen;
3355   }
3356   d32 = getSDisp(litsize,delta);
3357   delta += litsize;
3358
3359   if (size == 1) d32 &= 0xFF;
3360   if (size == 2) d32 &= 0xFFFF;
3361
3362   assign(tl, mkU(ty,d32));
3363
3364   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
3365
3366   setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
3367
3368   putIReg(size, gregOfRM(rm), mkexpr(resLo));
3369
3370   DIP("imul %d, %s, %s\n", d32,
3371       ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
3372       nameIReg(size,gregOfRM(rm)) );
3373   return delta;
3374}
3375
3376
3377/* Generate an IR sequence to do a count-leading-zeroes operation on
3378   the supplied IRTemp, and return a new IRTemp holding the result.
3379   'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
3380   argument is zero, return the number of bits in the word (the
3381   natural semantics). */
3382static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
3383{
3384   vassert(ty == Ity_I32 || ty == Ity_I16);
3385
3386   IRTemp src32 = newTemp(Ity_I32);
3387   assign(src32, widenUto32( mkexpr(src) ));
3388
3389   IRTemp src32x = newTemp(Ity_I32);
3390   assign(src32x,
3391          binop(Iop_Shl32, mkexpr(src32),
3392                           mkU8(32 - 8 * sizeofIRType(ty))));
3393
3394   // Clz32 has undefined semantics when its input is zero, so
3395   // special-case around that.
3396   IRTemp res32 = newTemp(Ity_I32);
3397   assign(res32,
3398          IRExpr_Mux0X(
3399             unop(Iop_1Uto8,
3400                  binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))),
3401             unop(Iop_Clz32, mkexpr(src32x)),
3402             mkU32(8 * sizeofIRType(ty))
3403   ));
3404
3405   IRTemp res = newTemp(ty);
3406   assign(res, narrowTo(ty, mkexpr(res32)));
3407   return res;
3408}
3409
3410
3411/*------------------------------------------------------------*/
3412/*---                                                      ---*/
3413/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
3414/*---                                                      ---*/
3415/*------------------------------------------------------------*/
3416
3417/* --- Helper functions for dealing with the register stack. --- */
3418
3419/* --- Set the emulation-warning pseudo-register. --- */
3420
3421static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
3422{
3423   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3424   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
3425}
3426
3427/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
3428
3429static IRExpr* mkQNaN64 ( void )
3430{
3431  /* QNaN is 0 2047 1 0(51times)
3432     == 0b 11111111111b 1 0(51times)
3433     == 0x7FF8 0000 0000 0000
3434   */
3435   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
3436}
3437
3438/* --------- Get/put the top-of-stack pointer. --------- */
3439
3440static IRExpr* get_ftop ( void )
3441{
3442   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
3443}
3444
3445static void put_ftop ( IRExpr* e )
3446{
3447   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3448   stmt( IRStmt_Put( OFFB_FTOP, e ) );
3449}
3450
3451/* --------- Get/put the C3210 bits. --------- */
3452
3453static IRExpr* get_C3210 ( void )
3454{
3455   return IRExpr_Get( OFFB_FC3210, Ity_I32 );
3456}
3457
3458static void put_C3210 ( IRExpr* e )
3459{
3460   stmt( IRStmt_Put( OFFB_FC3210, e ) );
3461}
3462
3463/* --------- Get/put the FPU rounding mode. --------- */
3464static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
3465{
3466   return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
3467}
3468
3469static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
3470{
3471   stmt( IRStmt_Put( OFFB_FPROUND, e ) );
3472}
3473
3474
3475/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
3476/* Produces a value in 0 .. 3, which is encoded as per the type
3477   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
3478   per IRRoundingMode, we merely need to get it and mask it for
3479   safety.
3480*/
3481static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
3482{
3483   return binop( Iop_And32, get_fpround(), mkU32(3) );
3484}
3485
3486static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
3487{
3488   return mkU32(Irrm_NEAREST);
3489}
3490
3491
3492/* --------- Get/set FP register tag bytes. --------- */
3493
3494/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
3495
3496static void put_ST_TAG ( Int i, IRExpr* value )
3497{
3498   IRRegArray* descr;
3499   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
3500   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3501   stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
3502}
3503
3504/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
3505   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
3506
3507static IRExpr* get_ST_TAG ( Int i )
3508{
3509   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3510   return IRExpr_GetI( descr, get_ftop(), i );
3511}
3512
3513
3514/* --------- Get/set FP registers. --------- */
3515
3516/* Given i, and some expression e, emit 'ST(i) = e' and set the
3517   register's tag to indicate the register is full.  The previous
3518   state of the register is not checked. */
3519
3520static void put_ST_UNCHECKED ( Int i, IRExpr* value )
3521{
3522   IRRegArray* descr;
3523   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
3524   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3525   stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
3526   /* Mark the register as in-use. */
3527   put_ST_TAG(i, mkU8(1));
3528}
3529
3530/* Given i, and some expression e, emit
3531      ST(i) = is_full(i) ? NaN : e
3532   and set the tag accordingly.
3533*/
3534
3535static void put_ST ( Int i, IRExpr* value )
3536{
3537   put_ST_UNCHECKED( i,
3538                     IRExpr_Mux0X( get_ST_TAG(i),
3539                                   /* 0 means empty */
3540                                   value,
3541                                   /* non-0 means full */
3542                                   mkQNaN64()
3543                   )
3544   );
3545}
3546
3547
3548/* Given i, generate an expression yielding 'ST(i)'. */
3549
3550static IRExpr* get_ST_UNCHECKED ( Int i )
3551{
3552   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3553   return IRExpr_GetI( descr, get_ftop(), i );
3554}
3555
3556
3557/* Given i, generate an expression yielding
3558  is_full(i) ? ST(i) : NaN
3559*/
3560
3561static IRExpr* get_ST ( Int i )
3562{
3563   return
3564      IRExpr_Mux0X( get_ST_TAG(i),
3565                    /* 0 means empty */
3566                    mkQNaN64(),
3567                    /* non-0 means full */
3568                    get_ST_UNCHECKED(i));
3569}
3570
3571
3572/* Adjust FTOP downwards by one register. */
3573
3574static void fp_push ( void )
3575{
3576   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
3577}
3578
3579/* Adjust FTOP upwards by one register, and mark the vacated register
3580   as empty.  */
3581
3582static void fp_pop ( void )
3583{
3584   put_ST_TAG(0, mkU8(0));
3585   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
3586}
3587
3588/* Clear the C2 bit of the FPU status register, for
3589   sin/cos/tan/sincos. */
3590
3591static void clear_C2 ( void )
3592{
3593   put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) );
3594}
3595
3596/* Invent a plausible-looking FPU status word value:
3597      ((ftop & 7) << 11) | (c3210 & 0x4700)
3598 */
3599static IRExpr* get_FPU_sw ( void )
3600{
3601   return
3602      unop(Iop_32to16,
3603           binop(Iop_Or32,
3604                 binop(Iop_Shl32,
3605                       binop(Iop_And32, get_ftop(), mkU32(7)),
3606                             mkU8(11)),
3607                       binop(Iop_And32, get_C3210(), mkU32(0x4700))
3608      ));
3609}
3610
3611
3612/* ------------------------------------------------------- */
3613/* Given all that stack-mangling junk, we can now go ahead
3614   and describe FP instructions.
3615*/
3616
3617/* ST(0) = ST(0) `op` mem64/32(addr)
3618   Need to check ST(0)'s tag on read, but not on write.
3619*/
3620static
3621void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
3622                         IROp op, Bool dbl )
3623{
3624   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3625   if (dbl) {
3626      put_ST_UNCHECKED(0,
3627         triop( op,
3628                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3629                get_ST(0),
3630                loadLE(Ity_F64,mkexpr(addr))
3631         ));
3632   } else {
3633      put_ST_UNCHECKED(0,
3634         triop( op,
3635                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3636                get_ST(0),
3637                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
3638         ));
3639   }
3640}
3641
3642
3643/* ST(0) = mem64/32(addr) `op` ST(0)
3644   Need to check ST(0)'s tag on read, but not on write.
3645*/
3646static
3647void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
3648                            IROp op, Bool dbl )
3649{
3650   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3651   if (dbl) {
3652      put_ST_UNCHECKED(0,
3653         triop( op,
3654                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3655                loadLE(Ity_F64,mkexpr(addr)),
3656                get_ST(0)
3657         ));
3658   } else {
3659      put_ST_UNCHECKED(0,
3660         triop( op,
3661                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3662                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
3663                get_ST(0)
3664         ));
3665   }
3666}
3667
3668
3669/* ST(dst) = ST(dst) `op` ST(src).
3670   Check dst and src tags when reading but not on write.
3671*/
3672static
3673void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3674                      Bool pop_after )
3675{
3676   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3677                                 (Int)st_src, (Int)st_dst );
3678   put_ST_UNCHECKED(
3679      st_dst,
3680      triop( op,
3681             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3682             get_ST(st_dst),
3683             get_ST(st_src) )
3684   );
3685   if (pop_after)
3686      fp_pop();
3687}
3688
3689/* ST(dst) = ST(src) `op` ST(dst).
3690   Check dst and src tags when reading but not on write.
3691*/
3692static
3693void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3694                         Bool pop_after )
3695{
3696   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3697                                 (Int)st_src, (Int)st_dst );
3698   put_ST_UNCHECKED(
3699      st_dst,
3700      triop( op,
3701             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3702             get_ST(st_src),
3703             get_ST(st_dst) )
3704   );
3705   if (pop_after)
3706      fp_pop();
3707}
3708
3709/* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
3710static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
3711{
3712   DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
3713   /* This is a bit of a hack (and isn't really right).  It sets
3714      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
3715      documentation implies A and S are unchanged.
3716   */
3717   /* It's also fishy in that it is used both for COMIP and
3718      UCOMIP, and they aren't the same (although similar). */
3719   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
3720   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
3721   stmt( IRStmt_Put( OFFB_CC_DEP1,
3722                     binop( Iop_And32,
3723                            binop(Iop_CmpF64, get_ST(0), get_ST(i)),
3724                            mkU32(0x45)
3725       )));
3726   /* Set NDEP even though it isn't used.  This makes redundant-PUT
3727      elimination of previous stores to this field work better. */
3728   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
3729   if (pop_after)
3730      fp_pop();
3731}
3732
3733
3734static
3735UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
3736{
3737   Int    len;
3738   UInt   r_src, r_dst;
3739   HChar  dis_buf[50];
3740   IRTemp t1, t2;
3741
3742   /* On entry, delta points at the second byte of the insn (the modrm
3743      byte).*/
3744   UChar first_opcode = getIByte(delta-1);
3745   UChar modrm        = getIByte(delta+0);
3746
3747   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
3748
3749   if (first_opcode == 0xD8) {
3750      if (modrm < 0xC0) {
3751
3752         /* bits 5,4,3 are an opcode extension, and the modRM also
3753           specifies an address. */
3754         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3755         delta += len;
3756
3757         switch (gregOfRM(modrm)) {
3758
3759            case 0: /* FADD single-real */
3760               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
3761               break;
3762
3763            case 1: /* FMUL single-real */
3764               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
3765               break;
3766
3767            case 2: /* FCOM single-real */
3768               DIP("fcoms %s\n", dis_buf);
3769               /* This forces C1 to zero, which isn't right. */
3770               put_C3210(
3771                   binop( Iop_And32,
3772                          binop(Iop_Shl32,
3773                                binop(Iop_CmpF64,
3774                                      get_ST(0),
3775                                      unop(Iop_F32toF64,
3776                                           loadLE(Ity_F32,mkexpr(addr)))),
3777                                mkU8(8)),
3778                          mkU32(0x4500)
3779                   ));
3780               break;
3781
3782            case 3: /* FCOMP single-real */
3783               DIP("fcomps %s\n", dis_buf);
3784               /* This forces C1 to zero, which isn't right. */
3785               put_C3210(
3786                   binop( Iop_And32,
3787                          binop(Iop_Shl32,
3788                                binop(Iop_CmpF64,
3789                                      get_ST(0),
3790                                      unop(Iop_F32toF64,
3791                                           loadLE(Ity_F32,mkexpr(addr)))),
3792                                mkU8(8)),
3793                          mkU32(0x4500)
3794                   ));
3795               fp_pop();
3796               break;
3797
3798            case 4: /* FSUB single-real */
3799               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
3800               break;
3801
3802            case 5: /* FSUBR single-real */
3803               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
3804               break;
3805
3806            case 6: /* FDIV single-real */
3807               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
3808               break;
3809
3810            case 7: /* FDIVR single-real */
3811               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
3812               break;
3813
3814            default:
3815               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
3816               vex_printf("first_opcode == 0xD8\n");
3817               goto decode_fail;
3818         }
3819      } else {
3820         delta++;
3821         switch (modrm) {
3822
3823            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
3824               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
3825               break;
3826
3827            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
3828               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
3829               break;
3830
3831            /* Dunno if this is right */
3832            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
3833               r_dst = (UInt)modrm - 0xD0;
3834               DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
3835               /* This forces C1 to zero, which isn't right. */
3836               put_C3210(
3837                   binop( Iop_And32,
3838                          binop(Iop_Shl32,
3839                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3840                                mkU8(8)),
3841                          mkU32(0x4500)
3842                   ));
3843               break;
3844
3845            /* Dunno if this is right */
3846            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
3847               r_dst = (UInt)modrm - 0xD8;
3848               DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
3849               /* This forces C1 to zero, which isn't right. */
3850               put_C3210(
3851                   binop( Iop_And32,
3852                          binop(Iop_Shl32,
3853                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3854                                mkU8(8)),
3855                          mkU32(0x4500)
3856                   ));
3857               fp_pop();
3858               break;
3859
3860            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
3861               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
3862               break;
3863
3864            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
3865               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
3866               break;
3867
3868            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
3869               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
3870               break;
3871
3872            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
3873               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
3874               break;
3875
3876            default:
3877               goto decode_fail;
3878         }
3879      }
3880   }
3881
3882   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
3883   else
3884   if (first_opcode == 0xD9) {
3885      if (modrm < 0xC0) {
3886
3887         /* bits 5,4,3 are an opcode extension, and the modRM also
3888            specifies an address. */
3889         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3890         delta += len;
3891
3892         switch (gregOfRM(modrm)) {
3893
3894            case 0: /* FLD single-real */
3895               DIP("flds %s\n", dis_buf);
3896               fp_push();
3897               put_ST(0, unop(Iop_F32toF64,
3898                              loadLE(Ity_F32, mkexpr(addr))));
3899               break;
3900
3901            case 2: /* FST single-real */
3902               DIP("fsts %s\n", dis_buf);
3903               storeLE(mkexpr(addr),
3904                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
3905               break;
3906
3907            case 3: /* FSTP single-real */
3908               DIP("fstps %s\n", dis_buf);
3909               storeLE(mkexpr(addr),
3910                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
3911               fp_pop();
3912               break;
3913
3914            case 4: { /* FLDENV m28 */
3915               /* Uses dirty helper:
3916                     VexEmWarn x86g_do_FLDENV ( VexGuestX86State*, HWord ) */
3917               IRTemp   ew = newTemp(Ity_I32);
3918               IRDirty* d  = unsafeIRDirty_0_N (
3919                                0/*regparms*/,
3920                                "x86g_dirtyhelper_FLDENV",
3921                                &x86g_dirtyhelper_FLDENV,
3922                                mkIRExprVec_1( mkexpr(addr) )
3923                             );
3924               d->needsBBP = True;
3925               d->tmp      = ew;
3926               /* declare we're reading memory */
3927               d->mFx   = Ifx_Read;
3928               d->mAddr = mkexpr(addr);
3929               d->mSize = 28;
3930
3931               /* declare we're writing guest state */
3932               d->nFxState = 4;
3933
3934               d->fxState[0].fx     = Ifx_Write;
3935               d->fxState[0].offset = OFFB_FTOP;
3936               d->fxState[0].size   = sizeof(UInt);
3937
3938               d->fxState[1].fx     = Ifx_Write;
3939               d->fxState[1].offset = OFFB_FPTAGS;
3940               d->fxState[1].size   = 8 * sizeof(UChar);
3941
3942               d->fxState[2].fx     = Ifx_Write;
3943               d->fxState[2].offset = OFFB_FPROUND;
3944               d->fxState[2].size   = sizeof(UInt);
3945
3946               d->fxState[3].fx     = Ifx_Write;
3947               d->fxState[3].offset = OFFB_FC3210;
3948               d->fxState[3].size   = sizeof(UInt);
3949
3950               stmt( IRStmt_Dirty(d) );
3951
3952               /* ew contains any emulation warning we may need to
3953                  issue.  If needed, side-exit to the next insn,
3954                  reporting the warning, so that Valgrind's dispatcher
3955                  sees the warning. */
3956               put_emwarn( mkexpr(ew) );
3957               stmt(
3958                  IRStmt_Exit(
3959                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
3960                     Ijk_EmWarn,
3961                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
3962                  )
3963               );
3964
3965               DIP("fldenv %s\n", dis_buf);
3966               break;
3967            }
3968
3969            case 5: {/* FLDCW */
3970               /* The only thing we observe in the control word is the
3971                  rounding mode.  Therefore, pass the 16-bit value
3972                  (x87 native-format control word) to a clean helper,
3973                  getting back a 64-bit value, the lower half of which
3974                  is the FPROUND value to store, and the upper half of
3975                  which is the emulation-warning token which may be
3976                  generated.
3977               */
3978               /* ULong x86h_check_fldcw ( UInt ); */
3979               IRTemp t64 = newTemp(Ity_I64);
3980               IRTemp ew = newTemp(Ity_I32);
3981               DIP("fldcw %s\n", dis_buf);
3982               assign( t64, mkIRExprCCall(
3983                               Ity_I64, 0/*regparms*/,
3984                               "x86g_check_fldcw",
3985                               &x86g_check_fldcw,
3986                               mkIRExprVec_1(
3987                                  unop( Iop_16Uto32,
3988                                        loadLE(Ity_I16, mkexpr(addr)))
3989                               )
3990                            )
3991                     );
3992
3993               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
3994               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
3995               put_emwarn( mkexpr(ew) );
3996               /* Finally, if an emulation warning was reported,
3997                  side-exit to the next insn, reporting the warning,
3998                  so that Valgrind's dispatcher sees the warning. */
3999               stmt(
4000                  IRStmt_Exit(
4001                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4002                     Ijk_EmWarn,
4003                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
4004                  )
4005               );
4006               break;
4007            }
4008
4009            case 6: { /* FNSTENV m28 */
4010               /* Uses dirty helper:
4011                     void x86g_do_FSTENV ( VexGuestX86State*, HWord ) */
4012               IRDirty* d = unsafeIRDirty_0_N (
4013                               0/*regparms*/,
4014                               "x86g_dirtyhelper_FSTENV",
4015                               &x86g_dirtyhelper_FSTENV,
4016                               mkIRExprVec_1( mkexpr(addr) )
4017                            );
4018               d->needsBBP = True;
4019               /* declare we're writing memory */
4020               d->mFx   = Ifx_Write;
4021               d->mAddr = mkexpr(addr);
4022               d->mSize = 28;
4023
4024               /* declare we're reading guest state */
4025               d->nFxState = 4;
4026
4027               d->fxState[0].fx     = Ifx_Read;
4028               d->fxState[0].offset = OFFB_FTOP;
4029               d->fxState[0].size   = sizeof(UInt);
4030
4031               d->fxState[1].fx     = Ifx_Read;
4032               d->fxState[1].offset = OFFB_FPTAGS;
4033               d->fxState[1].size   = 8 * sizeof(UChar);
4034
4035               d->fxState[2].fx     = Ifx_Read;
4036               d->fxState[2].offset = OFFB_FPROUND;
4037               d->fxState[2].size   = sizeof(UInt);
4038
4039               d->fxState[3].fx     = Ifx_Read;
4040               d->fxState[3].offset = OFFB_FC3210;
4041               d->fxState[3].size   = sizeof(UInt);
4042
4043               stmt( IRStmt_Dirty(d) );
4044
4045               DIP("fnstenv %s\n", dis_buf);
4046               break;
4047            }
4048
4049            case 7: /* FNSTCW */
4050              /* Fake up a native x87 FPU control word.  The only
4051                 thing it depends on is FPROUND[1:0], so call a clean
4052                 helper to cook it up. */
4053               /* UInt x86h_create_fpucw ( UInt fpround ) */
4054               DIP("fnstcw %s\n", dis_buf);
4055               storeLE(
4056                  mkexpr(addr),
4057                  unop( Iop_32to16,
4058                        mkIRExprCCall(
4059                           Ity_I32, 0/*regp*/,
4060                           "x86g_create_fpucw", &x86g_create_fpucw,
4061                           mkIRExprVec_1( get_fpround() )
4062                        )
4063                  )
4064               );
4065               break;
4066
4067            default:
4068               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4069               vex_printf("first_opcode == 0xD9\n");
4070               goto decode_fail;
4071         }
4072
4073      } else {
4074         delta++;
4075         switch (modrm) {
4076
4077            case 0xC0 ... 0xC7: /* FLD %st(?) */
4078               r_src = (UInt)modrm - 0xC0;
4079               DIP("fld %%st(%d)\n", (Int)r_src);
4080               t1 = newTemp(Ity_F64);
4081               assign(t1, get_ST(r_src));
4082               fp_push();
4083               put_ST(0, mkexpr(t1));
4084               break;
4085
4086            case 0xC8 ... 0xCF: /* FXCH %st(?) */
4087               r_src = (UInt)modrm - 0xC8;
4088               DIP("fxch %%st(%d)\n", (Int)r_src);
4089               t1 = newTemp(Ity_F64);
4090               t2 = newTemp(Ity_F64);
4091               assign(t1, get_ST(0));
4092               assign(t2, get_ST(r_src));
4093               put_ST_UNCHECKED(0, mkexpr(t2));
4094               put_ST_UNCHECKED(r_src, mkexpr(t1));
4095               break;
4096
4097            case 0xE0: /* FCHS */
4098               DIP("fchs\n");
4099               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
4100               break;
4101
4102            case 0xE1: /* FABS */
4103               DIP("fabs\n");
4104               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
4105               break;
4106
4107            case 0xE4: /* FTST */
4108               DIP("ftst\n");
4109               /* This forces C1 to zero, which isn't right. */
4110               /* Well, in fact the Intel docs say (bizarrely): "C1 is
4111                  set to 0 if stack underflow occurred; otherwise, set
4112                  to 0" which is pretty nonsensical.  I guess it's a
4113                   typo. */
4114               put_C3210(
4115                   binop( Iop_And32,
4116                          binop(Iop_Shl32,
4117                                binop(Iop_CmpF64,
4118                                      get_ST(0),
4119                                      IRExpr_Const(IRConst_F64i(0x0ULL))),
4120                                mkU8(8)),
4121                          mkU32(0x4500)
4122                   ));
4123               break;
4124
4125            case 0xE5: { /* FXAM */
4126               /* This is an interesting one.  It examines %st(0),
4127                  regardless of whether the tag says it's empty or not.
4128                  Here, just pass both the tag (in our format) and the
4129                  value (as a double, actually a ULong) to a helper
4130                  function. */
4131               IRExpr** args
4132                  = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
4133                                   unop(Iop_ReinterpF64asI64,
4134                                        get_ST_UNCHECKED(0)) );
4135               put_C3210(mkIRExprCCall(
4136                            Ity_I32,
4137                            0/*regparm*/,
4138                            "x86g_calculate_FXAM", &x86g_calculate_FXAM,
4139                            args
4140                        ));
4141               DIP("fxam\n");
4142               break;
4143            }
4144
4145            case 0xE8: /* FLD1 */
4146               DIP("fld1\n");
4147               fp_push();
4148               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
4149               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
4150               break;
4151
4152            case 0xE9: /* FLDL2T */
4153               DIP("fldl2t\n");
4154               fp_push();
4155               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
4156               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
4157               break;
4158
4159            case 0xEA: /* FLDL2E */
4160               DIP("fldl2e\n");
4161               fp_push();
4162               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
4163               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
4164               break;
4165
4166            case 0xEB: /* FLDPI */
4167               DIP("fldpi\n");
4168               fp_push();
4169               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
4170               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
4171               break;
4172
4173            case 0xEC: /* FLDLG2 */
4174               DIP("fldlg2\n");
4175               fp_push();
4176               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
4177               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
4178               break;
4179
4180            case 0xED: /* FLDLN2 */
4181               DIP("fldln2\n");
4182               fp_push();
4183               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
4184               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
4185               break;
4186
4187            case 0xEE: /* FLDZ */
4188               DIP("fldz\n");
4189               fp_push();
4190               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
4191               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
4192               break;
4193
4194            case 0xF0: /* F2XM1 */
4195               DIP("f2xm1\n");
4196               put_ST_UNCHECKED(0,
4197                  binop(Iop_2xm1F64,
4198                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4199                        get_ST(0)));
4200               break;
4201
4202            case 0xF1: /* FYL2X */
4203               DIP("fyl2x\n");
4204               put_ST_UNCHECKED(1,
4205                  triop(Iop_Yl2xF64,
4206                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4207                        get_ST(1),
4208                        get_ST(0)));
4209               fp_pop();
4210               break;
4211
4212            case 0xF2: /* FPTAN */
4213               DIP("ftan\n");
4214               put_ST_UNCHECKED(0,
4215                  binop(Iop_TanF64,
4216                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4217                        get_ST(0)));
4218               fp_push();
4219               put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
4220               clear_C2(); /* HACK */
4221               break;
4222
4223            case 0xF3: /* FPATAN */
4224               DIP("fpatan\n");
4225               put_ST_UNCHECKED(1,
4226                  triop(Iop_AtanF64,
4227                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4228                        get_ST(1),
4229                        get_ST(0)));
4230               fp_pop();
4231               break;
4232
4233            case 0xF4: { /* FXTRACT */
4234               IRTemp argF = newTemp(Ity_F64);
4235               IRTemp sigF = newTemp(Ity_F64);
4236               IRTemp expF = newTemp(Ity_F64);
4237               IRTemp argI = newTemp(Ity_I64);
4238               IRTemp sigI = newTemp(Ity_I64);
4239               IRTemp expI = newTemp(Ity_I64);
4240               DIP("fxtract\n");
4241               assign( argF, get_ST(0) );
4242               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
4243               assign( sigI,
4244                       mkIRExprCCall(
4245                          Ity_I64, 0/*regparms*/,
4246                          "x86amd64g_calculate_FXTRACT",
4247                          &x86amd64g_calculate_FXTRACT,
4248                          mkIRExprVec_2( mkexpr(argI),
4249                                         mkIRExpr_HWord(0)/*sig*/ ))
4250               );
4251               assign( expI,
4252                       mkIRExprCCall(
4253                          Ity_I64, 0/*regparms*/,
4254                          "x86amd64g_calculate_FXTRACT",
4255                          &x86amd64g_calculate_FXTRACT,
4256                          mkIRExprVec_2( mkexpr(argI),
4257                                         mkIRExpr_HWord(1)/*exp*/ ))
4258               );
4259               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
4260               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
4261               /* exponent */
4262               put_ST_UNCHECKED(0, mkexpr(expF) );
4263               fp_push();
4264               /* significand */
4265               put_ST(0, mkexpr(sigF) );
4266               break;
4267            }
4268
4269            case 0xF5: { /* FPREM1 -- IEEE compliant */
4270               IRTemp a1 = newTemp(Ity_F64);
4271               IRTemp a2 = newTemp(Ity_F64);
4272               DIP("fprem1\n");
4273               /* Do FPREM1 twice, once to get the remainder, and once
4274                  to get the C3210 flag values. */
4275               assign( a1, get_ST(0) );
4276               assign( a2, get_ST(1) );
4277               put_ST_UNCHECKED(0,
4278                  triop(Iop_PRem1F64,
4279                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4280                        mkexpr(a1),
4281                        mkexpr(a2)));
4282               put_C3210(
4283                  triop(Iop_PRem1C3210F64,
4284                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4285                        mkexpr(a1),
4286                        mkexpr(a2)) );
4287               break;
4288            }
4289
4290            case 0xF7: /* FINCSTP */
4291               DIP("fprem\n");
4292               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
4293               break;
4294
4295            case 0xF8: { /* FPREM -- not IEEE compliant */
4296               IRTemp a1 = newTemp(Ity_F64);
4297               IRTemp a2 = newTemp(Ity_F64);
4298               DIP("fprem\n");
4299               /* Do FPREM twice, once to get the remainder, and once
4300                  to get the C3210 flag values. */
4301               assign( a1, get_ST(0) );
4302               assign( a2, get_ST(1) );
4303               put_ST_UNCHECKED(0,
4304                  triop(Iop_PRemF64,
4305                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4306                        mkexpr(a1),
4307                        mkexpr(a2)));
4308               put_C3210(
4309                  triop(Iop_PRemC3210F64,
4310                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4311                        mkexpr(a1),
4312                        mkexpr(a2)) );
4313               break;
4314            }
4315
4316            case 0xF9: /* FYL2XP1 */
4317               DIP("fyl2xp1\n");
4318               put_ST_UNCHECKED(1,
4319                  triop(Iop_Yl2xp1F64,
4320                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4321                        get_ST(1),
4322                        get_ST(0)));
4323               fp_pop();
4324               break;
4325
4326            case 0xFA: /* FSQRT */
4327               DIP("fsqrt\n");
4328               put_ST_UNCHECKED(0,
4329                  binop(Iop_SqrtF64,
4330                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4331                        get_ST(0)));
4332               break;
4333
4334            case 0xFB: { /* FSINCOS */
4335               IRTemp a1 = newTemp(Ity_F64);
4336               assign( a1, get_ST(0) );
4337               DIP("fsincos\n");
4338               put_ST_UNCHECKED(0,
4339                  binop(Iop_SinF64,
4340                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4341                        mkexpr(a1)));
4342               fp_push();
4343               put_ST(0,
4344                  binop(Iop_CosF64,
4345                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4346                        mkexpr(a1)));
4347               clear_C2(); /* HACK */
4348               break;
4349            }
4350
4351            case 0xFC: /* FRNDINT */
4352               DIP("frndint\n");
4353               put_ST_UNCHECKED(0,
4354                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
4355               break;
4356
4357            case 0xFD: /* FSCALE */
4358               DIP("fscale\n");
4359               put_ST_UNCHECKED(0,
4360                  triop(Iop_ScaleF64,
4361                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4362                        get_ST(0),
4363                        get_ST(1)));
4364               break;
4365
4366            case 0xFE: /* FSIN */
4367               DIP("fsin\n");
4368               put_ST_UNCHECKED(0,
4369                  binop(Iop_SinF64,
4370                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4371                        get_ST(0)));
4372               clear_C2(); /* HACK */
4373               break;
4374
4375            case 0xFF: /* FCOS */
4376               DIP("fcos\n");
4377               put_ST_UNCHECKED(0,
4378                  binop(Iop_CosF64,
4379                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4380                        get_ST(0)));
4381               clear_C2(); /* HACK */
4382               break;
4383
4384            default:
4385               goto decode_fail;
4386         }
4387      }
4388   }
4389
4390   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
4391   else
4392   if (first_opcode == 0xDA) {
4393
4394      if (modrm < 0xC0) {
4395
4396         /* bits 5,4,3 are an opcode extension, and the modRM also
4397            specifies an address. */
4398         IROp   fop;
4399         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4400         delta += len;
4401         switch (gregOfRM(modrm)) {
4402
4403            case 0: /* FIADD m32int */ /* ST(0) += m32int */
4404               DIP("fiaddl %s\n", dis_buf);
4405               fop = Iop_AddF64;
4406               goto do_fop_m32;
4407
4408            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
4409               DIP("fimull %s\n", dis_buf);
4410               fop = Iop_MulF64;
4411               goto do_fop_m32;
4412
4413            case 2: /* FICOM m32int */
4414               DIP("ficoml %s\n", dis_buf);
4415               /* This forces C1 to zero, which isn't right. */
4416               put_C3210(
4417                   binop( Iop_And32,
4418                          binop(Iop_Shl32,
4419                                binop(Iop_CmpF64,
4420                                      get_ST(0),
4421                                      unop(Iop_I32StoF64,
4422                                           loadLE(Ity_I32,mkexpr(addr)))),
4423                                mkU8(8)),
4424                          mkU32(0x4500)
4425                   ));
4426               break;
4427
4428            case 3: /* FICOMP m32int */
4429               DIP("ficompl %s\n", dis_buf);
4430               /* This forces C1 to zero, which isn't right. */
4431               put_C3210(
4432                   binop( Iop_And32,
4433                          binop(Iop_Shl32,
4434                                binop(Iop_CmpF64,
4435                                      get_ST(0),
4436                                      unop(Iop_I32StoF64,
4437                                           loadLE(Ity_I32,mkexpr(addr)))),
4438                                mkU8(8)),
4439                          mkU32(0x4500)
4440                   ));
4441               fp_pop();
4442               break;
4443
4444            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
4445               DIP("fisubl %s\n", dis_buf);
4446               fop = Iop_SubF64;
4447               goto do_fop_m32;
4448
4449            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
4450               DIP("fisubrl %s\n", dis_buf);
4451               fop = Iop_SubF64;
4452               goto do_foprev_m32;
4453
4454            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
4455               DIP("fidivl %s\n", dis_buf);
4456               fop = Iop_DivF64;
4457               goto do_fop_m32;
4458
4459            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
4460               DIP("fidivrl %s\n", dis_buf);
4461               fop = Iop_DivF64;
4462               goto do_foprev_m32;
4463
4464            do_fop_m32:
4465               put_ST_UNCHECKED(0,
4466                  triop(fop,
4467                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4468                        get_ST(0),
4469                        unop(Iop_I32StoF64,
4470                             loadLE(Ity_I32, mkexpr(addr)))));
4471               break;
4472
4473            do_foprev_m32:
4474               put_ST_UNCHECKED(0,
4475                  triop(fop,
4476                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4477                        unop(Iop_I32StoF64,
4478                             loadLE(Ity_I32, mkexpr(addr))),
4479                        get_ST(0)));
4480               break;
4481
4482            default:
4483               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4484               vex_printf("first_opcode == 0xDA\n");
4485               goto decode_fail;
4486         }
4487
4488      } else {
4489
4490         delta++;
4491         switch (modrm) {
4492
4493            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
4494               r_src = (UInt)modrm - 0xC0;
4495               DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
4496               put_ST_UNCHECKED(0,
4497                                IRExpr_Mux0X(
4498                                    unop(Iop_1Uto8,
4499                                         mk_x86g_calculate_condition(X86CondB)),
4500                                    get_ST(0), get_ST(r_src)) );
4501               break;
4502
4503            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
4504               r_src = (UInt)modrm - 0xC8;
4505               DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
4506               put_ST_UNCHECKED(0,
4507                                IRExpr_Mux0X(
4508                                    unop(Iop_1Uto8,
4509                                         mk_x86g_calculate_condition(X86CondZ)),
4510                                    get_ST(0), get_ST(r_src)) );
4511               break;
4512
4513            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
4514               r_src = (UInt)modrm - 0xD0;
4515               DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
4516               put_ST_UNCHECKED(0,
4517                                IRExpr_Mux0X(
4518                                    unop(Iop_1Uto8,
4519                                         mk_x86g_calculate_condition(X86CondBE)),
4520                                    get_ST(0), get_ST(r_src)) );
4521               break;
4522
4523            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
4524               r_src = (UInt)modrm - 0xD8;
4525               DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
4526               put_ST_UNCHECKED(0,
4527                                IRExpr_Mux0X(
4528                                    unop(Iop_1Uto8,
4529                                         mk_x86g_calculate_condition(X86CondP)),
4530                                    get_ST(0), get_ST(r_src)) );
4531               break;
4532
4533            case 0xE9: /* FUCOMPP %st(0),%st(1) */
4534               DIP("fucompp %%st(0),%%st(1)\n");
4535               /* This forces C1 to zero, which isn't right. */
4536               put_C3210(
4537                   binop( Iop_And32,
4538                          binop(Iop_Shl32,
4539                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
4540                                mkU8(8)),
4541                          mkU32(0x4500)
4542                   ));
4543               fp_pop();
4544               fp_pop();
4545               break;
4546
4547            default:
4548               goto decode_fail;
4549         }
4550
4551      }
4552   }
4553
4554   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
4555   else
4556   if (first_opcode == 0xDB) {
4557      if (modrm < 0xC0) {
4558
4559         /* bits 5,4,3 are an opcode extension, and the modRM also
4560            specifies an address. */
4561         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4562         delta += len;
4563
4564         switch (gregOfRM(modrm)) {
4565
4566            case 0: /* FILD m32int */
4567               DIP("fildl %s\n", dis_buf);
4568               fp_push();
4569               put_ST(0, unop(Iop_I32StoF64,
4570                              loadLE(Ity_I32, mkexpr(addr))));
4571               break;
4572
4573            case 1: /* FISTTPL m32 (SSE3) */
4574               DIP("fisttpl %s\n", dis_buf);
4575               storeLE( mkexpr(addr),
4576                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
4577               fp_pop();
4578               break;
4579
4580            case 2: /* FIST m32 */
4581               DIP("fistl %s\n", dis_buf);
4582               storeLE( mkexpr(addr),
4583                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4584               break;
4585
4586            case 3: /* FISTP m32 */
4587               DIP("fistpl %s\n", dis_buf);
4588               storeLE( mkexpr(addr),
4589                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4590               fp_pop();
4591               break;
4592
4593            case 5: { /* FLD extended-real */
4594               /* Uses dirty helper:
4595                     ULong x86g_loadF80le ( UInt )
4596                  addr holds the address.  First, do a dirty call to
4597                  get hold of the data. */
4598               IRTemp   val  = newTemp(Ity_I64);
4599               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
4600
4601               IRDirty* d = unsafeIRDirty_1_N (
4602                               val,
4603                               0/*regparms*/,
4604                               "x86g_dirtyhelper_loadF80le",
4605                               &x86g_dirtyhelper_loadF80le,
4606                               args
4607                            );
4608               /* declare that we're reading memory */
4609               d->mFx   = Ifx_Read;
4610               d->mAddr = mkexpr(addr);
4611               d->mSize = 10;
4612
4613               /* execute the dirty call, dumping the result in val. */
4614               stmt( IRStmt_Dirty(d) );
4615               fp_push();
4616               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
4617
4618               DIP("fldt %s\n", dis_buf);
4619               break;
4620            }
4621
4622            case 7: { /* FSTP extended-real */
4623               /* Uses dirty helper: void x86g_storeF80le ( UInt, ULong ) */
4624               IRExpr** args
4625                  = mkIRExprVec_2( mkexpr(addr),
4626                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
4627
4628               IRDirty* d = unsafeIRDirty_0_N (
4629                               0/*regparms*/,
4630                               "x86g_dirtyhelper_storeF80le",
4631                               &x86g_dirtyhelper_storeF80le,
4632                               args
4633                            );
4634               /* declare we're writing memory */
4635               d->mFx   = Ifx_Write;
4636               d->mAddr = mkexpr(addr);
4637               d->mSize = 10;
4638
4639               /* execute the dirty call. */
4640               stmt( IRStmt_Dirty(d) );
4641               fp_pop();
4642
4643               DIP("fstpt\n %s", dis_buf);
4644               break;
4645            }
4646
4647            default:
4648               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4649               vex_printf("first_opcode == 0xDB\n");
4650               goto decode_fail;
4651         }
4652
4653      } else {
4654
4655         delta++;
4656         switch (modrm) {
4657
4658            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
4659               r_src = (UInt)modrm - 0xC0;
4660               DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
4661               put_ST_UNCHECKED(0,
4662                                IRExpr_Mux0X(
4663                                    unop(Iop_1Uto8,
4664                                         mk_x86g_calculate_condition(X86CondNB)),
4665                                    get_ST(0), get_ST(r_src)) );
4666               break;
4667
4668            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
4669               r_src = (UInt)modrm - 0xC8;
4670               DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
4671               put_ST_UNCHECKED(0,
4672                                IRExpr_Mux0X(
4673                                    unop(Iop_1Uto8,
4674                                         mk_x86g_calculate_condition(X86CondNZ)),
4675                                    get_ST(0), get_ST(r_src)) );
4676               break;
4677
4678            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
4679               r_src = (UInt)modrm - 0xD0;
4680               DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
4681               put_ST_UNCHECKED(0,
4682                                IRExpr_Mux0X(
4683                                    unop(Iop_1Uto8,
4684                                         mk_x86g_calculate_condition(X86CondNBE)),
4685                                    get_ST(0), get_ST(r_src)) );
4686               break;
4687
4688            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
4689               r_src = (UInt)modrm - 0xD8;
4690               DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
4691               put_ST_UNCHECKED(0,
4692                                IRExpr_Mux0X(
4693                                    unop(Iop_1Uto8,
4694                                         mk_x86g_calculate_condition(X86CondNP)),
4695                                    get_ST(0), get_ST(r_src)) );
4696               break;
4697
4698            case 0xE2:
4699               DIP("fnclex\n");
4700               break;
4701
4702            case 0xE3: {
4703               /* Uses dirty helper:
4704                     void x86g_do_FINIT ( VexGuestX86State* ) */
4705               IRDirty* d  = unsafeIRDirty_0_N (
4706                                0/*regparms*/,
4707                                "x86g_dirtyhelper_FINIT",
4708                                &x86g_dirtyhelper_FINIT,
4709                                mkIRExprVec_0()
4710                             );
4711               d->needsBBP = True;
4712
4713               /* declare we're writing guest state */
4714               d->nFxState = 5;
4715
4716               d->fxState[0].fx     = Ifx_Write;
4717               d->fxState[0].offset = OFFB_FTOP;
4718               d->fxState[0].size   = sizeof(UInt);
4719
4720               d->fxState[1].fx     = Ifx_Write;
4721               d->fxState[1].offset = OFFB_FPREGS;
4722               d->fxState[1].size   = 8 * sizeof(ULong);
4723
4724               d->fxState[2].fx     = Ifx_Write;
4725               d->fxState[2].offset = OFFB_FPTAGS;
4726               d->fxState[2].size   = 8 * sizeof(UChar);
4727
4728               d->fxState[3].fx     = Ifx_Write;
4729               d->fxState[3].offset = OFFB_FPROUND;
4730               d->fxState[3].size   = sizeof(UInt);
4731
4732               d->fxState[4].fx     = Ifx_Write;
4733               d->fxState[4].offset = OFFB_FC3210;
4734               d->fxState[4].size   = sizeof(UInt);
4735
4736               stmt( IRStmt_Dirty(d) );
4737
4738               DIP("fninit\n");
4739               break;
4740            }
4741
4742            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
4743               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
4744               break;
4745
4746            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
4747               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
4748               break;
4749
4750            default:
4751               goto decode_fail;
4752         }
4753      }
4754   }
4755
4756   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
4757   else
4758   if (first_opcode == 0xDC) {
4759      if (modrm < 0xC0) {
4760
4761         /* bits 5,4,3 are an opcode extension, and the modRM also
4762            specifies an address. */
4763         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4764         delta += len;
4765
4766         switch (gregOfRM(modrm)) {
4767
4768            case 0: /* FADD double-real */
4769               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
4770               break;
4771
4772            case 1: /* FMUL double-real */
4773               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
4774               break;
4775
4776            case 2: /* FCOM double-real */
4777               DIP("fcoml %s\n", dis_buf);
4778               /* This forces C1 to zero, which isn't right. */
4779               put_C3210(
4780                   binop( Iop_And32,
4781                          binop(Iop_Shl32,
4782                                binop(Iop_CmpF64,
4783                                      get_ST(0),
4784                                      loadLE(Ity_F64,mkexpr(addr))),
4785                                mkU8(8)),
4786                          mkU32(0x4500)
4787                   ));
4788               break;
4789
4790            case 3: /* FCOMP double-real */
4791               DIP("fcompl %s\n", dis_buf);
4792               /* This forces C1 to zero, which isn't right. */
4793               put_C3210(
4794                   binop( Iop_And32,
4795                          binop(Iop_Shl32,
4796                                binop(Iop_CmpF64,
4797                                      get_ST(0),
4798                                      loadLE(Ity_F64,mkexpr(addr))),
4799                                mkU8(8)),
4800                          mkU32(0x4500)
4801                   ));
4802               fp_pop();
4803               break;
4804
4805            case 4: /* FSUB double-real */
4806               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
4807               break;
4808
4809            case 5: /* FSUBR double-real */
4810               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
4811               break;
4812
4813            case 6: /* FDIV double-real */
4814               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
4815               break;
4816
4817            case 7: /* FDIVR double-real */
4818               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
4819               break;
4820
4821            default:
4822               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4823               vex_printf("first_opcode == 0xDC\n");
4824               goto decode_fail;
4825         }
4826
4827      } else {
4828
4829         delta++;
4830         switch (modrm) {
4831
4832            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
4833               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
4834               break;
4835
4836            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
4837               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
4838               break;
4839
4840            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
4841               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
4842               break;
4843
4844            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
4845               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
4846               break;
4847
4848            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
4849               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
4850               break;
4851
4852            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
4853               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
4854               break;
4855
4856            default:
4857               goto decode_fail;
4858         }
4859
4860      }
4861   }
4862
4863   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
4864   else
4865   if (first_opcode == 0xDD) {
4866
4867      if (modrm < 0xC0) {
4868
4869         /* bits 5,4,3 are an opcode extension, and the modRM also
4870            specifies an address. */
4871         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4872         delta += len;
4873
4874         switch (gregOfRM(modrm)) {
4875
4876            case 0: /* FLD double-real */
4877               DIP("fldl %s\n", dis_buf);
4878               fp_push();
4879               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
4880               break;
4881
4882            case 1: /* FISTTPQ m64 (SSE3) */
4883               DIP("fistppll %s\n", dis_buf);
4884               storeLE( mkexpr(addr),
4885                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
4886               fp_pop();
4887               break;
4888
4889            case 2: /* FST double-real */
4890               DIP("fstl %s\n", dis_buf);
4891               storeLE(mkexpr(addr), get_ST(0));
4892               break;
4893
4894            case 3: /* FSTP double-real */
4895               DIP("fstpl %s\n", dis_buf);
4896               storeLE(mkexpr(addr), get_ST(0));
4897               fp_pop();
4898               break;
4899
4900            case 4: { /* FRSTOR m108 */
4901               /* Uses dirty helper:
4902                     VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
4903               IRTemp   ew = newTemp(Ity_I32);
4904               IRDirty* d  = unsafeIRDirty_0_N (
4905                                0/*regparms*/,
4906                                "x86g_dirtyhelper_FRSTOR",
4907                                &x86g_dirtyhelper_FRSTOR,
4908                                mkIRExprVec_1( mkexpr(addr) )
4909                             );
4910               d->needsBBP = True;
4911               d->tmp      = ew;
4912               /* declare we're reading memory */
4913               d->mFx   = Ifx_Read;
4914               d->mAddr = mkexpr(addr);
4915               d->mSize = 108;
4916
4917               /* declare we're writing guest state */
4918               d->nFxState = 5;
4919
4920               d->fxState[0].fx     = Ifx_Write;
4921               d->fxState[0].offset = OFFB_FTOP;
4922               d->fxState[0].size   = sizeof(UInt);
4923
4924               d->fxState[1].fx     = Ifx_Write;
4925               d->fxState[1].offset = OFFB_FPREGS;
4926               d->fxState[1].size   = 8 * sizeof(ULong);
4927
4928               d->fxState[2].fx     = Ifx_Write;
4929               d->fxState[2].offset = OFFB_FPTAGS;
4930               d->fxState[2].size   = 8 * sizeof(UChar);
4931
4932               d->fxState[3].fx     = Ifx_Write;
4933               d->fxState[3].offset = OFFB_FPROUND;
4934               d->fxState[3].size   = sizeof(UInt);
4935
4936               d->fxState[4].fx     = Ifx_Write;
4937               d->fxState[4].offset = OFFB_FC3210;
4938               d->fxState[4].size   = sizeof(UInt);
4939
4940               stmt( IRStmt_Dirty(d) );
4941
4942               /* ew contains any emulation warning we may need to
4943                  issue.  If needed, side-exit to the next insn,
4944                  reporting the warning, so that Valgrind's dispatcher
4945                  sees the warning. */
4946               put_emwarn( mkexpr(ew) );
4947               stmt(
4948                  IRStmt_Exit(
4949                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4950                     Ijk_EmWarn,
4951                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
4952                  )
4953               );
4954
4955               DIP("frstor %s\n", dis_buf);
4956               break;
4957            }
4958
4959            case 6: { /* FNSAVE m108 */
4960               /* Uses dirty helper:
4961                     void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
4962               IRDirty* d = unsafeIRDirty_0_N (
4963                               0/*regparms*/,
4964                               "x86g_dirtyhelper_FSAVE",
4965                               &x86g_dirtyhelper_FSAVE,
4966                               mkIRExprVec_1( mkexpr(addr) )
4967                            );
4968               d->needsBBP = True;
4969               /* declare we're writing memory */
4970               d->mFx   = Ifx_Write;
4971               d->mAddr = mkexpr(addr);
4972               d->mSize = 108;
4973
4974               /* declare we're reading guest state */
4975               d->nFxState = 5;
4976
4977               d->fxState[0].fx     = Ifx_Read;
4978               d->fxState[0].offset = OFFB_FTOP;
4979               d->fxState[0].size   = sizeof(UInt);
4980
4981               d->fxState[1].fx     = Ifx_Read;
4982               d->fxState[1].offset = OFFB_FPREGS;
4983               d->fxState[1].size   = 8 * sizeof(ULong);
4984
4985               d->fxState[2].fx     = Ifx_Read;
4986               d->fxState[2].offset = OFFB_FPTAGS;
4987               d->fxState[2].size   = 8 * sizeof(UChar);
4988
4989               d->fxState[3].fx     = Ifx_Read;
4990               d->fxState[3].offset = OFFB_FPROUND;
4991               d->fxState[3].size   = sizeof(UInt);
4992
4993               d->fxState[4].fx     = Ifx_Read;
4994               d->fxState[4].offset = OFFB_FC3210;
4995               d->fxState[4].size   = sizeof(UInt);
4996
4997               stmt( IRStmt_Dirty(d) );
4998
4999               DIP("fnsave %s\n", dis_buf);
5000               break;
5001            }
5002
5003            case 7: { /* FNSTSW m16 */
5004               IRExpr* sw = get_FPU_sw();
5005               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
5006               storeLE( mkexpr(addr), sw );
5007               DIP("fnstsw %s\n", dis_buf);
5008               break;
5009            }
5010
5011            default:
5012               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5013               vex_printf("first_opcode == 0xDD\n");
5014               goto decode_fail;
5015         }
5016      } else {
5017         delta++;
5018         switch (modrm) {
5019
5020            case 0xC0 ... 0xC7: /* FFREE %st(?) */
5021               r_dst = (UInt)modrm - 0xC0;
5022               DIP("ffree %%st(%d)\n", (Int)r_dst);
5023               put_ST_TAG ( r_dst, mkU8(0) );
5024               break;
5025
5026            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
5027               r_dst = (UInt)modrm - 0xD0;
5028               DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
5029               /* P4 manual says: "If the destination operand is a
5030                  non-empty register, the invalid-operation exception
5031                  is not generated.  Hence put_ST_UNCHECKED. */
5032               put_ST_UNCHECKED(r_dst, get_ST(0));
5033               break;
5034
5035            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
5036               r_dst = (UInt)modrm - 0xD8;
5037               DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
5038               /* P4 manual says: "If the destination operand is a
5039                  non-empty register, the invalid-operation exception
5040                  is not generated.  Hence put_ST_UNCHECKED. */
5041               put_ST_UNCHECKED(r_dst, get_ST(0));
5042               fp_pop();
5043               break;
5044
5045            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
5046               r_dst = (UInt)modrm - 0xE0;
5047               DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
5048               /* This forces C1 to zero, which isn't right. */
5049               put_C3210(
5050                   binop( Iop_And32,
5051                          binop(Iop_Shl32,
5052                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5053                                mkU8(8)),
5054                          mkU32(0x4500)
5055                   ));
5056               break;
5057
5058            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
5059               r_dst = (UInt)modrm - 0xE8;
5060               DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
5061               /* This forces C1 to zero, which isn't right. */
5062               put_C3210(
5063                   binop( Iop_And32,
5064                          binop(Iop_Shl32,
5065                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5066                                mkU8(8)),
5067                          mkU32(0x4500)
5068                   ));
5069               fp_pop();
5070               break;
5071
5072            default:
5073               goto decode_fail;
5074         }
5075      }
5076   }
5077
5078   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
5079   else
5080   if (first_opcode == 0xDE) {
5081
5082      if (modrm < 0xC0) {
5083
5084         /* bits 5,4,3 are an opcode extension, and the modRM also
5085            specifies an address. */
5086         IROp   fop;
5087         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5088         delta += len;
5089
5090         switch (gregOfRM(modrm)) {
5091
5092            case 0: /* FIADD m16int */ /* ST(0) += m16int */
5093               DIP("fiaddw %s\n", dis_buf);
5094               fop = Iop_AddF64;
5095               goto do_fop_m16;
5096
5097            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
5098               DIP("fimulw %s\n", dis_buf);
5099               fop = Iop_MulF64;
5100               goto do_fop_m16;
5101
5102            case 2: /* FICOM m16int */
5103               DIP("ficomw %s\n", dis_buf);
5104               /* This forces C1 to zero, which isn't right. */
5105               put_C3210(
5106                   binop( Iop_And32,
5107                          binop(Iop_Shl32,
5108                                binop(Iop_CmpF64,
5109                                      get_ST(0),
5110                                      unop(Iop_I32StoF64,
5111                                         unop(Iop_16Sto32,
5112                                           loadLE(Ity_I16,mkexpr(addr))))),
5113                                mkU8(8)),
5114                          mkU32(0x4500)
5115                   ));
5116               break;
5117
5118            case 3: /* FICOMP m16int */
5119               DIP("ficompw %s\n", dis_buf);
5120               /* This forces C1 to zero, which isn't right. */
5121               put_C3210(
5122                   binop( Iop_And32,
5123                          binop(Iop_Shl32,
5124                                binop(Iop_CmpF64,
5125                                      get_ST(0),
5126                                      unop(Iop_I32StoF64,
5127                                         unop(Iop_16Sto32,
5128                                              loadLE(Ity_I16,mkexpr(addr))))),
5129                                mkU8(8)),
5130                          mkU32(0x4500)
5131                   ));
5132               fp_pop();
5133               break;
5134
5135            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
5136               DIP("fisubw %s\n", dis_buf);
5137               fop = Iop_SubF64;
5138               goto do_fop_m16;
5139
5140            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
5141               DIP("fisubrw %s\n", dis_buf);
5142               fop = Iop_SubF64;
5143               goto do_foprev_m16;
5144
5145            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
5146               DIP("fisubw %s\n", dis_buf);
5147               fop = Iop_DivF64;
5148               goto do_fop_m16;
5149
5150            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
5151               DIP("fidivrw %s\n", dis_buf);
5152               fop = Iop_DivF64;
5153               goto do_foprev_m16;
5154
5155            do_fop_m16:
5156               put_ST_UNCHECKED(0,
5157                  triop(fop,
5158                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5159                        get_ST(0),
5160                        unop(Iop_I32StoF64,
5161                             unop(Iop_16Sto32,
5162                                  loadLE(Ity_I16, mkexpr(addr))))));
5163               break;
5164
5165            do_foprev_m16:
5166               put_ST_UNCHECKED(0,
5167                  triop(fop,
5168                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5169                        unop(Iop_I32StoF64,
5170                             unop(Iop_16Sto32,
5171                                  loadLE(Ity_I16, mkexpr(addr)))),
5172                        get_ST(0)));
5173               break;
5174
5175            default:
5176               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5177               vex_printf("first_opcode == 0xDE\n");
5178               goto decode_fail;
5179         }
5180
5181      } else {
5182
5183         delta++;
5184         switch (modrm) {
5185
5186            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
5187               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
5188               break;
5189
5190            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
5191               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
5192               break;
5193
5194            case 0xD9: /* FCOMPP %st(0),%st(1) */
5195               DIP("fuompp %%st(0),%%st(1)\n");
5196               /* This forces C1 to zero, which isn't right. */
5197               put_C3210(
5198                   binop( Iop_And32,
5199                          binop(Iop_Shl32,
5200                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
5201                                mkU8(8)),
5202                          mkU32(0x4500)
5203                   ));
5204               fp_pop();
5205               fp_pop();
5206               break;
5207
5208            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
5209               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
5210               break;
5211
5212            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
5213               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
5214               break;
5215
5216            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
5217               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
5218               break;
5219
5220            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
5221               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
5222               break;
5223
5224            default:
5225               goto decode_fail;
5226         }
5227
5228      }
5229   }
5230
5231   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
5232   else
5233   if (first_opcode == 0xDF) {
5234
5235      if (modrm < 0xC0) {
5236
5237         /* bits 5,4,3 are an opcode extension, and the modRM also
5238            specifies an address. */
5239         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5240         delta += len;
5241
5242         switch (gregOfRM(modrm)) {
5243
5244            case 0: /* FILD m16int */
5245               DIP("fildw %s\n", dis_buf);
5246               fp_push();
5247               put_ST(0, unop(Iop_I32StoF64,
5248                              unop(Iop_16Sto32,
5249                                   loadLE(Ity_I16, mkexpr(addr)))));
5250               break;
5251
5252            case 1: /* FISTTPS m16 (SSE3) */
5253               DIP("fisttps %s\n", dis_buf);
5254               storeLE( mkexpr(addr),
5255                        binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
5256               fp_pop();
5257               break;
5258
5259            case 2: /* FIST m16 */
5260               DIP("fistp %s\n", dis_buf);
5261               storeLE( mkexpr(addr),
5262                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5263               break;
5264
5265            case 3: /* FISTP m16 */
5266               DIP("fistps %s\n", dis_buf);
5267               storeLE( mkexpr(addr),
5268                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5269               fp_pop();
5270               break;
5271
5272            case 5: /* FILD m64 */
5273               DIP("fildll %s\n", dis_buf);
5274               fp_push();
5275               put_ST(0, binop(Iop_I64StoF64,
5276                               get_roundingmode(),
5277                               loadLE(Ity_I64, mkexpr(addr))));
5278               break;
5279
5280            case 7: /* FISTP m64 */
5281               DIP("fistpll %s\n", dis_buf);
5282               storeLE( mkexpr(addr),
5283                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
5284               fp_pop();
5285               break;
5286
5287            default:
5288               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5289               vex_printf("first_opcode == 0xDF\n");
5290               goto decode_fail;
5291         }
5292
5293      } else {
5294
5295         delta++;
5296         switch (modrm) {
5297
5298            case 0xC0: /* FFREEP %st(0) */
5299               DIP("ffreep %%st(%d)\n", 0);
5300               put_ST_TAG ( 0, mkU8(0) );
5301               fp_pop();
5302               break;
5303
5304            case 0xE0: /* FNSTSW %ax */
5305               DIP("fnstsw %%ax\n");
5306               /* Get the FPU status word value and dump it in %AX. */
5307               if (0) {
5308                  /* The obvious thing to do is simply dump the 16-bit
5309                     status word value in %AX.  However, due to a
5310                     limitation in Memcheck's origin tracking
5311                     machinery, this causes Memcheck not to track the
5312                     origin of any undefinedness into %AH (only into
5313                     %AL/%AX/%EAX), which means origins are lost in
5314                     the sequence "fnstsw %ax; test $M,%ah; jcond .." */
5315                  putIReg(2, R_EAX, get_FPU_sw());
5316               } else {
5317                  /* So a somewhat lame kludge is to make it very
5318                     clear to Memcheck that the value is written to
5319                     both %AH and %AL.  This generates marginally
5320                     worse code, but I don't think it matters much. */
5321                  IRTemp t16 = newTemp(Ity_I16);
5322                  assign(t16, get_FPU_sw());
5323                  putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
5324                  putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
5325               }
5326               break;
5327
5328            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
5329               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
5330               break;
5331
5332            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
5333               /* not really right since COMIP != UCOMIP */
5334               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
5335               break;
5336
5337            default:
5338               goto decode_fail;
5339         }
5340      }
5341
5342   }
5343
5344   else
5345   vpanic("dis_FPU(x86): invalid primary opcode");
5346
5347   *decode_ok = True;
5348   return delta;
5349
5350  decode_fail:
5351   *decode_ok = False;
5352   return delta;
5353}
5354
5355
5356/*------------------------------------------------------------*/
5357/*---                                                      ---*/
5358/*--- MMX INSTRUCTIONS                                     ---*/
5359/*---                                                      ---*/
5360/*------------------------------------------------------------*/
5361
5362/* Effect of MMX insns on x87 FPU state (table 11-2 of
5363   IA32 arch manual, volume 3):
5364
5365   Read from, or write to MMX register (viz, any insn except EMMS):
5366   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
5367   * FP stack pointer set to zero
5368
5369   EMMS:
5370   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
5371   * FP stack pointer set to zero
5372*/
5373
5374static void do_MMX_preamble ( void )
5375{
5376   Int         i;
5377   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5378   IRExpr*     zero  = mkU32(0);
5379   IRExpr*     tag1  = mkU8(1);
5380   put_ftop(zero);
5381   for (i = 0; i < 8; i++)
5382      stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
5383}
5384
5385static void do_EMMS_preamble ( void )
5386{
5387   Int         i;
5388   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5389   IRExpr*     zero  = mkU32(0);
5390   IRExpr*     tag0  = mkU8(0);
5391   put_ftop(zero);
5392   for (i = 0; i < 8; i++)
5393      stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
5394}
5395
5396
5397static IRExpr* getMMXReg ( UInt archreg )
5398{
5399   vassert(archreg < 8);
5400   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
5401}
5402
5403
5404static void putMMXReg ( UInt archreg, IRExpr* e )
5405{
5406   vassert(archreg < 8);
5407   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
5408   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
5409}
5410
5411
5412/* Helper for non-shift MMX insns.  Note this is incomplete in the
5413   sense that it does not first call do_MMX_preamble() -- that is the
5414   responsibility of its caller. */
5415
5416static
5417UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
5418                               Int    delta,
5419                               UChar  opc,
5420                               HChar* name,
5421                               Bool   show_granularity )
5422{
5423   HChar   dis_buf[50];
5424   UChar   modrm = getIByte(delta);
5425   Bool    isReg = epartIsReg(modrm);
5426   IRExpr* argL  = NULL;
5427   IRExpr* argR  = NULL;
5428   IRExpr* argG  = NULL;
5429   IRExpr* argE  = NULL;
5430   IRTemp  res   = newTemp(Ity_I64);
5431
5432   Bool    invG  = False;
5433   IROp    op    = Iop_INVALID;
5434   void*   hAddr = NULL;
5435   HChar*  hName = NULL;
5436   Bool    eLeft = False;
5437
5438#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
5439
5440   switch (opc) {
5441      /* Original MMX ones */
5442      case 0xFC: op = Iop_Add8x8; break;
5443      case 0xFD: op = Iop_Add16x4; break;
5444      case 0xFE: op = Iop_Add32x2; break;
5445
5446      case 0xEC: op = Iop_QAdd8Sx8; break;
5447      case 0xED: op = Iop_QAdd16Sx4; break;
5448
5449      case 0xDC: op = Iop_QAdd8Ux8; break;
5450      case 0xDD: op = Iop_QAdd16Ux4; break;
5451
5452      case 0xF8: op = Iop_Sub8x8;  break;
5453      case 0xF9: op = Iop_Sub16x4; break;
5454      case 0xFA: op = Iop_Sub32x2; break;
5455
5456      case 0xE8: op = Iop_QSub8Sx8; break;
5457      case 0xE9: op = Iop_QSub16Sx4; break;
5458
5459      case 0xD8: op = Iop_QSub8Ux8; break;
5460      case 0xD9: op = Iop_QSub16Ux4; break;
5461
5462      case 0xE5: op = Iop_MulHi16Sx4; break;
5463      case 0xD5: op = Iop_Mul16x4; break;
5464      case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
5465
5466      case 0x74: op = Iop_CmpEQ8x8; break;
5467      case 0x75: op = Iop_CmpEQ16x4; break;
5468      case 0x76: op = Iop_CmpEQ32x2; break;
5469
5470      case 0x64: op = Iop_CmpGT8Sx8; break;
5471      case 0x65: op = Iop_CmpGT16Sx4; break;
5472      case 0x66: op = Iop_CmpGT32Sx2; break;
5473
5474      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
5475      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
5476      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
5477
5478      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
5479      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
5480      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
5481
5482      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
5483      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
5484      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
5485
5486      case 0xDB: op = Iop_And64; break;
5487      case 0xDF: op = Iop_And64; invG = True; break;
5488      case 0xEB: op = Iop_Or64; break;
5489      case 0xEF: /* Possibly do better here if argL and argR are the
5490                    same reg */
5491                 op = Iop_Xor64; break;
5492
5493      /* Introduced in SSE1 */
5494      case 0xE0: op = Iop_Avg8Ux8;    break;
5495      case 0xE3: op = Iop_Avg16Ux4;   break;
5496      case 0xEE: op = Iop_Max16Sx4;   break;
5497      case 0xDE: op = Iop_Max8Ux8;    break;
5498      case 0xEA: op = Iop_Min16Sx4;   break;
5499      case 0xDA: op = Iop_Min8Ux8;    break;
5500      case 0xE4: op = Iop_MulHi16Ux4; break;
5501      case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
5502
5503      /* Introduced in SSE2 */
5504      case 0xD4: op = Iop_Add64; break;
5505      case 0xFB: op = Iop_Sub64; break;
5506
5507      default:
5508         vex_printf("\n0x%x\n", (Int)opc);
5509         vpanic("dis_MMXop_regmem_to_reg");
5510   }
5511
5512#  undef XXX
5513
5514   argG = getMMXReg(gregOfRM(modrm));
5515   if (invG)
5516      argG = unop(Iop_Not64, argG);
5517
5518   if (isReg) {
5519      delta++;
5520      argE = getMMXReg(eregOfRM(modrm));
5521   } else {
5522      Int    len;
5523      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5524      delta += len;
5525      argE = loadLE(Ity_I64, mkexpr(addr));
5526   }
5527
5528   if (eLeft) {
5529      argL = argE;
5530      argR = argG;
5531   } else {
5532      argL = argG;
5533      argR = argE;
5534   }
5535
5536   if (op != Iop_INVALID) {
5537      vassert(hName == NULL);
5538      vassert(hAddr == NULL);
5539      assign(res, binop(op, argL, argR));
5540   } else {
5541      vassert(hName != NULL);
5542      vassert(hAddr != NULL);
5543      assign( res,
5544              mkIRExprCCall(
5545                 Ity_I64,
5546                 0/*regparms*/, hName, hAddr,
5547                 mkIRExprVec_2( argL, argR )
5548              )
5549            );
5550   }
5551
5552   putMMXReg( gregOfRM(modrm), mkexpr(res) );
5553
5554   DIP("%s%s %s, %s\n",
5555       name, show_granularity ? nameMMXGran(opc & 3) : "",
5556       ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
5557       nameMMXReg(gregOfRM(modrm)) );
5558
5559   return delta;
5560}
5561
5562
5563/* Vector by scalar shift of G by the amount specified at the bottom
5564   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
5565
5566static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
5567                                 HChar* opname, IROp op )
5568{
5569   HChar   dis_buf[50];
5570   Int     alen, size;
5571   IRTemp  addr;
5572   Bool    shl, shr, sar;
5573   UChar   rm   = getIByte(delta);
5574   IRTemp  g0   = newTemp(Ity_I64);
5575   IRTemp  g1   = newTemp(Ity_I64);
5576   IRTemp  amt  = newTemp(Ity_I32);
5577   IRTemp  amt8 = newTemp(Ity_I8);
5578
5579   if (epartIsReg(rm)) {
5580      assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
5581      DIP("%s %s,%s\n", opname,
5582                        nameMMXReg(eregOfRM(rm)),
5583                        nameMMXReg(gregOfRM(rm)) );
5584      delta++;
5585   } else {
5586      addr = disAMode ( &alen, sorb, delta, dis_buf );
5587      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
5588      DIP("%s %s,%s\n", opname,
5589                        dis_buf,
5590                        nameMMXReg(gregOfRM(rm)) );
5591      delta += alen;
5592   }
5593   assign( g0,   getMMXReg(gregOfRM(rm)) );
5594   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
5595
5596   shl = shr = sar = False;
5597   size = 0;
5598   switch (op) {
5599      case Iop_ShlN16x4: shl = True; size = 32; break;
5600      case Iop_ShlN32x2: shl = True; size = 32; break;
5601      case Iop_Shl64:    shl = True; size = 64; break;
5602      case Iop_ShrN16x4: shr = True; size = 16; break;
5603      case Iop_ShrN32x2: shr = True; size = 32; break;
5604      case Iop_Shr64:    shr = True; size = 64; break;
5605      case Iop_SarN16x4: sar = True; size = 16; break;
5606      case Iop_SarN32x2: sar = True; size = 32; break;
5607      default: vassert(0);
5608   }
5609
5610   if (shl || shr) {
5611     assign(
5612        g1,
5613        IRExpr_Mux0X(
5614           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
5615           mkU64(0),
5616           binop(op, mkexpr(g0), mkexpr(amt8))
5617        )
5618     );
5619   } else
5620   if (sar) {
5621     assign(
5622        g1,
5623        IRExpr_Mux0X(
5624           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
5625           binop(op, mkexpr(g0), mkU8(size-1)),
5626           binop(op, mkexpr(g0), mkexpr(amt8))
5627        )
5628     );
5629   } else {
5630      /*NOTREACHED*/
5631      vassert(0);
5632   }
5633
5634   putMMXReg( gregOfRM(rm), mkexpr(g1) );
5635   return delta;
5636}
5637
5638
5639/* Vector by scalar shift of E by an immediate byte.  This is a
5640   straight copy of dis_SSE_shiftE_imm. */
5641
5642static
5643UInt dis_MMX_shiftE_imm ( Int delta, HChar* opname, IROp op )
5644{
5645   Bool    shl, shr, sar;
5646   UChar   rm   = getIByte(delta);
5647   IRTemp  e0   = newTemp(Ity_I64);
5648   IRTemp  e1   = newTemp(Ity_I64);
5649   UChar   amt, size;
5650   vassert(epartIsReg(rm));
5651   vassert(gregOfRM(rm) == 2
5652           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
5653   amt = getIByte(delta+1);
5654   delta += 2;
5655   DIP("%s $%d,%s\n", opname,
5656                      (Int)amt,
5657                      nameMMXReg(eregOfRM(rm)) );
5658
5659   assign( e0, getMMXReg(eregOfRM(rm)) );
5660
5661   shl = shr = sar = False;
5662   size = 0;
5663   switch (op) {
5664      case Iop_ShlN16x4: shl = True; size = 16; break;
5665      case Iop_ShlN32x2: shl = True; size = 32; break;
5666      case Iop_Shl64:    shl = True; size = 64; break;
5667      case Iop_SarN16x4: sar = True; size = 16; break;
5668      case Iop_SarN32x2: sar = True; size = 32; break;
5669      case Iop_ShrN16x4: shr = True; size = 16; break;
5670      case Iop_ShrN32x2: shr = True; size = 32; break;
5671      case Iop_Shr64:    shr = True; size = 64; break;
5672      default: vassert(0);
5673   }
5674
5675   if (shl || shr) {
5676      assign( e1, amt >= size
5677                     ? mkU64(0)
5678                     : binop(op, mkexpr(e0), mkU8(amt))
5679      );
5680   } else
5681   if (sar) {
5682      assign( e1, amt >= size
5683                     ? binop(op, mkexpr(e0), mkU8(size-1))
5684                     : binop(op, mkexpr(e0), mkU8(amt))
5685      );
5686   } else {
5687      /*NOTREACHED*/
5688      vassert(0);
5689   }
5690
5691   putMMXReg( eregOfRM(rm), mkexpr(e1) );
5692   return delta;
5693}
5694
5695
5696/* Completely handle all MMX instructions except emms. */
5697
5698static
5699UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
5700{
5701   Int   len;
5702   UChar modrm;
5703   HChar dis_buf[50];
5704   UChar opc = getIByte(delta);
5705   delta++;
5706
5707   /* dis_MMX handles all insns except emms. */
5708   do_MMX_preamble();
5709
5710   switch (opc) {
5711
5712      case 0x6E:
5713         /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
5714         if (sz != 4)
5715            goto mmx_decode_failure;
5716         modrm = getIByte(delta);
5717         if (epartIsReg(modrm)) {
5718            delta++;
5719            putMMXReg(
5720               gregOfRM(modrm),
5721               binop( Iop_32HLto64,
5722                      mkU32(0),
5723                      getIReg(4, eregOfRM(modrm)) ) );
5724            DIP("movd %s, %s\n",
5725                nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5726         } else {
5727            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5728            delta += len;
5729            putMMXReg(
5730               gregOfRM(modrm),
5731               binop( Iop_32HLto64,
5732                      mkU32(0),
5733                      loadLE(Ity_I32, mkexpr(addr)) ) );
5734            DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
5735         }
5736         break;
5737
5738      case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
5739         if (sz != 4)
5740            goto mmx_decode_failure;
5741         modrm = getIByte(delta);
5742         if (epartIsReg(modrm)) {
5743            delta++;
5744            putIReg( 4, eregOfRM(modrm),
5745                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5746            DIP("movd %s, %s\n",
5747                nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
5748         } else {
5749            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5750            delta += len;
5751            storeLE( mkexpr(addr),
5752                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5753            DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
5754         }
5755         break;
5756
5757      case 0x6F:
5758         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
5759         if (sz != 4)
5760            goto mmx_decode_failure;
5761         modrm = getIByte(delta);
5762         if (epartIsReg(modrm)) {
5763            delta++;
5764            putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
5765            DIP("movq %s, %s\n",
5766                nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5767         } else {
5768            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5769            delta += len;
5770            putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
5771            DIP("movq %s, %s\n",
5772                dis_buf, nameMMXReg(gregOfRM(modrm)));
5773         }
5774         break;
5775
5776      case 0x7F:
5777         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
5778         if (sz != 4)
5779            goto mmx_decode_failure;
5780         modrm = getIByte(delta);
5781         if (epartIsReg(modrm)) {
5782            delta++;
5783            putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
5784            DIP("movq %s, %s\n",
5785                nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
5786         } else {
5787            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5788            delta += len;
5789            storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
5790            DIP("mov(nt)q %s, %s\n",
5791                nameMMXReg(gregOfRM(modrm)), dis_buf);
5792         }
5793         break;
5794
5795      case 0xFC:
5796      case 0xFD:
5797      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
5798         if (sz != 4)
5799            goto mmx_decode_failure;
5800         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
5801         break;
5802
5803      case 0xEC:
5804      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
5805         if (sz != 4)
5806            goto mmx_decode_failure;
5807         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
5808         break;
5809
5810      case 0xDC:
5811      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5812         if (sz != 4)
5813            goto mmx_decode_failure;
5814         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
5815         break;
5816
5817      case 0xF8:
5818      case 0xF9:
5819      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
5820         if (sz != 4)
5821            goto mmx_decode_failure;
5822         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
5823         break;
5824
5825      case 0xE8:
5826      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
5827         if (sz != 4)
5828            goto mmx_decode_failure;
5829         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
5830         break;
5831
5832      case 0xD8:
5833      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5834         if (sz != 4)
5835            goto mmx_decode_failure;
5836         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
5837         break;
5838
5839      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
5840         if (sz != 4)
5841            goto mmx_decode_failure;
5842         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
5843         break;
5844
5845      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
5846         if (sz != 4)
5847            goto mmx_decode_failure;
5848         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
5849         break;
5850
5851      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
5852         vassert(sz == 4);
5853         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
5854         break;
5855
5856      case 0x74:
5857      case 0x75:
5858      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
5859         if (sz != 4)
5860            goto mmx_decode_failure;
5861         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
5862         break;
5863
5864      case 0x64:
5865      case 0x65:
5866      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
5867         if (sz != 4)
5868            goto mmx_decode_failure;
5869         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
5870         break;
5871
5872      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
5873         if (sz != 4)
5874            goto mmx_decode_failure;
5875         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
5876         break;
5877
5878      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
5879         if (sz != 4)
5880            goto mmx_decode_failure;
5881         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
5882         break;
5883
5884      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
5885         if (sz != 4)
5886            goto mmx_decode_failure;
5887         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
5888         break;
5889
5890      case 0x68:
5891      case 0x69:
5892      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
5893         if (sz != 4)
5894            goto mmx_decode_failure;
5895         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
5896         break;
5897
5898      case 0x60:
5899      case 0x61:
5900      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
5901         if (sz != 4)
5902            goto mmx_decode_failure;
5903         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
5904         break;
5905
5906      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
5907         if (sz != 4)
5908            goto mmx_decode_failure;
5909         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
5910         break;
5911
5912      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
5913         if (sz != 4)
5914            goto mmx_decode_failure;
5915         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
5916         break;
5917
5918      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
5919         if (sz != 4)
5920            goto mmx_decode_failure;
5921         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
5922         break;
5923
5924      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
5925         if (sz != 4)
5926            goto mmx_decode_failure;
5927         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
5928         break;
5929
5930#     define SHIFT_BY_REG(_name,_op)                                 \
5931                delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
5932                break;
5933
5934      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
5935      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
5936      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
5937      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
5938
5939      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
5940      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
5941      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
5942      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
5943
5944      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
5945      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
5946      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
5947
5948#     undef SHIFT_BY_REG
5949
5950      case 0x71:
5951      case 0x72:
5952      case 0x73: {
5953         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
5954         UChar byte2, subopc;
5955         if (sz != 4)
5956            goto mmx_decode_failure;
5957         byte2  = getIByte(delta);           /* amode / sub-opcode */
5958         subopc = toUChar( (byte2 >> 3) & 7 );
5959
5960#        define SHIFT_BY_IMM(_name,_op)                         \
5961             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
5962             } while (0)
5963
5964              if (subopc == 2 /*SRL*/ && opc == 0x71)
5965                 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
5966         else if (subopc == 2 /*SRL*/ && opc == 0x72)
5967                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
5968         else if (subopc == 2 /*SRL*/ && opc == 0x73)
5969                 SHIFT_BY_IMM("psrlq", Iop_Shr64);
5970
5971         else if (subopc == 4 /*SAR*/ && opc == 0x71)
5972                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
5973         else if (subopc == 4 /*SAR*/ && opc == 0x72)
5974                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);
5975
5976         else if (subopc == 6 /*SHL*/ && opc == 0x71)
5977                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
5978         else if (subopc == 6 /*SHL*/ && opc == 0x72)
5979                 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
5980         else if (subopc == 6 /*SHL*/ && opc == 0x73)
5981                 SHIFT_BY_IMM("psllq", Iop_Shl64);
5982
5983         else goto mmx_decode_failure;
5984
5985#        undef SHIFT_BY_IMM
5986         break;
5987      }
5988
5989      case 0xF7: {
5990         IRTemp addr    = newTemp(Ity_I32);
5991         IRTemp regD    = newTemp(Ity_I64);
5992         IRTemp regM    = newTemp(Ity_I64);
5993         IRTemp mask    = newTemp(Ity_I64);
5994         IRTemp olddata = newTemp(Ity_I64);
5995         IRTemp newdata = newTemp(Ity_I64);
5996
5997         modrm = getIByte(delta);
5998         if (sz != 4 || (!epartIsReg(modrm)))
5999            goto mmx_decode_failure;
6000         delta++;
6001
6002         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
6003         assign( regM, getMMXReg( eregOfRM(modrm) ));
6004         assign( regD, getMMXReg( gregOfRM(modrm) ));
6005         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
6006         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
6007         assign( newdata,
6008                 binop(Iop_Or64,
6009                       binop(Iop_And64,
6010                             mkexpr(regD),
6011                             mkexpr(mask) ),
6012                       binop(Iop_And64,
6013                             mkexpr(olddata),
6014                             unop(Iop_Not64, mkexpr(mask)))) );
6015         storeLE( mkexpr(addr), mkexpr(newdata) );
6016         DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
6017                                 nameMMXReg( gregOfRM(modrm) ) );
6018         break;
6019      }
6020
6021      /* --- MMX decode failure --- */
6022      default:
6023      mmx_decode_failure:
6024         *decode_ok = False;
6025         return delta; /* ignored */
6026
6027   }
6028
6029   *decode_ok = True;
6030   return delta;
6031}
6032
6033
6034/*------------------------------------------------------------*/
6035/*--- More misc arithmetic and other obscure insns.        ---*/
6036/*------------------------------------------------------------*/
6037
6038/* Double length left and right shifts.  Apparently only required in
6039   v-size (no b- variant). */
6040static
6041UInt dis_SHLRD_Gv_Ev ( UChar sorb,
6042                       Int delta, UChar modrm,
6043                       Int sz,
6044                       IRExpr* shift_amt,
6045                       Bool amt_is_literal,
6046                       HChar* shift_amt_txt,
6047                       Bool left_shift )
6048{
6049   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
6050      for printing it.   And eip on entry points at the modrm byte. */
6051   Int len;
6052   HChar dis_buf[50];
6053
6054   IRType ty       = szToITy(sz);
6055   IRTemp gsrc     = newTemp(ty);
6056   IRTemp esrc     = newTemp(ty);
6057   IRTemp addr     = IRTemp_INVALID;
6058   IRTemp tmpSH    = newTemp(Ity_I8);
6059   IRTemp tmpL     = IRTemp_INVALID;
6060   IRTemp tmpRes   = IRTemp_INVALID;
6061   IRTemp tmpSubSh = IRTemp_INVALID;
6062   IROp   mkpair;
6063   IROp   getres;
6064   IROp   shift;
6065   IRExpr* mask = NULL;
6066
6067   vassert(sz == 2 || sz == 4);
6068
6069   /* The E-part is the destination; this is shifted.  The G-part
6070      supplies bits to be shifted into the E-part, but is not
6071      changed.
6072
6073      If shifting left, form a double-length word with E at the top
6074      and G at the bottom, and shift this left.  The result is then in
6075      the high part.
6076
6077      If shifting right, form a double-length word with G at the top
6078      and E at the bottom, and shift this right.  The result is then
6079      at the bottom.  */
6080
6081   /* Fetch the operands. */
6082
6083   assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
6084
6085   if (epartIsReg(modrm)) {
6086      delta++;
6087      assign( esrc, getIReg(sz, eregOfRM(modrm)) );
6088      DIP("sh%cd%c %s, %s, %s\n",
6089          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6090          shift_amt_txt,
6091          nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
6092   } else {
6093      addr = disAMode ( &len, sorb, delta, dis_buf );
6094      delta += len;
6095      assign( esrc, loadLE(ty, mkexpr(addr)) );
6096      DIP("sh%cd%c %s, %s, %s\n",
6097          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6098          shift_amt_txt,
6099          nameIReg(sz, gregOfRM(modrm)), dis_buf);
6100   }
6101
6102   /* Round up the relevant primops. */
6103
6104   if (sz == 4) {
6105      tmpL     = newTemp(Ity_I64);
6106      tmpRes   = newTemp(Ity_I32);
6107      tmpSubSh = newTemp(Ity_I32);
6108      mkpair   = Iop_32HLto64;
6109      getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
6110      shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
6111      mask     = mkU8(31);
6112   } else {
6113      /* sz == 2 */
6114      tmpL     = newTemp(Ity_I32);
6115      tmpRes   = newTemp(Ity_I16);
6116      tmpSubSh = newTemp(Ity_I16);
6117      mkpair   = Iop_16HLto32;
6118      getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
6119      shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
6120      mask     = mkU8(15);
6121   }
6122
6123   /* Do the shift, calculate the subshift value, and set
6124      the flag thunk. */
6125
6126   assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
6127
6128   if (left_shift)
6129      assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
6130   else
6131      assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
6132
6133   assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
6134   assign( tmpSubSh,
6135           unop(getres,
6136                binop(shift,
6137                      mkexpr(tmpL),
6138                      binop(Iop_And8,
6139                            binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
6140                            mask))) );
6141
6142   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
6143                              tmpRes, tmpSubSh, ty, tmpSH );
6144
6145   /* Put result back. */
6146
6147   if (epartIsReg(modrm)) {
6148      putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
6149   } else {
6150      storeLE( mkexpr(addr), mkexpr(tmpRes) );
6151   }
6152
6153   if (amt_is_literal) delta++;
6154   return delta;
6155}
6156
6157
6158/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
6159   required. */
6160
6161typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
6162
6163static HChar* nameBtOp ( BtOp op )
6164{
6165   switch (op) {
6166      case BtOpNone:  return "";
6167      case BtOpSet:   return "s";
6168      case BtOpReset: return "r";
6169      case BtOpComp:  return "c";
6170      default: vpanic("nameBtOp(x86)");
6171   }
6172}
6173
6174
6175static
6176UInt dis_bt_G_E ( VexAbiInfo* vbi,
6177                  UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
6178{
6179   HChar  dis_buf[50];
6180   UChar  modrm;
6181   Int    len;
6182   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
6183          t_addr1, t_esp, t_mask, t_new;
6184
6185   vassert(sz == 2 || sz == 4);
6186
6187   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
6188             = t_addr0 = t_addr1 = t_esp
6189             = t_mask = t_new = IRTemp_INVALID;
6190
6191   t_fetched = newTemp(Ity_I8);
6192   t_new     = newTemp(Ity_I8);
6193   t_bitno0  = newTemp(Ity_I32);
6194   t_bitno1  = newTemp(Ity_I32);
6195   t_bitno2  = newTemp(Ity_I8);
6196   t_addr1   = newTemp(Ity_I32);
6197   modrm     = getIByte(delta);
6198
6199   assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
6200
6201   if (epartIsReg(modrm)) {
6202      delta++;
6203      /* Get it onto the client's stack. */
6204      t_esp = newTemp(Ity_I32);
6205      t_addr0 = newTemp(Ity_I32);
6206
6207      /* For the choice of the value 128, see comment in dis_bt_G_E in
6208         guest_amd64_toIR.c.  We point out here only that 128 is
6209         fast-cased in Memcheck and is > 0, so seems like a good
6210         choice. */
6211      vassert(vbi->guest_stack_redzone_size == 0);
6212      assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
6213      putIReg(4, R_ESP, mkexpr(t_esp));
6214
6215      storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
6216
6217      /* Make t_addr0 point at it. */
6218      assign( t_addr0, mkexpr(t_esp) );
6219
6220      /* Mask out upper bits of the shift amount, since we're doing a
6221         reg. */
6222      assign( t_bitno1, binop(Iop_And32,
6223                              mkexpr(t_bitno0),
6224                              mkU32(sz == 4 ? 31 : 15)) );
6225
6226   } else {
6227      t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
6228      delta += len;
6229      assign( t_bitno1, mkexpr(t_bitno0) );
6230   }
6231
6232   /* At this point: t_addr0 is the address being operated on.  If it
6233      was a reg, we will have pushed it onto the client's stack.
6234      t_bitno1 is the bit number, suitably masked in the case of a
6235      reg.  */
6236
6237   /* Now the main sequence. */
6238   assign( t_addr1,
6239           binop(Iop_Add32,
6240                 mkexpr(t_addr0),
6241                 binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
6242
6243   /* t_addr1 now holds effective address */
6244
6245   assign( t_bitno2,
6246           unop(Iop_32to8,
6247                binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
6248
6249   /* t_bitno2 contains offset of bit within byte */
6250
6251   if (op != BtOpNone) {
6252      t_mask = newTemp(Ity_I8);
6253      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
6254   }
6255
6256   /* t_mask is now a suitable byte mask */
6257
6258   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
6259
6260   if (op != BtOpNone) {
6261      switch (op) {
6262         case BtOpSet:
6263            assign( t_new,
6264                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
6265            break;
6266         case BtOpComp:
6267            assign( t_new,
6268                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
6269            break;
6270         case BtOpReset:
6271            assign( t_new,
6272                    binop(Iop_And8, mkexpr(t_fetched),
6273                                    unop(Iop_Not8, mkexpr(t_mask))) );
6274            break;
6275         default:
6276            vpanic("dis_bt_G_E(x86)");
6277      }
6278      if (locked && !epartIsReg(modrm)) {
6279         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
6280                                 mkexpr(t_new)/*new*/,
6281                                 guest_EIP_curr_instr );
6282      } else {
6283         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
6284      }
6285   }
6286
6287   /* Side effect done; now get selected bit into Carry flag */
6288   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
6289   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6290   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6291   stmt( IRStmt_Put(
6292            OFFB_CC_DEP1,
6293            binop(Iop_And32,
6294                  binop(Iop_Shr32,
6295                        unop(Iop_8Uto32, mkexpr(t_fetched)),
6296                        mkexpr(t_bitno2)),
6297                  mkU32(1)))
6298       );
6299   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6300      elimination of previous stores to this field work better. */
6301   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6302
6303   /* Move reg operand from stack back to reg */
6304   if (epartIsReg(modrm)) {
6305      /* t_esp still points at it. */
6306      putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
6307      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
6308   }
6309
6310   DIP("bt%s%c %s, %s\n",
6311       nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
6312       ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
6313
6314   return delta;
6315}
6316
6317
6318
6319/* Handle BSF/BSR.  Only v-size seems necessary. */
6320static
6321UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
6322{
6323   Bool   isReg;
6324   UChar  modrm;
6325   HChar  dis_buf[50];
6326
6327   IRType ty  = szToITy(sz);
6328   IRTemp src = newTemp(ty);
6329   IRTemp dst = newTemp(ty);
6330
6331   IRTemp src32 = newTemp(Ity_I32);
6332   IRTemp dst32 = newTemp(Ity_I32);
6333   IRTemp src8  = newTemp(Ity_I8);
6334
6335   vassert(sz == 4 || sz == 2);
6336
6337   modrm = getIByte(delta);
6338
6339   isReg = epartIsReg(modrm);
6340   if (isReg) {
6341      delta++;
6342      assign( src, getIReg(sz, eregOfRM(modrm)) );
6343   } else {
6344      Int    len;
6345      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6346      delta += len;
6347      assign( src, loadLE(ty, mkexpr(addr)) );
6348   }
6349
6350   DIP("bs%c%c %s, %s\n",
6351       fwds ? 'f' : 'r', nameISize(sz),
6352       ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
6353       nameIReg(sz, gregOfRM(modrm)));
6354
6355   /* Generate an 8-bit expression which is zero iff the
6356      original is zero, and nonzero otherwise */
6357   assign( src8,
6358           unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
6359                           mkexpr(src), mkU(ty,0))) );
6360
6361   /* Flags: Z is 1 iff source value is zero.  All others
6362      are undefined -- we force them to zero. */
6363   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6364   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6365   stmt( IRStmt_Put(
6366            OFFB_CC_DEP1,
6367            IRExpr_Mux0X( mkexpr(src8),
6368                          /* src==0 */
6369                          mkU32(X86G_CC_MASK_Z),
6370                          /* src!=0 */
6371                          mkU32(0)
6372                        )
6373       ));
6374   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6375      elimination of previous stores to this field work better. */
6376   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6377
6378   /* Result: iff source value is zero, we can't use
6379      Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
6380      But anyway, Intel x86 semantics say the result is undefined in
6381      such situations.  Hence handle the zero case specially. */
6382
6383   /* Bleh.  What we compute:
6384
6385          bsf32:  if src == 0 then 0 else  Ctz32(src)
6386          bsr32:  if src == 0 then 0 else  31 - Clz32(src)
6387
6388          bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
6389          bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
6390
6391      First, widen src to 32 bits if it is not already.
6392
6393      Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
6394      dst register unchanged when src == 0.  Hence change accordingly.
6395   */
6396   if (sz == 2)
6397      assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
6398   else
6399      assign( src32, mkexpr(src) );
6400
6401   /* The main computation, guarding against zero. */
6402   assign( dst32,
6403           IRExpr_Mux0X(
6404              mkexpr(src8),
6405              /* src == 0 -- leave dst unchanged */
6406              widenUto32( getIReg( sz, gregOfRM(modrm) ) ),
6407              /* src != 0 */
6408              fwds ? unop(Iop_Ctz32, mkexpr(src32))
6409                   : binop(Iop_Sub32,
6410                           mkU32(31),
6411                           unop(Iop_Clz32, mkexpr(src32)))
6412           )
6413         );
6414
6415   if (sz == 2)
6416      assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
6417   else
6418      assign( dst, mkexpr(dst32) );
6419
6420   /* dump result back */
6421   putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
6422
6423   return delta;
6424}
6425
6426
6427static
6428void codegen_xchg_eAX_Reg ( Int sz, Int reg )
6429{
6430   IRType ty = szToITy(sz);
6431   IRTemp t1 = newTemp(ty);
6432   IRTemp t2 = newTemp(ty);
6433   vassert(sz == 2 || sz == 4);
6434   assign( t1, getIReg(sz, R_EAX) );
6435   assign( t2, getIReg(sz, reg) );
6436   putIReg( sz, R_EAX, mkexpr(t2) );
6437   putIReg( sz, reg, mkexpr(t1) );
6438   DIP("xchg%c %s, %s\n",
6439       nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
6440}
6441
6442
6443static
6444void codegen_SAHF ( void )
6445{
6446   /* Set the flags to:
6447      (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
6448      | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6449                |X86G_CC_MASK_P|X86G_CC_MASK_C)
6450   */
6451   UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6452                       |X86G_CC_MASK_C|X86G_CC_MASK_P;
6453   IRTemp oldflags   = newTemp(Ity_I32);
6454   assign( oldflags, mk_x86g_calculate_eflags_all() );
6455   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6456   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6457   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6458   stmt( IRStmt_Put( OFFB_CC_DEP1,
6459         binop(Iop_Or32,
6460               binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
6461               binop(Iop_And32,
6462                     binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
6463                     mkU32(mask_SZACP))
6464              )
6465   ));
6466   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6467      elimination of previous stores to this field work better. */
6468   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6469}
6470
6471
6472static
6473void codegen_LAHF ( void  )
6474{
6475   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
6476   IRExpr* eax_with_hole;
6477   IRExpr* new_byte;
6478   IRExpr* new_eax;
6479   UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6480                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
6481
6482   IRTemp  flags = newTemp(Ity_I32);
6483   assign( flags, mk_x86g_calculate_eflags_all() );
6484
6485   eax_with_hole
6486      = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
6487   new_byte
6488      = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
6489                        mkU32(1<<1));
6490   new_eax
6491      = binop(Iop_Or32, eax_with_hole,
6492                        binop(Iop_Shl32, new_byte, mkU8(8)));
6493   putIReg(4, R_EAX, new_eax);
6494}
6495
6496
6497static
6498UInt dis_cmpxchg_G_E ( UChar       sorb,
6499                       Bool        locked,
6500                       Int         size,
6501                       Int         delta0 )
6502{
6503   HChar dis_buf[50];
6504   Int   len;
6505
6506   IRType ty    = szToITy(size);
6507   IRTemp acc   = newTemp(ty);
6508   IRTemp src   = newTemp(ty);
6509   IRTemp dest  = newTemp(ty);
6510   IRTemp dest2 = newTemp(ty);
6511   IRTemp acc2  = newTemp(ty);
6512   IRTemp cond8 = newTemp(Ity_I8);
6513   IRTemp addr  = IRTemp_INVALID;
6514   UChar  rm    = getUChar(delta0);
6515
6516   /* There are 3 cases to consider:
6517
6518      reg-reg: ignore any lock prefix, generate sequence based
6519               on Mux0X
6520
6521      reg-mem, not locked: ignore any lock prefix, generate sequence
6522                           based on Mux0X
6523
6524      reg-mem, locked: use IRCAS
6525   */
6526   if (epartIsReg(rm)) {
6527      /* case 1 */
6528      assign( dest, getIReg(size, eregOfRM(rm)) );
6529      delta0++;
6530      assign( src, getIReg(size, gregOfRM(rm)) );
6531      assign( acc, getIReg(size, R_EAX) );
6532      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6533      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6534      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
6535      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6536      putIReg(size, R_EAX, mkexpr(acc2));
6537      putIReg(size, eregOfRM(rm), mkexpr(dest2));
6538      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6539                               nameIReg(size,gregOfRM(rm)),
6540                               nameIReg(size,eregOfRM(rm)) );
6541   }
6542   else if (!epartIsReg(rm) && !locked) {
6543      /* case 2 */
6544      addr = disAMode ( &len, sorb, delta0, dis_buf );
6545      assign( dest, loadLE(ty, mkexpr(addr)) );
6546      delta0 += len;
6547      assign( src, getIReg(size, gregOfRM(rm)) );
6548      assign( acc, getIReg(size, R_EAX) );
6549      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6550      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6551      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
6552      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6553      putIReg(size, R_EAX, mkexpr(acc2));
6554      storeLE( mkexpr(addr), mkexpr(dest2) );
6555      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6556                               nameIReg(size,gregOfRM(rm)), dis_buf);
6557   }
6558   else if (!epartIsReg(rm) && locked) {
6559      /* case 3 */
6560      /* src is new value.  acc is expected value.  dest is old value.
6561         Compute success from the output of the IRCAS, and steer the
6562         new value for EAX accordingly: in case of success, EAX is
6563         unchanged. */
6564      addr = disAMode ( &len, sorb, delta0, dis_buf );
6565      delta0 += len;
6566      assign( src, getIReg(size, gregOfRM(rm)) );
6567      assign( acc, getIReg(size, R_EAX) );
6568      stmt( IRStmt_CAS(
6569         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
6570                  NULL, mkexpr(acc), NULL, mkexpr(src) )
6571      ));
6572      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6573      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6574      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6575      putIReg(size, R_EAX, mkexpr(acc2));
6576      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6577                               nameIReg(size,gregOfRM(rm)), dis_buf);
6578   }
6579   else vassert(0);
6580
6581   return delta0;
6582}
6583
6584
6585/* Handle conditional move instructions of the form
6586      cmovcc E(reg-or-mem), G(reg)
6587
6588   E(src) is reg-or-mem
6589   G(dst) is reg.
6590
6591   If E is reg, -->    GET %E, tmps
6592                       GET %G, tmpd
6593                       CMOVcc tmps, tmpd
6594                       PUT tmpd, %G
6595
6596   If E is mem  -->    (getAddr E) -> tmpa
6597                       LD (tmpa), tmps
6598                       GET %G, tmpd
6599                       CMOVcc tmps, tmpd
6600                       PUT tmpd, %G
6601*/
6602static
6603UInt dis_cmov_E_G ( UChar       sorb,
6604                    Int         sz,
6605                    X86Condcode cond,
6606                    Int         delta0 )
6607{
6608   UChar rm  = getIByte(delta0);
6609   HChar dis_buf[50];
6610   Int   len;
6611
6612   IRType ty   = szToITy(sz);
6613   IRTemp tmps = newTemp(ty);
6614   IRTemp tmpd = newTemp(ty);
6615
6616   if (epartIsReg(rm)) {
6617      assign( tmps, getIReg(sz, eregOfRM(rm)) );
6618      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6619
6620      putIReg(sz, gregOfRM(rm),
6621                  IRExpr_Mux0X( unop(Iop_1Uto8,
6622                                     mk_x86g_calculate_condition(cond)),
6623                                mkexpr(tmpd),
6624                                mkexpr(tmps) )
6625             );
6626      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6627                              name_X86Condcode(cond),
6628                              nameIReg(sz,eregOfRM(rm)),
6629                              nameIReg(sz,gregOfRM(rm)));
6630      return 1+delta0;
6631   }
6632
6633   /* E refers to memory */
6634   {
6635      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6636      assign( tmps, loadLE(ty, mkexpr(addr)) );
6637      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6638
6639      putIReg(sz, gregOfRM(rm),
6640                  IRExpr_Mux0X( unop(Iop_1Uto8,
6641                                     mk_x86g_calculate_condition(cond)),
6642                                mkexpr(tmpd),
6643                                mkexpr(tmps) )
6644             );
6645
6646      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6647                              name_X86Condcode(cond),
6648                              dis_buf,
6649                              nameIReg(sz,gregOfRM(rm)));
6650      return len+delta0;
6651   }
6652}
6653
6654
6655static
6656UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
6657                    Bool* decodeOK )
6658{
6659   Int   len;
6660   UChar rm = getIByte(delta0);
6661   HChar dis_buf[50];
6662
6663   IRType ty    = szToITy(sz);
6664   IRTemp tmpd  = newTemp(ty);
6665   IRTemp tmpt0 = newTemp(ty);
6666   IRTemp tmpt1 = newTemp(ty);
6667
6668   /* There are 3 cases to consider:
6669
6670      reg-reg: ignore any lock prefix,
6671               generate 'naive' (non-atomic) sequence
6672
6673      reg-mem, not locked: ignore any lock prefix, generate 'naive'
6674                           (non-atomic) sequence
6675
6676      reg-mem, locked: use IRCAS
6677   */
6678
6679   if (epartIsReg(rm)) {
6680      /* case 1 */
6681      assign( tmpd,  getIReg(sz, eregOfRM(rm)));
6682      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6683      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6684                           mkexpr(tmpd), mkexpr(tmpt0)) );
6685      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6686      putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
6687      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6688      DIP("xadd%c %s, %s\n",
6689          nameISize(sz), nameIReg(sz,gregOfRM(rm)),
6690          				 nameIReg(sz,eregOfRM(rm)));
6691      *decodeOK = True;
6692      return 1+delta0;
6693   }
6694   else if (!epartIsReg(rm) && !locked) {
6695      /* case 2 */
6696      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6697      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6698      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6699      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6700                           mkexpr(tmpd), mkexpr(tmpt0)) );
6701      storeLE( mkexpr(addr), mkexpr(tmpt1) );
6702      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6703      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6704      DIP("xadd%c %s, %s\n",
6705          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6706      *decodeOK = True;
6707      return len+delta0;
6708   }
6709   else if (!epartIsReg(rm) && locked) {
6710      /* case 3 */
6711      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6712      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6713      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6714      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6715                           mkexpr(tmpd), mkexpr(tmpt0)) );
6716      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
6717                           mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
6718      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6719      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6720      DIP("xadd%c %s, %s\n",
6721          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6722      *decodeOK = True;
6723      return len+delta0;
6724   }
6725   /*UNREACHED*/
6726   vassert(0);
6727}
6728
6729/* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
6730
6731static
6732UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
6733{
6734   Int    len;
6735   IRTemp addr;
6736   UChar  rm  = getIByte(delta0);
6737   HChar  dis_buf[50];
6738
6739   if (epartIsReg(rm)) {
6740      putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
6741      DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
6742      return 1+delta0;
6743   } else {
6744      addr = disAMode ( &len, sorb, delta0, dis_buf );
6745      putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
6746      DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
6747      return len+delta0;
6748   }
6749}
6750
6751/* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
6752   dst is ireg and sz==4, zero out top half of it.  */
6753
6754static
6755UInt dis_mov_Sw_Ew ( UChar sorb,
6756                     Int   sz,
6757                     Int   delta0 )
6758{
6759   Int    len;
6760   IRTemp addr;
6761   UChar  rm  = getIByte(delta0);
6762   HChar  dis_buf[50];
6763
6764   vassert(sz == 2 || sz == 4);
6765
6766   if (epartIsReg(rm)) {
6767      if (sz == 4)
6768         putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
6769      else
6770         putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
6771
6772      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
6773      return 1+delta0;
6774   } else {
6775      addr = disAMode ( &len, sorb, delta0, dis_buf );
6776      storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
6777      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
6778      return len+delta0;
6779   }
6780}
6781
6782
6783static
6784void dis_push_segreg ( UInt sreg, Int sz )
6785{
6786    IRTemp t1 = newTemp(Ity_I16);
6787    IRTemp ta = newTemp(Ity_I32);
6788    vassert(sz == 2 || sz == 4);
6789
6790    assign( t1, getSReg(sreg) );
6791    assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
6792    putIReg(4, R_ESP, mkexpr(ta));
6793    storeLE( mkexpr(ta), mkexpr(t1) );
6794
6795    DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6796}
6797
6798static
6799void dis_pop_segreg ( UInt sreg, Int sz )
6800{
6801    IRTemp t1 = newTemp(Ity_I16);
6802    IRTemp ta = newTemp(Ity_I32);
6803    vassert(sz == 2 || sz == 4);
6804
6805    assign( ta, getIReg(4, R_ESP) );
6806    assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
6807
6808    putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
6809    putSReg( sreg, mkexpr(t1) );
6810    DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6811}
6812
6813static
6814void dis_ret ( UInt d32 )
6815{
6816   IRTemp t1 = newTemp(Ity_I32), t2 = newTemp(Ity_I32);
6817   assign(t1, getIReg(4,R_ESP));
6818   assign(t2, loadLE(Ity_I32,mkexpr(t1)));
6819   putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
6820   jmp_treg(Ijk_Ret,t2);
6821}
6822
6823/*------------------------------------------------------------*/
6824/*--- SSE/SSE2/SSE3 helpers                                ---*/
6825/*------------------------------------------------------------*/
6826
6827/* Worker function; do not call directly.
6828   Handles full width G = G `op` E   and   G = (not G) `op` E.
6829*/
6830
6831static UInt dis_SSE_E_to_G_all_wrk (
6832               UChar sorb, Int delta,
6833               HChar* opname, IROp op,
6834               Bool   invertG
6835            )
6836{
6837   HChar   dis_buf[50];
6838   Int     alen;
6839   IRTemp  addr;
6840   UChar   rm = getIByte(delta);
6841   IRExpr* gpart
6842      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
6843                : getXMMReg(gregOfRM(rm));
6844   if (epartIsReg(rm)) {
6845      putXMMReg( gregOfRM(rm),
6846                 binop(op, gpart,
6847                           getXMMReg(eregOfRM(rm))) );
6848      DIP("%s %s,%s\n", opname,
6849                        nameXMMReg(eregOfRM(rm)),
6850                        nameXMMReg(gregOfRM(rm)) );
6851      return delta+1;
6852   } else {
6853      addr = disAMode ( &alen, sorb, delta, dis_buf );
6854      putXMMReg( gregOfRM(rm),
6855                 binop(op, gpart,
6856                           loadLE(Ity_V128, mkexpr(addr))) );
6857      DIP("%s %s,%s\n", opname,
6858                        dis_buf,
6859                        nameXMMReg(gregOfRM(rm)) );
6860      return delta+alen;
6861   }
6862}
6863
6864
6865/* All lanes SSE binary operation, G = G `op` E. */
6866
6867static
6868UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, HChar* opname, IROp op )
6869{
6870   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
6871}
6872
6873/* All lanes SSE binary operation, G = (not G) `op` E. */
6874
6875static
6876UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
6877                               HChar* opname, IROp op )
6878{
6879   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
6880}
6881
6882
6883/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
6884
6885static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
6886                                  HChar* opname, IROp op )
6887{
6888   HChar   dis_buf[50];
6889   Int     alen;
6890   IRTemp  addr;
6891   UChar   rm = getIByte(delta);
6892   IRExpr* gpart = getXMMReg(gregOfRM(rm));
6893   if (epartIsReg(rm)) {
6894      putXMMReg( gregOfRM(rm),
6895                 binop(op, gpart,
6896                           getXMMReg(eregOfRM(rm))) );
6897      DIP("%s %s,%s\n", opname,
6898                        nameXMMReg(eregOfRM(rm)),
6899                        nameXMMReg(gregOfRM(rm)) );
6900      return delta+1;
6901   } else {
6902      /* We can only do a 32-bit memory read, so the upper 3/4 of the
6903         E operand needs to be made simply of zeroes. */
6904      IRTemp epart = newTemp(Ity_V128);
6905      addr = disAMode ( &alen, sorb, delta, dis_buf );
6906      assign( epart, unop( Iop_32UtoV128,
6907                           loadLE(Ity_I32, mkexpr(addr))) );
6908      putXMMReg( gregOfRM(rm),
6909                 binop(op, gpart, mkexpr(epart)) );
6910      DIP("%s %s,%s\n", opname,
6911                        dis_buf,
6912                        nameXMMReg(gregOfRM(rm)) );
6913      return delta+alen;
6914   }
6915}
6916
6917
6918/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
6919
6920static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
6921                                  HChar* opname, IROp op )
6922{
6923   HChar   dis_buf[50];
6924   Int     alen;
6925   IRTemp  addr;
6926   UChar   rm = getIByte(delta);
6927   IRExpr* gpart = getXMMReg(gregOfRM(rm));
6928   if (epartIsReg(rm)) {
6929      putXMMReg( gregOfRM(rm),
6930                 binop(op, gpart,
6931                           getXMMReg(eregOfRM(rm))) );
6932      DIP("%s %s,%s\n", opname,
6933                        nameXMMReg(eregOfRM(rm)),
6934                        nameXMMReg(gregOfRM(rm)) );
6935      return delta+1;
6936   } else {
6937      /* We can only do a 64-bit memory read, so the upper half of the
6938         E operand needs to be made simply of zeroes. */
6939      IRTemp epart = newTemp(Ity_V128);
6940      addr = disAMode ( &alen, sorb, delta, dis_buf );
6941      assign( epart, unop( Iop_64UtoV128,
6942                           loadLE(Ity_I64, mkexpr(addr))) );
6943      putXMMReg( gregOfRM(rm),
6944                 binop(op, gpart, mkexpr(epart)) );
6945      DIP("%s %s,%s\n", opname,
6946                        dis_buf,
6947                        nameXMMReg(gregOfRM(rm)) );
6948      return delta+alen;
6949   }
6950}
6951
6952
6953/* All lanes unary SSE operation, G = op(E). */
6954
6955static UInt dis_SSE_E_to_G_unary_all (
6956               UChar sorb, Int delta,
6957               HChar* opname, IROp op
6958            )
6959{
6960   HChar   dis_buf[50];
6961   Int     alen;
6962   IRTemp  addr;
6963   UChar   rm = getIByte(delta);
6964   if (epartIsReg(rm)) {
6965      putXMMReg( gregOfRM(rm),
6966                 unop(op, getXMMReg(eregOfRM(rm))) );
6967      DIP("%s %s,%s\n", opname,
6968                        nameXMMReg(eregOfRM(rm)),
6969                        nameXMMReg(gregOfRM(rm)) );
6970      return delta+1;
6971   } else {
6972      addr = disAMode ( &alen, sorb, delta, dis_buf );
6973      putXMMReg( gregOfRM(rm),
6974                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
6975      DIP("%s %s,%s\n", opname,
6976                        dis_buf,
6977                        nameXMMReg(gregOfRM(rm)) );
6978      return delta+alen;
6979   }
6980}
6981
6982
6983/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
6984
6985static UInt dis_SSE_E_to_G_unary_lo32 (
6986               UChar sorb, Int delta,
6987               HChar* opname, IROp op
6988            )
6989{
6990   /* First we need to get the old G value and patch the low 32 bits
6991      of the E operand into it.  Then apply op and write back to G. */
6992   HChar   dis_buf[50];
6993   Int     alen;
6994   IRTemp  addr;
6995   UChar   rm = getIByte(delta);
6996   IRTemp  oldG0 = newTemp(Ity_V128);
6997   IRTemp  oldG1 = newTemp(Ity_V128);
6998
6999   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7000
7001   if (epartIsReg(rm)) {
7002      assign( oldG1,
7003              binop( Iop_SetV128lo32,
7004                     mkexpr(oldG0),
7005                     getXMMRegLane32(eregOfRM(rm), 0)) );
7006      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7007      DIP("%s %s,%s\n", opname,
7008                        nameXMMReg(eregOfRM(rm)),
7009                        nameXMMReg(gregOfRM(rm)) );
7010      return delta+1;
7011   } else {
7012      addr = disAMode ( &alen, sorb, delta, dis_buf );
7013      assign( oldG1,
7014              binop( Iop_SetV128lo32,
7015                     mkexpr(oldG0),
7016                     loadLE(Ity_I32, mkexpr(addr)) ));
7017      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7018      DIP("%s %s,%s\n", opname,
7019                        dis_buf,
7020                        nameXMMReg(gregOfRM(rm)) );
7021      return delta+alen;
7022   }
7023}
7024
7025
7026/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
7027
7028static UInt dis_SSE_E_to_G_unary_lo64 (
7029               UChar sorb, Int delta,
7030               HChar* opname, IROp op
7031            )
7032{
7033   /* First we need to get the old G value and patch the low 64 bits
7034      of the E operand into it.  Then apply op and write back to G. */
7035   HChar   dis_buf[50];
7036   Int     alen;
7037   IRTemp  addr;
7038   UChar   rm = getIByte(delta);
7039   IRTemp  oldG0 = newTemp(Ity_V128);
7040   IRTemp  oldG1 = newTemp(Ity_V128);
7041
7042   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7043
7044   if (epartIsReg(rm)) {
7045      assign( oldG1,
7046              binop( Iop_SetV128lo64,
7047                     mkexpr(oldG0),
7048                     getXMMRegLane64(eregOfRM(rm), 0)) );
7049      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7050      DIP("%s %s,%s\n", opname,
7051                        nameXMMReg(eregOfRM(rm)),
7052                        nameXMMReg(gregOfRM(rm)) );
7053      return delta+1;
7054   } else {
7055      addr = disAMode ( &alen, sorb, delta, dis_buf );
7056      assign( oldG1,
7057              binop( Iop_SetV128lo64,
7058                     mkexpr(oldG0),
7059                     loadLE(Ity_I64, mkexpr(addr)) ));
7060      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7061      DIP("%s %s,%s\n", opname,
7062                        dis_buf,
7063                        nameXMMReg(gregOfRM(rm)) );
7064      return delta+alen;
7065   }
7066}
7067
7068
7069/* SSE integer binary operation:
7070      G = G `op` E   (eLeft == False)
7071      G = E `op` G   (eLeft == True)
7072*/
7073static UInt dis_SSEint_E_to_G(
7074               UChar sorb, Int delta,
7075               HChar* opname, IROp op,
7076               Bool   eLeft
7077            )
7078{
7079   HChar   dis_buf[50];
7080   Int     alen;
7081   IRTemp  addr;
7082   UChar   rm = getIByte(delta);
7083   IRExpr* gpart = getXMMReg(gregOfRM(rm));
7084   IRExpr* epart = NULL;
7085   if (epartIsReg(rm)) {
7086      epart = getXMMReg(eregOfRM(rm));
7087      DIP("%s %s,%s\n", opname,
7088                        nameXMMReg(eregOfRM(rm)),
7089                        nameXMMReg(gregOfRM(rm)) );
7090      delta += 1;
7091   } else {
7092      addr  = disAMode ( &alen, sorb, delta, dis_buf );
7093      epart = loadLE(Ity_V128, mkexpr(addr));
7094      DIP("%s %s,%s\n", opname,
7095                        dis_buf,
7096                        nameXMMReg(gregOfRM(rm)) );
7097      delta += alen;
7098   }
7099   putXMMReg( gregOfRM(rm),
7100              eLeft ? binop(op, epart, gpart)
7101	            : binop(op, gpart, epart) );
7102   return delta;
7103}
7104
7105
7106/* Helper for doing SSE FP comparisons. */
7107
7108static void findSSECmpOp ( Bool* needNot, IROp* op,
7109                           Int imm8, Bool all_lanes, Int sz )
7110{
7111   imm8 &= 7;
7112   *needNot = False;
7113   *op      = Iop_INVALID;
7114   if (imm8 >= 4) {
7115      *needNot = True;
7116      imm8 -= 4;
7117   }
7118
7119   if (sz == 4 && all_lanes) {
7120      switch (imm8) {
7121         case 0: *op = Iop_CmpEQ32Fx4; return;
7122         case 1: *op = Iop_CmpLT32Fx4; return;
7123         case 2: *op = Iop_CmpLE32Fx4; return;
7124         case 3: *op = Iop_CmpUN32Fx4; return;
7125         default: break;
7126      }
7127   }
7128   if (sz == 4 && !all_lanes) {
7129      switch (imm8) {
7130         case 0: *op = Iop_CmpEQ32F0x4; return;
7131         case 1: *op = Iop_CmpLT32F0x4; return;
7132         case 2: *op = Iop_CmpLE32F0x4; return;
7133         case 3: *op = Iop_CmpUN32F0x4; return;
7134         default: break;
7135      }
7136   }
7137   if (sz == 8 && all_lanes) {
7138      switch (imm8) {
7139         case 0: *op = Iop_CmpEQ64Fx2; return;
7140         case 1: *op = Iop_CmpLT64Fx2; return;
7141         case 2: *op = Iop_CmpLE64Fx2; return;
7142         case 3: *op = Iop_CmpUN64Fx2; return;
7143         default: break;
7144      }
7145   }
7146   if (sz == 8 && !all_lanes) {
7147      switch (imm8) {
7148         case 0: *op = Iop_CmpEQ64F0x2; return;
7149         case 1: *op = Iop_CmpLT64F0x2; return;
7150         case 2: *op = Iop_CmpLE64F0x2; return;
7151         case 3: *op = Iop_CmpUN64F0x2; return;
7152         default: break;
7153      }
7154   }
7155   vpanic("findSSECmpOp(x86,guest)");
7156}
7157
7158/* Handles SSE 32F/64F comparisons. */
7159
7160static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
7161				HChar* opname, Bool all_lanes, Int sz )
7162{
7163   HChar   dis_buf[50];
7164   Int     alen, imm8;
7165   IRTemp  addr;
7166   Bool    needNot = False;
7167   IROp    op      = Iop_INVALID;
7168   IRTemp  plain   = newTemp(Ity_V128);
7169   UChar   rm      = getIByte(delta);
7170   UShort  mask    = 0;
7171   vassert(sz == 4 || sz == 8);
7172   if (epartIsReg(rm)) {
7173      imm8 = getIByte(delta+1);
7174      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7175      assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
7176                               getXMMReg(eregOfRM(rm))) );
7177      delta += 2;
7178      DIP("%s $%d,%s,%s\n", opname,
7179                            (Int)imm8,
7180                            nameXMMReg(eregOfRM(rm)),
7181                            nameXMMReg(gregOfRM(rm)) );
7182   } else {
7183      addr = disAMode ( &alen, sorb, delta, dis_buf );
7184      imm8 = getIByte(delta+alen);
7185      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7186      assign( plain,
7187              binop(
7188                 op,
7189                 getXMMReg(gregOfRM(rm)),
7190                   all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
7191                 : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
7192                 : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
7193             )
7194      );
7195      delta += alen+1;
7196      DIP("%s $%d,%s,%s\n", opname,
7197                            (Int)imm8,
7198                            dis_buf,
7199                            nameXMMReg(gregOfRM(rm)) );
7200   }
7201
7202   if (needNot && all_lanes) {
7203      putXMMReg( gregOfRM(rm),
7204                 unop(Iop_NotV128, mkexpr(plain)) );
7205   }
7206   else
7207   if (needNot && !all_lanes) {
7208      mask = toUShort( sz==4 ? 0x000F : 0x00FF );
7209      putXMMReg( gregOfRM(rm),
7210                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
7211   }
7212   else {
7213      putXMMReg( gregOfRM(rm), mkexpr(plain) );
7214   }
7215
7216   return delta;
7217}
7218
7219
7220/* Vector by scalar shift of G by the amount specified at the bottom
7221   of E. */
7222
7223static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
7224                                 HChar* opname, IROp op )
7225{
7226   HChar   dis_buf[50];
7227   Int     alen, size;
7228   IRTemp  addr;
7229   Bool    shl, shr, sar;
7230   UChar   rm   = getIByte(delta);
7231   IRTemp  g0   = newTemp(Ity_V128);
7232   IRTemp  g1   = newTemp(Ity_V128);
7233   IRTemp  amt  = newTemp(Ity_I32);
7234   IRTemp  amt8 = newTemp(Ity_I8);
7235   if (epartIsReg(rm)) {
7236      assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
7237      DIP("%s %s,%s\n", opname,
7238                        nameXMMReg(eregOfRM(rm)),
7239                        nameXMMReg(gregOfRM(rm)) );
7240      delta++;
7241   } else {
7242      addr = disAMode ( &alen, sorb, delta, dis_buf );
7243      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
7244      DIP("%s %s,%s\n", opname,
7245                        dis_buf,
7246                        nameXMMReg(gregOfRM(rm)) );
7247      delta += alen;
7248   }
7249   assign( g0,   getXMMReg(gregOfRM(rm)) );
7250   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
7251
7252   shl = shr = sar = False;
7253   size = 0;
7254   switch (op) {
7255      case Iop_ShlN16x8: shl = True; size = 32; break;
7256      case Iop_ShlN32x4: shl = True; size = 32; break;
7257      case Iop_ShlN64x2: shl = True; size = 64; break;
7258      case Iop_SarN16x8: sar = True; size = 16; break;
7259      case Iop_SarN32x4: sar = True; size = 32; break;
7260      case Iop_ShrN16x8: shr = True; size = 16; break;
7261      case Iop_ShrN32x4: shr = True; size = 32; break;
7262      case Iop_ShrN64x2: shr = True; size = 64; break;
7263      default: vassert(0);
7264   }
7265
7266   if (shl || shr) {
7267     assign(
7268        g1,
7269        IRExpr_Mux0X(
7270           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
7271           mkV128(0x0000),
7272           binop(op, mkexpr(g0), mkexpr(amt8))
7273        )
7274     );
7275   } else
7276   if (sar) {
7277     assign(
7278        g1,
7279        IRExpr_Mux0X(
7280           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
7281           binop(op, mkexpr(g0), mkU8(size-1)),
7282           binop(op, mkexpr(g0), mkexpr(amt8))
7283        )
7284     );
7285   } else {
7286      /*NOTREACHED*/
7287      vassert(0);
7288   }
7289
7290   putXMMReg( gregOfRM(rm), mkexpr(g1) );
7291   return delta;
7292}
7293
7294
7295/* Vector by scalar shift of E by an immediate byte. */
7296
7297static
7298UInt dis_SSE_shiftE_imm ( Int delta, HChar* opname, IROp op )
7299{
7300   Bool    shl, shr, sar;
7301   UChar   rm   = getIByte(delta);
7302   IRTemp  e0   = newTemp(Ity_V128);
7303   IRTemp  e1   = newTemp(Ity_V128);
7304   UChar   amt, size;
7305   vassert(epartIsReg(rm));
7306   vassert(gregOfRM(rm) == 2
7307           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
7308   amt = getIByte(delta+1);
7309   delta += 2;
7310   DIP("%s $%d,%s\n", opname,
7311                      (Int)amt,
7312                      nameXMMReg(eregOfRM(rm)) );
7313   assign( e0, getXMMReg(eregOfRM(rm)) );
7314
7315   shl = shr = sar = False;
7316   size = 0;
7317   switch (op) {
7318      case Iop_ShlN16x8: shl = True; size = 16; break;
7319      case Iop_ShlN32x4: shl = True; size = 32; break;
7320      case Iop_ShlN64x2: shl = True; size = 64; break;
7321      case Iop_SarN16x8: sar = True; size = 16; break;
7322      case Iop_SarN32x4: sar = True; size = 32; break;
7323      case Iop_ShrN16x8: shr = True; size = 16; break;
7324      case Iop_ShrN32x4: shr = True; size = 32; break;
7325      case Iop_ShrN64x2: shr = True; size = 64; break;
7326      default: vassert(0);
7327   }
7328
7329   if (shl || shr) {
7330      assign( e1, amt >= size
7331                     ? mkV128(0x0000)
7332                     : binop(op, mkexpr(e0), mkU8(amt))
7333      );
7334   } else
7335   if (sar) {
7336      assign( e1, amt >= size
7337                     ? binop(op, mkexpr(e0), mkU8(size-1))
7338                     : binop(op, mkexpr(e0), mkU8(amt))
7339      );
7340   } else {
7341      /*NOTREACHED*/
7342      vassert(0);
7343   }
7344
7345   putXMMReg( eregOfRM(rm), mkexpr(e1) );
7346   return delta;
7347}
7348
7349
7350/* Get the current SSE rounding mode. */
7351
7352static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
7353{
7354   return binop( Iop_And32,
7355                 IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
7356                 mkU32(3) );
7357}
7358
7359static void put_sse_roundingmode ( IRExpr* sseround )
7360{
7361   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
7362   stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
7363}
7364
7365/* Break a 128-bit value up into four 32-bit ints. */
7366
7367static void breakup128to32s ( IRTemp t128,
7368			      /*OUTs*/
7369                              IRTemp* t3, IRTemp* t2,
7370                              IRTemp* t1, IRTemp* t0 )
7371{
7372   IRTemp hi64 = newTemp(Ity_I64);
7373   IRTemp lo64 = newTemp(Ity_I64);
7374   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
7375   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
7376
7377   vassert(t0 && *t0 == IRTemp_INVALID);
7378   vassert(t1 && *t1 == IRTemp_INVALID);
7379   vassert(t2 && *t2 == IRTemp_INVALID);
7380   vassert(t3 && *t3 == IRTemp_INVALID);
7381
7382   *t0 = newTemp(Ity_I32);
7383   *t1 = newTemp(Ity_I32);
7384   *t2 = newTemp(Ity_I32);
7385   *t3 = newTemp(Ity_I32);
7386   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
7387   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
7388   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
7389   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
7390}
7391
7392/* Construct a 128-bit value from four 32-bit ints. */
7393
7394static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
7395                              IRTemp t1, IRTemp t0 )
7396{
7397   return
7398      binop( Iop_64HLtoV128,
7399             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
7400             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
7401   );
7402}
7403
7404/* Break a 64-bit value up into four 16-bit ints. */
7405
7406static void breakup64to16s ( IRTemp t64,
7407                             /*OUTs*/
7408                             IRTemp* t3, IRTemp* t2,
7409                             IRTemp* t1, IRTemp* t0 )
7410{
7411   IRTemp hi32 = newTemp(Ity_I32);
7412   IRTemp lo32 = newTemp(Ity_I32);
7413   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
7414   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
7415
7416   vassert(t0 && *t0 == IRTemp_INVALID);
7417   vassert(t1 && *t1 == IRTemp_INVALID);
7418   vassert(t2 && *t2 == IRTemp_INVALID);
7419   vassert(t3 && *t3 == IRTemp_INVALID);
7420
7421   *t0 = newTemp(Ity_I16);
7422   *t1 = newTemp(Ity_I16);
7423   *t2 = newTemp(Ity_I16);
7424   *t3 = newTemp(Ity_I16);
7425   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
7426   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
7427   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
7428   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
7429}
7430
7431/* Construct a 64-bit value from four 16-bit ints. */
7432
7433static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
7434                             IRTemp t1, IRTemp t0 )
7435{
7436   return
7437      binop( Iop_32HLto64,
7438             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
7439             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
7440   );
7441}
7442
7443/* Generate IR to set the guest %EFLAGS from the pushfl-format image
7444   in the given 32-bit temporary.  The flags that are set are: O S Z A
7445   C P D ID AC.
7446
7447   In all cases, code to set AC is generated.  However, VEX actually
7448   ignores the AC value and so can optionally emit an emulation
7449   warning when it is enabled.  In this routine, an emulation warning
7450   is only emitted if emit_AC_emwarn is True, in which case
7451   next_insn_EIP must be correct (this allows for correct code
7452   generation for popfl/popfw).  If emit_AC_emwarn is False,
7453   next_insn_EIP is unimportant (this allows for easy if kludgey code
7454   generation for IRET.) */
7455
7456static
7457void set_EFLAGS_from_value ( IRTemp t1,
7458                             Bool   emit_AC_emwarn,
7459                             Addr32 next_insn_EIP )
7460{
7461   vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
7462
7463   /* t1 is the flag word.  Mask out everything except OSZACP and set
7464      the flags thunk to X86G_CC_OP_COPY. */
7465   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
7466   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
7467   stmt( IRStmt_Put( OFFB_CC_DEP1,
7468                     binop(Iop_And32,
7469                           mkexpr(t1),
7470                           mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
7471                                  | X86G_CC_MASK_A | X86G_CC_MASK_Z
7472                                  | X86G_CC_MASK_S| X86G_CC_MASK_O )
7473                          )
7474                    )
7475       );
7476   /* Set NDEP even though it isn't used.  This makes redundant-PUT
7477      elimination of previous stores to this field work better. */
7478   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
7479
7480   /* Also need to set the D flag, which is held in bit 10 of t1.
7481      If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
7482   stmt( IRStmt_Put(
7483            OFFB_DFLAG,
7484            IRExpr_Mux0X(
7485               unop(Iop_32to8,
7486                    binop(Iop_And32,
7487                          binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
7488                          mkU32(1))),
7489               mkU32(1),
7490               mkU32(0xFFFFFFFF)))
7491       );
7492
7493   /* Set the ID flag */
7494   stmt( IRStmt_Put(
7495            OFFB_IDFLAG,
7496            IRExpr_Mux0X(
7497               unop(Iop_32to8,
7498                    binop(Iop_And32,
7499                          binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
7500                          mkU32(1))),
7501               mkU32(0),
7502               mkU32(1)))
7503       );
7504
7505   /* And set the AC flag.  If setting it 1 to, possibly emit an
7506      emulation warning. */
7507   stmt( IRStmt_Put(
7508            OFFB_ACFLAG,
7509            IRExpr_Mux0X(
7510               unop(Iop_32to8,
7511                    binop(Iop_And32,
7512                          binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
7513                          mkU32(1))),
7514               mkU32(0),
7515               mkU32(1)))
7516       );
7517
7518   if (emit_AC_emwarn) {
7519      put_emwarn( mkU32(EmWarn_X86_acFlag) );
7520      stmt(
7521         IRStmt_Exit(
7522            binop( Iop_CmpNE32,
7523                   binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
7524                   mkU32(0) ),
7525            Ijk_EmWarn,
7526            IRConst_U32( next_insn_EIP )
7527         )
7528      );
7529   }
7530}
7531
7532
7533/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
7534   values (aa,bb), computes, for each of the 4 16-bit lanes:
7535
7536   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
7537*/
7538static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
7539{
7540   IRTemp aa      = newTemp(Ity_I64);
7541   IRTemp bb      = newTemp(Ity_I64);
7542   IRTemp aahi32s = newTemp(Ity_I64);
7543   IRTemp aalo32s = newTemp(Ity_I64);
7544   IRTemp bbhi32s = newTemp(Ity_I64);
7545   IRTemp bblo32s = newTemp(Ity_I64);
7546   IRTemp rHi     = newTemp(Ity_I64);
7547   IRTemp rLo     = newTemp(Ity_I64);
7548   IRTemp one32x2 = newTemp(Ity_I64);
7549   assign(aa, aax);
7550   assign(bb, bbx);
7551   assign( aahi32s,
7552           binop(Iop_SarN32x2,
7553                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
7554                 mkU8(16) ));
7555   assign( aalo32s,
7556           binop(Iop_SarN32x2,
7557                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
7558                 mkU8(16) ));
7559   assign( bbhi32s,
7560           binop(Iop_SarN32x2,
7561                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
7562                 mkU8(16) ));
7563   assign( bblo32s,
7564           binop(Iop_SarN32x2,
7565                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
7566                 mkU8(16) ));
7567   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
7568   assign(
7569      rHi,
7570      binop(
7571         Iop_ShrN32x2,
7572         binop(
7573            Iop_Add32x2,
7574            binop(
7575               Iop_ShrN32x2,
7576               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
7577               mkU8(14)
7578            ),
7579            mkexpr(one32x2)
7580         ),
7581         mkU8(1)
7582      )
7583   );
7584   assign(
7585      rLo,
7586      binop(
7587         Iop_ShrN32x2,
7588         binop(
7589            Iop_Add32x2,
7590            binop(
7591               Iop_ShrN32x2,
7592               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
7593               mkU8(14)
7594            ),
7595            mkexpr(one32x2)
7596         ),
7597         mkU8(1)
7598      )
7599   );
7600   return
7601      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
7602}
7603
7604/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
7605   values (aa,bb), computes, for each lane:
7606
7607          if aa_lane < 0 then - bb_lane
7608     else if aa_lane > 0 then bb_lane
7609     else 0
7610*/
7611static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
7612{
7613   IRTemp aa       = newTemp(Ity_I64);
7614   IRTemp bb       = newTemp(Ity_I64);
7615   IRTemp zero     = newTemp(Ity_I64);
7616   IRTemp bbNeg    = newTemp(Ity_I64);
7617   IRTemp negMask  = newTemp(Ity_I64);
7618   IRTemp posMask  = newTemp(Ity_I64);
7619   IROp   opSub    = Iop_INVALID;
7620   IROp   opCmpGTS = Iop_INVALID;
7621
7622   switch (laneszB) {
7623      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
7624      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
7625      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
7626      default: vassert(0);
7627   }
7628
7629   assign( aa,      aax );
7630   assign( bb,      bbx );
7631   assign( zero,    mkU64(0) );
7632   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
7633   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
7634   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
7635
7636   return
7637      binop(Iop_Or64,
7638            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
7639            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
7640
7641}
7642
7643/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
7644   value aa, computes, for each lane
7645
7646   if aa < 0 then -aa else aa
7647
7648   Note that the result is interpreted as unsigned, so that the
7649   absolute value of the most negative signed input can be
7650   represented.
7651*/
7652static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
7653{
7654   IRTemp aa      = newTemp(Ity_I64);
7655   IRTemp zero    = newTemp(Ity_I64);
7656   IRTemp aaNeg   = newTemp(Ity_I64);
7657   IRTemp negMask = newTemp(Ity_I64);
7658   IRTemp posMask = newTemp(Ity_I64);
7659   IROp   opSub   = Iop_INVALID;
7660   IROp   opSarN  = Iop_INVALID;
7661
7662   switch (laneszB) {
7663      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
7664      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
7665      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
7666      default: vassert(0);
7667   }
7668
7669   assign( aa,      aax );
7670   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
7671   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
7672   assign( zero,    mkU64(0) );
7673   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
7674   return
7675      binop(Iop_Or64,
7676            binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
7677            binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
7678}
7679
7680static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
7681                                        IRTemp lo64, Int byteShift )
7682{
7683   vassert(byteShift >= 1 && byteShift <= 7);
7684   return
7685      binop(Iop_Or64,
7686            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
7687            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
7688      );
7689}
7690
7691/* Generate a SIGSEGV followed by a restart of the current instruction
7692   if effective_addr is not 16-aligned.  This is required behaviour
7693   for some SSE3 instructions and all 128-bit SSSE3 instructions.
7694   This assumes that guest_RIP_curr_instr is set correctly! */
7695/* TODO(glider): we've replaced the 0xF mask with 0x0, effectively disabling
7696 * the check. Need to enable it once TSan stops generating unaligned
7697 * accesses in the wrappers.
7698 * See http://code.google.com/p/data-race-test/issues/detail?id=49 */
7699static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
7700{
7701   stmt(
7702      IRStmt_Exit(
7703         binop(Iop_CmpNE32,
7704               binop(Iop_And32,mkexpr(effective_addr),mkU32(0x0)),
7705               mkU32(0)),
7706         Ijk_SigSEGV,
7707         IRConst_U32(guest_EIP_curr_instr)
7708      )
7709   );
7710}
7711
7712
7713/* Helper for deciding whether a given insn (starting at the opcode
7714   byte) may validly be used with a LOCK prefix.  The following insns
7715   may be used with LOCK when their destination operand is in memory.
7716   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
7717
7718   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
7719   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
7720   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
7721   SBB        81 /3,  81 /3,  82 /x,  83 /3,  18,  19
7722   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
7723   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
7724   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
7725
7726   DEC        FE /1,  FF /1
7727   INC        FE /0,  FF /0
7728
7729   NEG        F6 /3,  F7 /3
7730   NOT        F6 /2,  F7 /2
7731
7732   XCHG       86, 87
7733
7734   BTC        0F BB,  0F BA /7
7735   BTR        0F B3,  0F BA /6
7736   BTS        0F AB,  0F BA /5
7737
7738   CMPXCHG    0F B0,  0F B1
7739   CMPXCHG8B  0F C7 /1
7740
7741   XADD       0F C0,  0F C1
7742
7743   ------------------------------
7744
7745   80 /0  =  addb $imm8,  rm8
7746   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
7747   82 /0  =  addb $imm8,  rm8
7748   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
7749
7750   00     =  addb r8,  rm8
7751   01     =  addl r32, rm32  and  addw r16, rm16
7752
7753   Same for ADD OR ADC SBB AND SUB XOR
7754
7755   FE /1  = dec rm8
7756   FF /1  = dec rm32  and  dec rm16
7757
7758   FE /0  = inc rm8
7759   FF /0  = inc rm32  and  inc rm16
7760
7761   F6 /3  = neg rm8
7762   F7 /3  = neg rm32  and  neg rm16
7763
7764   F6 /2  = not rm8
7765   F7 /2  = not rm32  and  not rm16
7766
7767   0F BB     = btcw r16, rm16    and  btcl r32, rm32
7768   OF BA /7  = btcw $imm8, rm16  and  btcw $imm8, rm32
7769
7770   Same for BTS, BTR
7771*/
7772static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
7773{
7774   switch (opc[0]) {
7775      case 0x00: case 0x01: case 0x08: case 0x09:
7776      case 0x10: case 0x11: case 0x18: case 0x19:
7777      case 0x20: case 0x21: case 0x28: case 0x29:
7778      case 0x30: case 0x31:
7779         if (!epartIsReg(opc[1]))
7780            return True;
7781         break;
7782
7783      case 0x80: case 0x81: case 0x82: case 0x83:
7784         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
7785             && !epartIsReg(opc[1]))
7786            return True;
7787         break;
7788
7789      case 0xFE: case 0xFF:
7790         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
7791             && !epartIsReg(opc[1]))
7792            return True;
7793         break;
7794
7795      case 0xF6: case 0xF7:
7796         if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
7797             && !epartIsReg(opc[1]))
7798            return True;
7799         break;
7800
7801      case 0x86: case 0x87:
7802         if (!epartIsReg(opc[1]))
7803            return True;
7804         break;
7805
7806      case 0x0F: {
7807         switch (opc[1]) {
7808            case 0xBB: case 0xB3: case 0xAB:
7809               if (!epartIsReg(opc[2]))
7810                  return True;
7811               break;
7812            case 0xBA:
7813               if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
7814                   && !epartIsReg(opc[2]))
7815                  return True;
7816               break;
7817            case 0xB0: case 0xB1:
7818               if (!epartIsReg(opc[2]))
7819                  return True;
7820               break;
7821            case 0xC7:
7822               if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
7823                  return True;
7824               break;
7825            case 0xC0: case 0xC1:
7826               if (!epartIsReg(opc[2]))
7827                  return True;
7828               break;
7829            default:
7830               break;
7831         } /* switch (opc[1]) */
7832         break;
7833      }
7834
7835      default:
7836         break;
7837   } /* switch (opc[0]) */
7838
7839   return False;
7840}
7841
7842
7843/*------------------------------------------------------------*/
7844/*--- Disassemble a single instruction                     ---*/
7845/*------------------------------------------------------------*/
7846
7847/* Disassemble a single instruction into IR.  The instruction is
7848   located in host memory at &guest_code[delta].  *expect_CAS is set
7849   to True if the resulting IR is expected to contain an IRCAS
7850   statement, and False if it's not expected to.  This makes it
7851   possible for the caller of disInstr_X86_WRK to check that
7852   LOCK-prefixed instructions are at least plausibly translated, in
7853   that it becomes possible to check that a (validly) LOCK-prefixed
7854   instruction generates a translation containing an IRCAS, and
7855   instructions without LOCK prefixes don't generate translations
7856   containing an IRCAS.
7857*/
7858static
7859DisResult disInstr_X86_WRK (
7860             /*OUT*/Bool* expect_CAS,
7861             Bool         put_IP,
7862             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
7863             Bool         resteerCisOk,
7864             void*        callback_opaque,
7865             Long         delta64,
7866             VexArchInfo* archinfo,
7867             VexAbiInfo*  vbi
7868          )
7869{
7870   IRType    ty;
7871   IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
7872   Int       alen;
7873   UChar     opc, modrm, abyte, pre;
7874   UInt      d32;
7875   HChar     dis_buf[50];
7876   Int       am_sz, d_sz, n_prefixes;
7877   DisResult dres;
7878   UChar*    insn; /* used in SSE decoders */
7879
7880   /* The running delta */
7881   Int delta = (Int)delta64;
7882
7883   /* Holds eip at the start of the insn, so that we can print
7884      consistent error messages for unimplemented insns. */
7885   Int delta_start = delta;
7886
7887   /* sz denotes the nominal data-op size of the insn; we change it to
7888      2 if an 0x66 prefix is seen */
7889   Int sz = 4;
7890
7891   /* sorb holds the segment-override-prefix byte, if any.  Zero if no
7892      prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
7893      indicating the prefix.  */
7894   UChar sorb = 0;
7895
7896   /* Gets set to True if a LOCK prefix is seen. */
7897   Bool pfx_lock = False;
7898
7899   /* Set result defaults. */
7900   dres.whatNext   = Dis_Continue;
7901   dres.len        = 0;
7902   dres.continueAt = 0;
7903
7904   *expect_CAS = False;
7905
7906   addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
7907
7908   vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
7909   DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
7910
7911   /* We may be asked to update the guest EIP before going further. */
7912   if (put_IP)
7913      stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr)) );
7914
7915   /* Spot "Special" instructions (see comment at top of file). */
7916   {
7917      UChar* code = (UChar*)(guest_code + delta);
7918      /* Spot the 12-byte preamble:
7919         C1C703   roll $3,  %edi
7920         C1C70D   roll $13, %edi
7921         C1C71D   roll $29, %edi
7922         C1C713   roll $19, %edi
7923      */
7924      if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
7925          code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
7926          code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
7927          code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
7928         /* Got a "Special" instruction preamble.  Which one is it? */
7929         if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
7930            /* %EDX = client_request ( %EAX ) */
7931            DIP("%%edx = client_request ( %%eax )\n");
7932            delta += 14;
7933            jmp_lit(Ijk_ClientReq, guest_EIP_bbstart+delta);
7934            dres.whatNext = Dis_StopHere;
7935            goto decode_success;
7936         }
7937         else
7938         if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
7939            /* %EAX = guest_NRADDR */
7940            DIP("%%eax = guest_NRADDR\n");
7941            delta += 14;
7942            putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
7943            goto decode_success;
7944         }
7945         else
7946         if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
7947            /* call-noredir *%EAX */
7948            DIP("call-noredir *%%eax\n");
7949            delta += 14;
7950            t1 = newTemp(Ity_I32);
7951            assign(t1, getIReg(4,R_EAX));
7952            t2 = newTemp(Ity_I32);
7953            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
7954            putIReg(4, R_ESP, mkexpr(t2));
7955            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
7956            jmp_treg(Ijk_NoRedir,t1);
7957            dres.whatNext = Dis_StopHere;
7958            goto decode_success;
7959         }
7960         /* We don't know what it is. */
7961         goto decode_failure;
7962         /*NOTREACHED*/
7963      }
7964   }
7965
7966   /* Handle a couple of weird-ass NOPs that have been observed in the
7967      wild. */
7968   {
7969      UChar* code = (UChar*)(guest_code + delta);
7970      /* Sun's JVM 1.5.0 uses the following as a NOP:
7971         26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
7972      if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
7973          && code[3] == 0x65 && code[4] == 0x90) {
7974         DIP("%%es:%%cs:%%fs:%%gs:nop\n");
7975         delta += 5;
7976         goto decode_success;
7977      }
7978      /* Don't barf on recent binutils padding,
7979         all variants of which are: nopw %cs:0x0(%eax,%eax,1)
7980         66 2e 0f 1f 84 00 00 00 00 00
7981         66 66 2e 0f 1f 84 00 00 00 00 00
7982         66 66 66 2e 0f 1f 84 00 00 00 00 00
7983         66 66 66 66 2e 0f 1f 84 00 00 00 00 00
7984         66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
7985         66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
7986      */
7987      if (code[0] == 0x66) {
7988         Int data16_cnt;
7989         for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
7990            if (code[data16_cnt] != 0x66)
7991               break;
7992         if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
7993             && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
7994             && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
7995             && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
7996             && code[data16_cnt + 8] == 0x00 ) {
7997            DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
7998            delta += 9 + data16_cnt;
7999            goto decode_success;
8000         }
8001      }
8002   }
8003
8004   /* Normal instruction handling starts here. */
8005
8006   /* Deal with some but not all prefixes:
8007         66(oso)
8008         F0(lock)
8009         2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
8010      Not dealt with (left in place):
8011         F2 F3
8012   */
8013   n_prefixes = 0;
8014   while (True) {
8015      if (n_prefixes > 7) goto decode_failure;
8016      pre = getUChar(delta);
8017      switch (pre) {
8018         case 0x66:
8019            sz = 2;
8020            break;
8021         case 0xF0:
8022            pfx_lock = True;
8023            *expect_CAS = True;
8024            break;
8025         case 0x3E: /* %DS: */
8026         case 0x26: /* %ES: */
8027         case 0x64: /* %FS: */
8028         case 0x65: /* %GS: */
8029            if (sorb != 0)
8030               goto decode_failure; /* only one seg override allowed */
8031            sorb = pre;
8032            break;
8033         case 0x2E: { /* %CS: */
8034            /* 2E prefix on a conditional branch instruction is a
8035               branch-prediction hint, which can safely be ignored.  */
8036            UChar op1 = getIByte(delta+1);
8037            UChar op2 = getIByte(delta+2);
8038            if ((op1 >= 0x70 && op1 <= 0x7F)
8039                || (op1 == 0xE3)
8040                || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
8041               if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
8042            } else {
8043               /* All other CS override cases are not handled */
8044               goto decode_failure;
8045            }
8046            break;
8047         }
8048         case 0x36: /* %SS: */
8049            /* SS override cases are not handled */
8050            goto decode_failure;
8051         default:
8052            goto not_a_prefix;
8053      }
8054      n_prefixes++;
8055      delta++;
8056   }
8057
8058   not_a_prefix:
8059
8060   /* Now we should be looking at the primary opcode byte or the
8061      leading F2 or F3.  Check that any LOCK prefix is actually
8062      allowed. */
8063
8064   if (pfx_lock) {
8065      if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
8066         DIP("lock ");
8067      } else {
8068         *expect_CAS = False;
8069         goto decode_failure;
8070      }
8071   }
8072
8073
8074   /* ---------------------------------------------------- */
8075   /* --- The SSE decoder.                             --- */
8076   /* ---------------------------------------------------- */
8077
8078   /* What did I do to deserve SSE ?  Perhaps I was really bad in a
8079      previous life? */
8080
8081   /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
8082      later section, further on. */
8083
8084   insn = (UChar*)&guest_code[delta];
8085
8086   /* Treat fxsave specially.  It should be doable even on an SSE0
8087      (Pentium-II class) CPU.  Hence be prepared to handle it on
8088      any subarchitecture variant.
8089   */
8090
8091   /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
8092   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8093       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
8094      IRDirty* d;
8095      modrm = getIByte(delta+2);
8096      vassert(sz == 4);
8097      vassert(!epartIsReg(modrm));
8098
8099      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8100      delta += 2+alen;
8101      gen_SEGV_if_not_16_aligned(addr);
8102
8103      DIP("fxsave %s\n", dis_buf);
8104
8105      /* Uses dirty helper:
8106            void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
8107      d = unsafeIRDirty_0_N (
8108             0/*regparms*/,
8109             "x86g_dirtyhelper_FXSAVE",
8110             &x86g_dirtyhelper_FXSAVE,
8111             mkIRExprVec_1( mkexpr(addr) )
8112          );
8113      d->needsBBP = True;
8114
8115      /* declare we're writing memory */
8116      d->mFx   = Ifx_Write;
8117      d->mAddr = mkexpr(addr);
8118      d->mSize = 512;
8119
8120      /* declare we're reading guest state */
8121      d->nFxState = 7;
8122
8123      d->fxState[0].fx     = Ifx_Read;
8124      d->fxState[0].offset = OFFB_FTOP;
8125      d->fxState[0].size   = sizeof(UInt);
8126
8127      d->fxState[1].fx     = Ifx_Read;
8128      d->fxState[1].offset = OFFB_FPREGS;
8129      d->fxState[1].size   = 8 * sizeof(ULong);
8130
8131      d->fxState[2].fx     = Ifx_Read;
8132      d->fxState[2].offset = OFFB_FPTAGS;
8133      d->fxState[2].size   = 8 * sizeof(UChar);
8134
8135      d->fxState[3].fx     = Ifx_Read;
8136      d->fxState[3].offset = OFFB_FPROUND;
8137      d->fxState[3].size   = sizeof(UInt);
8138
8139      d->fxState[4].fx     = Ifx_Read;
8140      d->fxState[4].offset = OFFB_FC3210;
8141      d->fxState[4].size   = sizeof(UInt);
8142
8143      d->fxState[5].fx     = Ifx_Read;
8144      d->fxState[5].offset = OFFB_XMM0;
8145      d->fxState[5].size   = 8 * sizeof(U128);
8146
8147      d->fxState[6].fx     = Ifx_Read;
8148      d->fxState[6].offset = OFFB_SSEROUND;
8149      d->fxState[6].size   = sizeof(UInt);
8150
8151      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8152	 images are packed back-to-back.  If not, the value of
8153	 d->fxState[5].size is wrong. */
8154      vassert(16 == sizeof(U128));
8155      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8156
8157      stmt( IRStmt_Dirty(d) );
8158
8159      goto decode_success;
8160   }
8161
8162   /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
8163   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8164       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
8165      IRDirty* d;
8166      modrm = getIByte(delta+2);
8167      vassert(sz == 4);
8168      vassert(!epartIsReg(modrm));
8169
8170      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8171      delta += 2+alen;
8172      gen_SEGV_if_not_16_aligned(addr);
8173
8174      DIP("fxrstor %s\n", dis_buf);
8175
8176      /* Uses dirty helper:
8177            VexEmWarn x86g_do_FXRSTOR ( VexGuestX86State*, UInt )
8178         NOTE:
8179            the VexEmWarn value is simply ignored (unlike for FRSTOR)
8180      */
8181      d = unsafeIRDirty_0_N (
8182             0/*regparms*/,
8183             "x86g_dirtyhelper_FXRSTOR",
8184             &x86g_dirtyhelper_FXRSTOR,
8185             mkIRExprVec_1( mkexpr(addr) )
8186          );
8187      d->needsBBP = True;
8188
8189      /* declare we're reading memory */
8190      d->mFx   = Ifx_Read;
8191      d->mAddr = mkexpr(addr);
8192      d->mSize = 512;
8193
8194      /* declare we're writing guest state */
8195      d->nFxState = 7;
8196
8197      d->fxState[0].fx     = Ifx_Write;
8198      d->fxState[0].offset = OFFB_FTOP;
8199      d->fxState[0].size   = sizeof(UInt);
8200
8201      d->fxState[1].fx     = Ifx_Write;
8202      d->fxState[1].offset = OFFB_FPREGS;
8203      d->fxState[1].size   = 8 * sizeof(ULong);
8204
8205      d->fxState[2].fx     = Ifx_Write;
8206      d->fxState[2].offset = OFFB_FPTAGS;
8207      d->fxState[2].size   = 8 * sizeof(UChar);
8208
8209      d->fxState[3].fx     = Ifx_Write;
8210      d->fxState[3].offset = OFFB_FPROUND;
8211      d->fxState[3].size   = sizeof(UInt);
8212
8213      d->fxState[4].fx     = Ifx_Write;
8214      d->fxState[4].offset = OFFB_FC3210;
8215      d->fxState[4].size   = sizeof(UInt);
8216
8217      d->fxState[5].fx     = Ifx_Write;
8218      d->fxState[5].offset = OFFB_XMM0;
8219      d->fxState[5].size   = 8 * sizeof(U128);
8220
8221      d->fxState[6].fx     = Ifx_Write;
8222      d->fxState[6].offset = OFFB_SSEROUND;
8223      d->fxState[6].size   = sizeof(UInt);
8224
8225      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8226	 images are packed back-to-back.  If not, the value of
8227	 d->fxState[5].size is wrong. */
8228      vassert(16 == sizeof(U128));
8229      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8230
8231      stmt( IRStmt_Dirty(d) );
8232
8233      goto decode_success;
8234   }
8235
8236   /* ------ SSE decoder main ------ */
8237
8238   /* Skip parts of the decoder which don't apply given the stated
8239      guest subarchitecture. */
8240   if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
8241      goto after_sse_decoders;
8242
8243   /* Otherwise we must be doing sse1 or sse2, so we can at least try
8244      for SSE1 here. */
8245
8246   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
8247   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
8248      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
8249      goto decode_success;
8250   }
8251
8252   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
8253   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
8254      vassert(sz == 4);
8255      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
8256      goto decode_success;
8257   }
8258
8259   /* 0F 55 = ANDNPS -- G = (not G) and E */
8260   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
8261      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
8262      goto decode_success;
8263   }
8264
8265   /* 0F 54 = ANDPS -- G = G and E */
8266   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
8267      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
8268      goto decode_success;
8269   }
8270
8271   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
8272   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
8273      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
8274      goto decode_success;
8275   }
8276
8277   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
8278   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
8279      vassert(sz == 4);
8280      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
8281      goto decode_success;
8282   }
8283
8284   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
8285   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
8286   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
8287      IRTemp argL = newTemp(Ity_F32);
8288      IRTemp argR = newTemp(Ity_F32);
8289      modrm = getIByte(delta+2);
8290      if (epartIsReg(modrm)) {
8291         assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
8292         delta += 2+1;
8293         DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8294                                  nameXMMReg(gregOfRM(modrm)) );
8295      } else {
8296         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8297	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
8298         delta += 2+alen;
8299         DIP("[u]comiss %s,%s\n", dis_buf,
8300                                  nameXMMReg(gregOfRM(modrm)) );
8301      }
8302      assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
8303
8304      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
8305      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
8306      stmt( IRStmt_Put(
8307               OFFB_CC_DEP1,
8308               binop( Iop_And32,
8309                      binop(Iop_CmpF64,
8310                            unop(Iop_F32toF64,mkexpr(argL)),
8311                            unop(Iop_F32toF64,mkexpr(argR))),
8312                      mkU32(0x45)
8313          )));
8314      /* Set NDEP even though it isn't used.  This makes redundant-PUT
8315         elimination of previous stores to this field work better. */
8316      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
8317      goto decode_success;
8318   }
8319
8320   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
8321      half xmm */
8322   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
8323      IRTemp arg64 = newTemp(Ity_I64);
8324      IRTemp rmode = newTemp(Ity_I32);
8325      vassert(sz == 4);
8326
8327      modrm = getIByte(delta+2);
8328      do_MMX_preamble();
8329      if (epartIsReg(modrm)) {
8330         assign( arg64, getMMXReg(eregOfRM(modrm)) );
8331         delta += 2+1;
8332         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8333                                 nameXMMReg(gregOfRM(modrm)));
8334      } else {
8335         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8336	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
8337         delta += 2+alen;
8338         DIP("cvtpi2ps %s,%s\n", dis_buf,
8339                                 nameXMMReg(gregOfRM(modrm)) );
8340      }
8341
8342      assign( rmode, get_sse_roundingmode() );
8343
8344      putXMMRegLane32F(
8345         gregOfRM(modrm), 0,
8346         binop(Iop_F64toF32,
8347               mkexpr(rmode),
8348               unop(Iop_I32StoF64,
8349                    unop(Iop_64to32, mkexpr(arg64)) )) );
8350
8351      putXMMRegLane32F(
8352         gregOfRM(modrm), 1,
8353         binop(Iop_F64toF32,
8354               mkexpr(rmode),
8355               unop(Iop_I32StoF64,
8356                    unop(Iop_64HIto32, mkexpr(arg64)) )) );
8357
8358      goto decode_success;
8359   }
8360
8361   /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
8362      quarter xmm */
8363   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
8364      IRTemp arg32 = newTemp(Ity_I32);
8365      IRTemp rmode = newTemp(Ity_I32);
8366      vassert(sz == 4);
8367
8368      modrm = getIByte(delta+3);
8369      if (epartIsReg(modrm)) {
8370         assign( arg32, getIReg(4, eregOfRM(modrm)) );
8371         delta += 3+1;
8372         DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
8373                                 nameXMMReg(gregOfRM(modrm)));
8374      } else {
8375         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8376	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
8377         delta += 3+alen;
8378         DIP("cvtsi2ss %s,%s\n", dis_buf,
8379                                 nameXMMReg(gregOfRM(modrm)) );
8380      }
8381
8382      assign( rmode, get_sse_roundingmode() );
8383
8384      putXMMRegLane32F(
8385         gregOfRM(modrm), 0,
8386         binop(Iop_F64toF32,
8387               mkexpr(rmode),
8388               unop(Iop_I32StoF64, mkexpr(arg32)) ) );
8389
8390      goto decode_success;
8391   }
8392
8393   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8394      I32 in mmx, according to prevailing SSE rounding mode */
8395   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8396      I32 in mmx, rounding towards zero */
8397   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
8398      IRTemp dst64  = newTemp(Ity_I64);
8399      IRTemp rmode  = newTemp(Ity_I32);
8400      IRTemp f32lo  = newTemp(Ity_F32);
8401      IRTemp f32hi  = newTemp(Ity_F32);
8402      Bool   r2zero = toBool(insn[1] == 0x2C);
8403
8404      do_MMX_preamble();
8405      modrm = getIByte(delta+2);
8406
8407      if (epartIsReg(modrm)) {
8408         delta += 2+1;
8409	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8410	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
8411         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8412                                   nameXMMReg(eregOfRM(modrm)),
8413                                   nameMMXReg(gregOfRM(modrm)));
8414      } else {
8415         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8416	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8417	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
8418                                              mkexpr(addr),
8419                                              mkU32(4) )));
8420         delta += 2+alen;
8421         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8422                                   dis_buf,
8423                                   nameMMXReg(gregOfRM(modrm)));
8424      }
8425
8426      if (r2zero) {
8427         assign(rmode, mkU32((UInt)Irrm_ZERO) );
8428      } else {
8429         assign( rmode, get_sse_roundingmode() );
8430      }
8431
8432      assign(
8433         dst64,
8434         binop( Iop_32HLto64,
8435                binop( Iop_F64toI32S,
8436                       mkexpr(rmode),
8437                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
8438                binop( Iop_F64toI32S,
8439                       mkexpr(rmode),
8440                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
8441              )
8442      );
8443
8444      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
8445      goto decode_success;
8446   }
8447
8448   /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
8449      I32 in ireg, according to prevailing SSE rounding mode */
8450   /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
8451      I32 in ireg, rounding towards zero */
8452   if (insn[0] == 0xF3 && insn[1] == 0x0F
8453       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
8454      IRTemp rmode = newTemp(Ity_I32);
8455      IRTemp f32lo = newTemp(Ity_F32);
8456      Bool   r2zero = toBool(insn[2] == 0x2C);
8457      vassert(sz == 4);
8458
8459      modrm = getIByte(delta+3);
8460      if (epartIsReg(modrm)) {
8461         delta += 3+1;
8462	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8463         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8464                                   nameXMMReg(eregOfRM(modrm)),
8465                                   nameIReg(4, gregOfRM(modrm)));
8466      } else {
8467         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8468	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8469         delta += 3+alen;
8470         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8471                                   dis_buf,
8472                                   nameIReg(4, gregOfRM(modrm)));
8473      }
8474
8475      if (r2zero) {
8476         assign( rmode, mkU32((UInt)Irrm_ZERO) );
8477      } else {
8478         assign( rmode, get_sse_roundingmode() );
8479      }
8480
8481      putIReg(4, gregOfRM(modrm),
8482                 binop( Iop_F64toI32S,
8483                        mkexpr(rmode),
8484                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
8485      );
8486
8487      goto decode_success;
8488   }
8489
8490   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
8491   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
8492      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
8493      goto decode_success;
8494   }
8495
8496   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
8497   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
8498      vassert(sz == 4);
8499      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
8500      goto decode_success;
8501   }
8502
8503   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
8504   if (insn[0] == 0x0F && insn[1] == 0xAE
8505       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
8506
8507      IRTemp t64 = newTemp(Ity_I64);
8508      IRTemp ew = newTemp(Ity_I32);
8509
8510      modrm = getIByte(delta+2);
8511      vassert(!epartIsReg(modrm));
8512      vassert(sz == 4);
8513
8514      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8515      delta += 2+alen;
8516      DIP("ldmxcsr %s\n", dis_buf);
8517
8518      /* The only thing we observe in %mxcsr is the rounding mode.
8519         Therefore, pass the 32-bit value (SSE native-format control
8520         word) to a clean helper, getting back a 64-bit value, the
8521         lower half of which is the SSEROUND value to store, and the
8522         upper half of which is the emulation-warning token which may
8523         be generated.
8524      */
8525      /* ULong x86h_check_ldmxcsr ( UInt ); */
8526      assign( t64, mkIRExprCCall(
8527                      Ity_I64, 0/*regparms*/,
8528                      "x86g_check_ldmxcsr",
8529                      &x86g_check_ldmxcsr,
8530                      mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
8531                   )
8532            );
8533
8534      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
8535      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
8536      put_emwarn( mkexpr(ew) );
8537      /* Finally, if an emulation warning was reported, side-exit to
8538         the next insn, reporting the warning, so that Valgrind's
8539         dispatcher sees the warning. */
8540      stmt(
8541         IRStmt_Exit(
8542            binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
8543            Ijk_EmWarn,
8544            IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
8545         )
8546      );
8547      goto decode_success;
8548   }
8549
8550   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8551   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
8552   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
8553      Bool ok = False;
8554      delta = dis_MMX( &ok, sorb, sz, delta+1 );
8555      if (!ok)
8556         goto decode_failure;
8557      goto decode_success;
8558   }
8559
8560   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
8561   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
8562      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
8563      goto decode_success;
8564   }
8565
8566   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
8567   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
8568      vassert(sz == 4);
8569      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
8570      goto decode_success;
8571   }
8572
8573   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
8574   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
8575      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
8576      goto decode_success;
8577   }
8578
8579   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
8580   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
8581      vassert(sz == 4);
8582      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
8583      goto decode_success;
8584   }
8585
8586   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
8587   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
8588   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
8589      modrm = getIByte(delta+2);
8590      if (epartIsReg(modrm)) {
8591         putXMMReg( gregOfRM(modrm),
8592                    getXMMReg( eregOfRM(modrm) ));
8593         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8594                                  nameXMMReg(gregOfRM(modrm)));
8595         delta += 2+1;
8596      } else {
8597         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8598         if (insn[1] == 0x28/*movaps*/)
8599            gen_SEGV_if_not_16_aligned( addr );
8600         putXMMReg( gregOfRM(modrm),
8601                    loadLE(Ity_V128, mkexpr(addr)) );
8602         DIP("mov[ua]ps %s,%s\n", dis_buf,
8603                                  nameXMMReg(gregOfRM(modrm)));
8604         delta += 2+alen;
8605      }
8606      goto decode_success;
8607   }
8608
8609   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
8610   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
8611   if (sz == 4 && insn[0] == 0x0F
8612       && (insn[1] == 0x29 || insn[1] == 0x11)) {
8613      modrm = getIByte(delta+2);
8614      if (epartIsReg(modrm)) {
8615         /* fall through; awaiting test case */
8616      } else {
8617         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8618         if (insn[1] == 0x29/*movaps*/)
8619            gen_SEGV_if_not_16_aligned( addr );
8620         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
8621         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
8622                                  dis_buf );
8623         delta += 2+alen;
8624         goto decode_success;
8625      }
8626   }
8627
8628   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
8629   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
8630   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
8631      modrm = getIByte(delta+2);
8632      if (epartIsReg(modrm)) {
8633         delta += 2+1;
8634         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
8635                          getXMMRegLane64( eregOfRM(modrm), 0 ) );
8636         DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8637                               nameXMMReg(gregOfRM(modrm)));
8638      } else {
8639         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8640         delta += 2+alen;
8641         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
8642                          loadLE(Ity_I64, mkexpr(addr)) );
8643         DIP("movhps %s,%s\n", dis_buf,
8644                               nameXMMReg( gregOfRM(modrm) ));
8645      }
8646      goto decode_success;
8647   }
8648
8649   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
8650   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
8651      if (!epartIsReg(insn[2])) {
8652         delta += 2;
8653         addr = disAMode ( &alen, sorb, delta, dis_buf );
8654         delta += alen;
8655         storeLE( mkexpr(addr),
8656                  getXMMRegLane64( gregOfRM(insn[2]),
8657                                   1/*upper lane*/ ) );
8658         DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
8659                               dis_buf);
8660         goto decode_success;
8661      }
8662      /* else fall through */
8663   }
8664
8665   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
8666   /* OF 12 = MOVHLPS -- from from hi half to lo half of XMM. */
8667   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
8668      modrm = getIByte(delta+2);
8669      if (epartIsReg(modrm)) {
8670         delta += 2+1;
8671         putXMMRegLane64( gregOfRM(modrm),
8672                          0/*lower lane*/,
8673                          getXMMRegLane64( eregOfRM(modrm), 1 ));
8674         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
8675                                 nameXMMReg(gregOfRM(modrm)));
8676      } else {
8677         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8678         delta += 2+alen;
8679         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
8680                          loadLE(Ity_I64, mkexpr(addr)) );
8681         DIP("movlps %s, %s\n",
8682             dis_buf, nameXMMReg( gregOfRM(modrm) ));
8683      }
8684      goto decode_success;
8685   }
8686
8687   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
8688   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
8689      if (!epartIsReg(insn[2])) {
8690         delta += 2;
8691         addr = disAMode ( &alen, sorb, delta, dis_buf );
8692         delta += alen;
8693         storeLE( mkexpr(addr),
8694                  getXMMRegLane64( gregOfRM(insn[2]),
8695                                   0/*lower lane*/ ) );
8696         DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
8697                                dis_buf);
8698         goto decode_success;
8699      }
8700      /* else fall through */
8701   }
8702
8703   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
8704      to 4 lowest bits of ireg(G) */
8705   if (insn[0] == 0x0F && insn[1] == 0x50) {
8706      modrm = getIByte(delta+2);
8707      if (sz == 4 && epartIsReg(modrm)) {
8708         Int src;
8709         t0 = newTemp(Ity_I32);
8710         t1 = newTemp(Ity_I32);
8711         t2 = newTemp(Ity_I32);
8712         t3 = newTemp(Ity_I32);
8713         delta += 2+1;
8714         src = eregOfRM(modrm);
8715         assign( t0, binop( Iop_And32,
8716                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
8717                            mkU32(1) ));
8718         assign( t1, binop( Iop_And32,
8719                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
8720                            mkU32(2) ));
8721         assign( t2, binop( Iop_And32,
8722                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
8723                            mkU32(4) ));
8724         assign( t3, binop( Iop_And32,
8725                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
8726                            mkU32(8) ));
8727         putIReg(4, gregOfRM(modrm),
8728                    binop(Iop_Or32,
8729                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
8730                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
8731                         )
8732                 );
8733         DIP("movmskps %s,%s\n", nameXMMReg(src),
8734                                 nameIReg(4, gregOfRM(modrm)));
8735         goto decode_success;
8736      }
8737      /* else fall through */
8738   }
8739
8740   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
8741   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
8742   if (insn[0] == 0x0F && insn[1] == 0x2B) {
8743      modrm = getIByte(delta+2);
8744      if (!epartIsReg(modrm)) {
8745         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8746         gen_SEGV_if_not_16_aligned( addr );
8747         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
8748         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
8749                                 dis_buf,
8750                                 nameXMMReg(gregOfRM(modrm)));
8751         delta += 2+alen;
8752         goto decode_success;
8753      }
8754      /* else fall through */
8755   }
8756
8757   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8758   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
8759      Intel manual does not say anything about the usual business of
8760      the FP reg tags getting trashed whenever an MMX insn happens.
8761      So we just leave them alone.
8762   */
8763   if (insn[0] == 0x0F && insn[1] == 0xE7) {
8764      modrm = getIByte(delta+2);
8765      if (sz == 4 && !epartIsReg(modrm)) {
8766         /* do_MMX_preamble(); Intel docs don't specify this */
8767         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8768         storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
8769         DIP("movntq %s,%s\n", dis_buf,
8770                               nameMMXReg(gregOfRM(modrm)));
8771         delta += 2+alen;
8772         goto decode_success;
8773      }
8774      /* else fall through */
8775   }
8776
8777   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
8778      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
8779   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
8780      vassert(sz == 4);
8781      modrm = getIByte(delta+3);
8782      if (epartIsReg(modrm)) {
8783         putXMMRegLane32( gregOfRM(modrm), 0,
8784                          getXMMRegLane32( eregOfRM(modrm), 0 ));
8785         DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8786                              nameXMMReg(gregOfRM(modrm)));
8787         delta += 3+1;
8788      } else {
8789         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8790         /* zero bits 127:64 */
8791         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
8792         /* zero bits 63:32 */
8793         putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
8794         /* write bits 31:0 */
8795         putXMMRegLane32( gregOfRM(modrm), 0,
8796                          loadLE(Ity_I32, mkexpr(addr)) );
8797         DIP("movss %s,%s\n", dis_buf,
8798                              nameXMMReg(gregOfRM(modrm)));
8799         delta += 3+alen;
8800      }
8801      goto decode_success;
8802   }
8803
8804   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
8805      or lo 1/4 xmm). */
8806   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
8807      vassert(sz == 4);
8808      modrm = getIByte(delta+3);
8809      if (epartIsReg(modrm)) {
8810         /* fall through, we don't yet have a test case */
8811      } else {
8812         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8813         storeLE( mkexpr(addr),
8814                  getXMMRegLane32(gregOfRM(modrm), 0) );
8815         DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
8816                              dis_buf);
8817         delta += 3+alen;
8818         goto decode_success;
8819      }
8820   }
8821
8822   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
8823   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
8824      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
8825      goto decode_success;
8826   }
8827
8828   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
8829   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
8830      vassert(sz == 4);
8831      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
8832      goto decode_success;
8833   }
8834
8835   /* 0F 56 = ORPS -- G = G and E */
8836   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
8837      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
8838      goto decode_success;
8839   }
8840
8841   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8842   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
8843   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
8844      do_MMX_preamble();
8845      delta = dis_MMXop_regmem_to_reg (
8846                sorb, delta+2, insn[1], "pavgb", False );
8847      goto decode_success;
8848   }
8849
8850   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8851   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
8852   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
8853      do_MMX_preamble();
8854      delta = dis_MMXop_regmem_to_reg (
8855                sorb, delta+2, insn[1], "pavgw", False );
8856      goto decode_success;
8857   }
8858
8859   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8860   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
8861      zero-extend of it in ireg(G). */
8862   if (insn[0] == 0x0F && insn[1] == 0xC5) {
8863      modrm = insn[2];
8864      if (sz == 4 && epartIsReg(modrm)) {
8865         IRTemp sV = newTemp(Ity_I64);
8866         t5 = newTemp(Ity_I16);
8867         do_MMX_preamble();
8868         assign(sV, getMMXReg(eregOfRM(modrm)));
8869         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
8870         switch (insn[3] & 3) {
8871            case 0:  assign(t5, mkexpr(t0)); break;
8872            case 1:  assign(t5, mkexpr(t1)); break;
8873            case 2:  assign(t5, mkexpr(t2)); break;
8874            case 3:  assign(t5, mkexpr(t3)); break;
8875            default: vassert(0); /*NOTREACHED*/
8876         }
8877         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
8878         DIP("pextrw $%d,%s,%s\n",
8879             (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
8880                           nameIReg(4,gregOfRM(modrm)));
8881         delta += 4;
8882         goto decode_success;
8883      }
8884      /* else fall through */
8885   }
8886
8887   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8888   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
8889      put it into the specified lane of mmx(G). */
8890   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
8891      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
8892         mmx reg.  t4 is the new lane value.  t5 is the original
8893         mmx value. t6 is the new mmx value. */
8894      Int lane;
8895      t4 = newTemp(Ity_I16);
8896      t5 = newTemp(Ity_I64);
8897      t6 = newTemp(Ity_I64);
8898      modrm = insn[2];
8899      do_MMX_preamble();
8900
8901      assign(t5, getMMXReg(gregOfRM(modrm)));
8902      breakup64to16s( t5, &t3, &t2, &t1, &t0 );
8903
8904      if (epartIsReg(modrm)) {
8905         assign(t4, getIReg(2, eregOfRM(modrm)));
8906         delta += 3+1;
8907         lane = insn[3+1-1];
8908         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
8909                                   nameIReg(2,eregOfRM(modrm)),
8910                                   nameMMXReg(gregOfRM(modrm)));
8911      } else {
8912         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8913         delta += 3+alen;
8914         lane = insn[3+alen-1];
8915         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
8916         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
8917                                   dis_buf,
8918                                   nameMMXReg(gregOfRM(modrm)));
8919      }
8920
8921      switch (lane & 3) {
8922         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
8923         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
8924         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
8925         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
8926         default: vassert(0); /*NOTREACHED*/
8927      }
8928      putMMXReg(gregOfRM(modrm), mkexpr(t6));
8929      goto decode_success;
8930   }
8931
8932   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8933   /* 0F EE = PMAXSW -- 16x4 signed max */
8934   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
8935      do_MMX_preamble();
8936      delta = dis_MMXop_regmem_to_reg (
8937                sorb, delta+2, insn[1], "pmaxsw", False );
8938      goto decode_success;
8939   }
8940
8941   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8942   /* 0F DE = PMAXUB -- 8x8 unsigned max */
8943   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
8944      do_MMX_preamble();
8945      delta = dis_MMXop_regmem_to_reg (
8946                sorb, delta+2, insn[1], "pmaxub", False );
8947      goto decode_success;
8948   }
8949
8950   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8951   /* 0F EA = PMINSW -- 16x4 signed min */
8952   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
8953      do_MMX_preamble();
8954      delta = dis_MMXop_regmem_to_reg (
8955                sorb, delta+2, insn[1], "pminsw", False );
8956      goto decode_success;
8957   }
8958
8959   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8960   /* 0F DA = PMINUB -- 8x8 unsigned min */
8961   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
8962      do_MMX_preamble();
8963      delta = dis_MMXop_regmem_to_reg (
8964                sorb, delta+2, insn[1], "pminub", False );
8965      goto decode_success;
8966   }
8967
8968   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8969   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
8970      mmx(G), turn them into a byte, and put zero-extend of it in
8971      ireg(G). */
8972   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
8973      modrm = insn[2];
8974      if (epartIsReg(modrm)) {
8975         do_MMX_preamble();
8976         t0 = newTemp(Ity_I64);
8977         t1 = newTemp(Ity_I32);
8978         assign(t0, getMMXReg(eregOfRM(modrm)));
8979         assign(t1, mkIRExprCCall(
8980                       Ity_I32, 0/*regparms*/,
8981                       "x86g_calculate_mmx_pmovmskb",
8982                       &x86g_calculate_mmx_pmovmskb,
8983                       mkIRExprVec_1(mkexpr(t0))));
8984         putIReg(4, gregOfRM(modrm), mkexpr(t1));
8985         DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8986                                 nameIReg(4,gregOfRM(modrm)));
8987         delta += 3;
8988         goto decode_success;
8989      }
8990      /* else fall through */
8991   }
8992
8993   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8994   /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
8995   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
8996      do_MMX_preamble();
8997      delta = dis_MMXop_regmem_to_reg (
8998                sorb, delta+2, insn[1], "pmuluh", False );
8999      goto decode_success;
9000   }
9001
9002   /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
9003   /* 0F 18 /1 = PREFETCH0   -- with various different hints */
9004   /* 0F 18 /2 = PREFETCH1 */
9005   /* 0F 18 /3 = PREFETCH2 */
9006   if (insn[0] == 0x0F && insn[1] == 0x18
9007       && !epartIsReg(insn[2])
9008       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
9009      HChar* hintstr = "??";
9010
9011      modrm = getIByte(delta+2);
9012      vassert(!epartIsReg(modrm));
9013
9014      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9015      delta += 2+alen;
9016
9017      switch (gregOfRM(modrm)) {
9018         case 0: hintstr = "nta"; break;
9019         case 1: hintstr = "t0"; break;
9020         case 2: hintstr = "t1"; break;
9021         case 3: hintstr = "t2"; break;
9022         default: vassert(0); /*NOTREACHED*/
9023      }
9024
9025      DIP("prefetch%s %s\n", hintstr, dis_buf);
9026      goto decode_success;
9027   }
9028
9029   /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
9030   /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
9031   if (insn[0] == 0x0F && insn[1] == 0x0D
9032       && !epartIsReg(insn[2])
9033       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
9034      HChar* hintstr = "??";
9035
9036      modrm = getIByte(delta+2);
9037      vassert(!epartIsReg(modrm));
9038
9039      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9040      delta += 2+alen;
9041
9042      switch (gregOfRM(modrm)) {
9043         case 0: hintstr = ""; break;
9044         case 1: hintstr = "w"; break;
9045         default: vassert(0); /*NOTREACHED*/
9046      }
9047
9048      DIP("prefetch%s %s\n", hintstr, dis_buf);
9049      goto decode_success;
9050   }
9051
9052   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9053   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
9054   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
9055      do_MMX_preamble();
9056      delta = dis_MMXop_regmem_to_reg (
9057                 sorb, delta+2, insn[1], "psadbw", False );
9058      goto decode_success;
9059   }
9060
9061   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9062   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
9063   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
9064      Int order;
9065      IRTemp sV, dV, s3, s2, s1, s0;
9066      s3 = s2 = s1 = s0 = IRTemp_INVALID;
9067      sV = newTemp(Ity_I64);
9068      dV = newTemp(Ity_I64);
9069      do_MMX_preamble();
9070      modrm = insn[2];
9071      if (epartIsReg(modrm)) {
9072         assign( sV, getMMXReg(eregOfRM(modrm)) );
9073         order = (Int)insn[3];
9074         delta += 2+2;
9075         DIP("pshufw $%d,%s,%s\n", order,
9076                                   nameMMXReg(eregOfRM(modrm)),
9077                                   nameMMXReg(gregOfRM(modrm)));
9078      } else {
9079         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9080         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
9081	 order = (Int)insn[2+alen];
9082         delta += 3+alen;
9083         DIP("pshufw $%d,%s,%s\n", order,
9084                                   dis_buf,
9085                                   nameMMXReg(gregOfRM(modrm)));
9086      }
9087      breakup64to16s( sV, &s3, &s2, &s1, &s0 );
9088
9089#     define SEL(n) \
9090                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9091      assign(dV,
9092	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
9093                          SEL((order>>2)&3), SEL((order>>0)&3) )
9094      );
9095      putMMXReg(gregOfRM(modrm), mkexpr(dV));
9096#     undef SEL
9097      goto decode_success;
9098   }
9099
9100   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
9101   if (insn[0] == 0x0F && insn[1] == 0x53) {
9102      vassert(sz == 4);
9103      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9104                                        "rcpps", Iop_Recip32Fx4 );
9105      goto decode_success;
9106   }
9107
9108   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
9109   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
9110      vassert(sz == 4);
9111      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9112                                         "rcpss", Iop_Recip32F0x4 );
9113      goto decode_success;
9114   }
9115
9116   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
9117   if (insn[0] == 0x0F && insn[1] == 0x52) {
9118      vassert(sz == 4);
9119      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9120                                        "rsqrtps", Iop_RSqrt32Fx4 );
9121      goto decode_success;
9122   }
9123
9124   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
9125   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
9126      vassert(sz == 4);
9127      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9128                                         "rsqrtss", Iop_RSqrt32F0x4 );
9129      goto decode_success;
9130   }
9131
9132   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
9133   if (insn[0] == 0x0F && insn[1] == 0xAE
9134       && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
9135      vassert(sz == 4);
9136      delta += 3;
9137      /* Insert a memory fence.  It's sometimes important that these
9138         are carried through to the generated code. */
9139      stmt( IRStmt_MBE(Imbe_Fence) );
9140      DIP("sfence\n");
9141      goto decode_success;
9142   }
9143
9144   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
9145   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
9146      Int    select;
9147      IRTemp sV, dV;
9148      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9149      sV = newTemp(Ity_V128);
9150      dV = newTemp(Ity_V128);
9151      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9152      modrm = insn[2];
9153      assign( dV, getXMMReg(gregOfRM(modrm)) );
9154
9155      if (epartIsReg(modrm)) {
9156         assign( sV, getXMMReg(eregOfRM(modrm)) );
9157         select = (Int)insn[3];
9158         delta += 2+2;
9159         DIP("shufps $%d,%s,%s\n", select,
9160                                   nameXMMReg(eregOfRM(modrm)),
9161                                   nameXMMReg(gregOfRM(modrm)));
9162      } else {
9163         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9164         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9165         select = (Int)insn[2+alen];
9166         delta += 3+alen;
9167         DIP("shufps $%d,%s,%s\n", select,
9168                                   dis_buf,
9169                                   nameXMMReg(gregOfRM(modrm)));
9170      }
9171
9172      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9173      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9174
9175#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
9176#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9177
9178      putXMMReg(
9179         gregOfRM(modrm),
9180         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
9181                       SELD((select>>2)&3), SELD((select>>0)&3) )
9182      );
9183
9184#     undef SELD
9185#     undef SELS
9186
9187      goto decode_success;
9188   }
9189
9190   /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
9191   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
9192      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9193                                        "sqrtps", Iop_Sqrt32Fx4 );
9194      goto decode_success;
9195   }
9196
9197   /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
9198   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
9199      vassert(sz == 4);
9200      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9201                                         "sqrtss", Iop_Sqrt32F0x4 );
9202      goto decode_success;
9203   }
9204
9205   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
9206   if (insn[0] == 0x0F && insn[1] == 0xAE
9207       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
9208      modrm = getIByte(delta+2);
9209      vassert(sz == 4);
9210      vassert(!epartIsReg(modrm));
9211
9212      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9213      delta += 2+alen;
9214
9215      /* Fake up a native SSE mxcsr word.  The only thing it depends
9216         on is SSEROUND[1:0], so call a clean helper to cook it up.
9217      */
9218      /* UInt x86h_create_mxcsr ( UInt sseround ) */
9219      DIP("stmxcsr %s\n", dis_buf);
9220      storeLE( mkexpr(addr),
9221               mkIRExprCCall(
9222                  Ity_I32, 0/*regp*/,
9223                  "x86g_create_mxcsr", &x86g_create_mxcsr,
9224                  mkIRExprVec_1( get_sse_roundingmode() )
9225               )
9226             );
9227      goto decode_success;
9228   }
9229
9230   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
9231   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
9232      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
9233      goto decode_success;
9234   }
9235
9236   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
9237   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
9238      vassert(sz == 4);
9239      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
9240      goto decode_success;
9241   }
9242
9243   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
9244   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
9245   /* These just appear to be special cases of SHUFPS */
9246   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
9247      IRTemp sV, dV;
9248      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9249      Bool hi = toBool(insn[1] == 0x15);
9250      sV = newTemp(Ity_V128);
9251      dV = newTemp(Ity_V128);
9252      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9253      modrm = insn[2];
9254      assign( dV, getXMMReg(gregOfRM(modrm)) );
9255
9256      if (epartIsReg(modrm)) {
9257         assign( sV, getXMMReg(eregOfRM(modrm)) );
9258         delta += 2+1;
9259         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9260                                  nameXMMReg(eregOfRM(modrm)),
9261                                  nameXMMReg(gregOfRM(modrm)));
9262      } else {
9263         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9264         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9265         delta += 2+alen;
9266         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9267                                  dis_buf,
9268                                  nameXMMReg(gregOfRM(modrm)));
9269      }
9270
9271      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9272      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9273
9274      if (hi) {
9275         putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
9276      } else {
9277         putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
9278      }
9279
9280      goto decode_success;
9281   }
9282
9283   /* 0F 57 = XORPS -- G = G and E */
9284   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
9285      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
9286      goto decode_success;
9287   }
9288
9289   /* ---------------------------------------------------- */
9290   /* --- end of the SSE decoder.                      --- */
9291   /* ---------------------------------------------------- */
9292
9293   /* ---------------------------------------------------- */
9294   /* --- start of the SSE2 decoder.                   --- */
9295   /* ---------------------------------------------------- */
9296
9297   /* Skip parts of the decoder which don't apply given the stated
9298      guest subarchitecture. */
9299   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
9300      goto after_sse_decoders; /* no SSE2 capabilities */
9301
9302   insn = (UChar*)&guest_code[delta];
9303
9304   /* 66 0F 58 = ADDPD -- add 32Fx4 from R/M to R */
9305   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
9306      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
9307      goto decode_success;
9308   }
9309
9310   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
9311   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
9312      vassert(sz == 4);
9313      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
9314      goto decode_success;
9315   }
9316
9317   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
9318   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
9319      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
9320      goto decode_success;
9321   }
9322
9323   /* 66 0F 54 = ANDPD -- G = G and E */
9324   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
9325      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
9326      goto decode_success;
9327   }
9328
9329   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
9330   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
9331      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
9332      goto decode_success;
9333   }
9334
9335   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
9336   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
9337      vassert(sz == 4);
9338      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
9339      goto decode_success;
9340   }
9341
9342   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
9343   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
9344   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
9345      IRTemp argL = newTemp(Ity_F64);
9346      IRTemp argR = newTemp(Ity_F64);
9347      modrm = getIByte(delta+2);
9348      if (epartIsReg(modrm)) {
9349         assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
9350         delta += 2+1;
9351         DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9352                                  nameXMMReg(gregOfRM(modrm)) );
9353      } else {
9354         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9355	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
9356         delta += 2+alen;
9357         DIP("[u]comisd %s,%s\n", dis_buf,
9358                                  nameXMMReg(gregOfRM(modrm)) );
9359      }
9360      assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
9361
9362      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
9363      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
9364      stmt( IRStmt_Put(
9365               OFFB_CC_DEP1,
9366               binop( Iop_And32,
9367                      binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
9368                      mkU32(0x45)
9369          )));
9370      /* Set NDEP even though it isn't used.  This makes redundant-PUT
9371         elimination of previous stores to this field work better. */
9372      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
9373      goto decode_success;
9374   }
9375
9376   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
9377      F64 in xmm(G) */
9378   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
9379      IRTemp arg64 = newTemp(Ity_I64);
9380      vassert(sz == 4);
9381
9382      modrm = getIByte(delta+3);
9383      if (epartIsReg(modrm)) {
9384         assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
9385         delta += 3+1;
9386         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9387                                 nameXMMReg(gregOfRM(modrm)));
9388      } else {
9389         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9390	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9391         delta += 3+alen;
9392         DIP("cvtdq2pd %s,%s\n", dis_buf,
9393                                 nameXMMReg(gregOfRM(modrm)) );
9394      }
9395
9396      putXMMRegLane64F(
9397         gregOfRM(modrm), 0,
9398         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
9399      );
9400
9401      putXMMRegLane64F(
9402         gregOfRM(modrm), 1,
9403         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
9404      );
9405
9406      goto decode_success;
9407   }
9408
9409   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
9410      xmm(G) */
9411   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
9412      IRTemp argV  = newTemp(Ity_V128);
9413      IRTemp rmode = newTemp(Ity_I32);
9414
9415      modrm = getIByte(delta+2);
9416      if (epartIsReg(modrm)) {
9417         assign( argV, getXMMReg(eregOfRM(modrm)) );
9418         delta += 2+1;
9419         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9420                                 nameXMMReg(gregOfRM(modrm)));
9421      } else {
9422         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9423	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9424         delta += 2+alen;
9425         DIP("cvtdq2ps %s,%s\n", dis_buf,
9426                                 nameXMMReg(gregOfRM(modrm)) );
9427      }
9428
9429      assign( rmode, get_sse_roundingmode() );
9430      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9431
9432#     define CVT(_t)  binop( Iop_F64toF32,                    \
9433                             mkexpr(rmode),                   \
9434                             unop(Iop_I32StoF64,mkexpr(_t)))
9435
9436      putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
9437      putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
9438      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9439      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9440
9441#     undef CVT
9442
9443      goto decode_success;
9444   }
9445
9446   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
9447      lo half xmm(G), and zero upper half */
9448   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
9449      IRTemp argV  = newTemp(Ity_V128);
9450      IRTemp rmode = newTemp(Ity_I32);
9451      vassert(sz == 4);
9452
9453      modrm = getIByte(delta+3);
9454      if (epartIsReg(modrm)) {
9455         assign( argV, getXMMReg(eregOfRM(modrm)) );
9456         delta += 3+1;
9457         DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9458                                 nameXMMReg(gregOfRM(modrm)));
9459      } else {
9460         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9461	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9462         delta += 3+alen;
9463         DIP("cvtpd2dq %s,%s\n", dis_buf,
9464                                 nameXMMReg(gregOfRM(modrm)) );
9465      }
9466
9467      assign( rmode, get_sse_roundingmode() );
9468      t0 = newTemp(Ity_F64);
9469      t1 = newTemp(Ity_F64);
9470      assign( t0, unop(Iop_ReinterpI64asF64,
9471                       unop(Iop_V128to64, mkexpr(argV))) );
9472      assign( t1, unop(Iop_ReinterpI64asF64,
9473                       unop(Iop_V128HIto64, mkexpr(argV))) );
9474
9475#     define CVT(_t)  binop( Iop_F64toI32S,                   \
9476                             mkexpr(rmode),                   \
9477                             mkexpr(_t) )
9478
9479      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
9480      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
9481      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9482      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9483
9484#     undef CVT
9485
9486      goto decode_success;
9487   }
9488
9489   /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9490      I32 in mmx, according to prevailing SSE rounding mode */
9491   /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9492      I32 in mmx, rounding towards zero */
9493   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
9494      IRTemp dst64  = newTemp(Ity_I64);
9495      IRTemp rmode  = newTemp(Ity_I32);
9496      IRTemp f64lo  = newTemp(Ity_F64);
9497      IRTemp f64hi  = newTemp(Ity_F64);
9498      Bool   r2zero = toBool(insn[1] == 0x2C);
9499
9500      do_MMX_preamble();
9501      modrm = getIByte(delta+2);
9502
9503      if (epartIsReg(modrm)) {
9504         delta += 2+1;
9505	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9506	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
9507         DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
9508                                   nameXMMReg(eregOfRM(modrm)),
9509                                   nameMMXReg(gregOfRM(modrm)));
9510      } else {
9511         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9512	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9513	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
9514                                              mkexpr(addr),
9515                                              mkU32(8) )));
9516         delta += 2+alen;
9517         DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
9518                                   dis_buf,
9519                                   nameMMXReg(gregOfRM(modrm)));
9520      }
9521
9522      if (r2zero) {
9523         assign(rmode, mkU32((UInt)Irrm_ZERO) );
9524      } else {
9525         assign( rmode, get_sse_roundingmode() );
9526      }
9527
9528      assign(
9529         dst64,
9530         binop( Iop_32HLto64,
9531                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
9532                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
9533              )
9534      );
9535
9536      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
9537      goto decode_success;
9538   }
9539
9540   /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
9541      lo half xmm(G), and zero upper half */
9542   /* Note, this is practically identical to CVTPD2DQ.  It would have
9543      been nicer to merge them together, but the insn[] offsets differ
9544      by one. */
9545   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
9546      IRTemp argV  = newTemp(Ity_V128);
9547      IRTemp rmode = newTemp(Ity_I32);
9548
9549      modrm = getIByte(delta+2);
9550      if (epartIsReg(modrm)) {
9551         assign( argV, getXMMReg(eregOfRM(modrm)) );
9552         delta += 2+1;
9553         DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9554                                 nameXMMReg(gregOfRM(modrm)));
9555      } else {
9556         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9557	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9558         delta += 2+alen;
9559         DIP("cvtpd2ps %s,%s\n", dis_buf,
9560                                 nameXMMReg(gregOfRM(modrm)) );
9561      }
9562
9563      assign( rmode, get_sse_roundingmode() );
9564      t0 = newTemp(Ity_F64);
9565      t1 = newTemp(Ity_F64);
9566      assign( t0, unop(Iop_ReinterpI64asF64,
9567                       unop(Iop_V128to64, mkexpr(argV))) );
9568      assign( t1, unop(Iop_ReinterpI64asF64,
9569                       unop(Iop_V128HIto64, mkexpr(argV))) );
9570
9571#     define CVT(_t)  binop( Iop_F64toF32,                    \
9572                             mkexpr(rmode),                   \
9573                             mkexpr(_t) )
9574
9575      putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
9576      putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
9577      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9578      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9579
9580#     undef CVT
9581
9582      goto decode_success;
9583   }
9584
9585   /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
9586      xmm(G) */
9587   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
9588      IRTemp arg64 = newTemp(Ity_I64);
9589
9590      modrm = getIByte(delta+2);
9591      if (epartIsReg(modrm)) {
9592         /* Only switch to MMX mode if the source is a MMX register.
9593            This is inconsistent with all other instructions which
9594            convert between XMM and (M64 or MMX), which always switch
9595            to MMX mode even if 64-bit operand is M64 and not MMX.  At
9596            least, that's what the Intel docs seem to me to say.
9597            Fixes #210264. */
9598         do_MMX_preamble();
9599         assign( arg64, getMMXReg(eregOfRM(modrm)) );
9600         delta += 2+1;
9601         DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
9602                                 nameXMMReg(gregOfRM(modrm)));
9603      } else {
9604         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9605	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9606         delta += 2+alen;
9607         DIP("cvtpi2pd %s,%s\n", dis_buf,
9608                                 nameXMMReg(gregOfRM(modrm)) );
9609      }
9610
9611      putXMMRegLane64F(
9612         gregOfRM(modrm), 0,
9613         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
9614      );
9615
9616      putXMMRegLane64F(
9617         gregOfRM(modrm), 1,
9618         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
9619      );
9620
9621      goto decode_success;
9622   }
9623
9624   /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
9625      xmm(G) */
9626   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
9627      IRTemp argV  = newTemp(Ity_V128);
9628      IRTemp rmode = newTemp(Ity_I32);
9629
9630      modrm = getIByte(delta+2);
9631      if (epartIsReg(modrm)) {
9632         assign( argV, getXMMReg(eregOfRM(modrm)) );
9633         delta += 2+1;
9634         DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9635                                 nameXMMReg(gregOfRM(modrm)));
9636      } else {
9637         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9638	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9639         delta += 2+alen;
9640         DIP("cvtps2dq %s,%s\n", dis_buf,
9641                                 nameXMMReg(gregOfRM(modrm)) );
9642      }
9643
9644      assign( rmode, get_sse_roundingmode() );
9645      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9646
9647      /* This is less than ideal.  If it turns out to be a performance
9648	 bottleneck it can be improved. */
9649#     define CVT(_t)                            \
9650        binop( Iop_F64toI32S,                   \
9651               mkexpr(rmode),                   \
9652               unop( Iop_F32toF64,              \
9653                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9654
9655      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
9656      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
9657      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9658      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9659
9660#     undef CVT
9661
9662      goto decode_success;
9663   }
9664
9665   /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
9666      F64 in xmm(G). */
9667   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
9668      IRTemp f32lo = newTemp(Ity_F32);
9669      IRTemp f32hi = newTemp(Ity_F32);
9670
9671      modrm = getIByte(delta+2);
9672      if (epartIsReg(modrm)) {
9673         assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
9674         assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
9675         delta += 2+1;
9676         DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9677                                 nameXMMReg(gregOfRM(modrm)));
9678      } else {
9679         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9680	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
9681	 assign( f32hi, loadLE(Ity_F32,
9682                               binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
9683         delta += 2+alen;
9684         DIP("cvtps2pd %s,%s\n", dis_buf,
9685                                 nameXMMReg(gregOfRM(modrm)) );
9686      }
9687
9688      putXMMRegLane64F( gregOfRM(modrm), 1,
9689                        unop(Iop_F32toF64, mkexpr(f32hi)) );
9690      putXMMRegLane64F( gregOfRM(modrm), 0,
9691                        unop(Iop_F32toF64, mkexpr(f32lo)) );
9692
9693      goto decode_success;
9694   }
9695
9696   /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
9697      I32 in ireg, according to prevailing SSE rounding mode */
9698   /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
9699      I32 in ireg, rounding towards zero */
9700   if (insn[0] == 0xF2 && insn[1] == 0x0F
9701       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
9702      IRTemp rmode = newTemp(Ity_I32);
9703      IRTemp f64lo = newTemp(Ity_F64);
9704      Bool   r2zero = toBool(insn[2] == 0x2C);
9705      vassert(sz == 4);
9706
9707      modrm = getIByte(delta+3);
9708      if (epartIsReg(modrm)) {
9709         delta += 3+1;
9710	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9711         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9712                                   nameXMMReg(eregOfRM(modrm)),
9713                                   nameIReg(4, gregOfRM(modrm)));
9714      } else {
9715         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9716	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9717         delta += 3+alen;
9718         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9719                                   dis_buf,
9720                                   nameIReg(4, gregOfRM(modrm)));
9721      }
9722
9723      if (r2zero) {
9724         assign( rmode, mkU32((UInt)Irrm_ZERO) );
9725      } else {
9726         assign( rmode, get_sse_roundingmode() );
9727      }
9728
9729      putIReg(4, gregOfRM(modrm),
9730                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
9731
9732      goto decode_success;
9733   }
9734
9735   /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
9736      low 1/4 xmm(G), according to prevailing SSE rounding mode */
9737   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
9738      IRTemp rmode = newTemp(Ity_I32);
9739      IRTemp f64lo = newTemp(Ity_F64);
9740      vassert(sz == 4);
9741
9742      modrm = getIByte(delta+3);
9743      if (epartIsReg(modrm)) {
9744         delta += 3+1;
9745	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9746         DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9747                                 nameXMMReg(gregOfRM(modrm)));
9748      } else {
9749         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9750	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9751         delta += 3+alen;
9752         DIP("cvtsd2ss %s,%s\n", dis_buf,
9753                                 nameXMMReg(gregOfRM(modrm)));
9754      }
9755
9756      assign( rmode, get_sse_roundingmode() );
9757      putXMMRegLane32F(
9758         gregOfRM(modrm), 0,
9759         binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
9760      );
9761
9762      goto decode_success;
9763   }
9764
9765   /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
9766      half xmm */
9767   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
9768      IRTemp arg32 = newTemp(Ity_I32);
9769      vassert(sz == 4);
9770
9771      modrm = getIByte(delta+3);
9772      if (epartIsReg(modrm)) {
9773         assign( arg32, getIReg(4, eregOfRM(modrm)) );
9774         delta += 3+1;
9775         DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
9776                                 nameXMMReg(gregOfRM(modrm)));
9777      } else {
9778         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9779	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
9780         delta += 3+alen;
9781         DIP("cvtsi2sd %s,%s\n", dis_buf,
9782                                 nameXMMReg(gregOfRM(modrm)) );
9783      }
9784
9785      putXMMRegLane64F(
9786         gregOfRM(modrm), 0,
9787         unop(Iop_I32StoF64, mkexpr(arg32)) );
9788
9789      goto decode_success;
9790   }
9791
9792   /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
9793      low half xmm(G) */
9794   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
9795      IRTemp f32lo = newTemp(Ity_F32);
9796      vassert(sz == 4);
9797
9798      modrm = getIByte(delta+3);
9799      if (epartIsReg(modrm)) {
9800         delta += 3+1;
9801	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
9802         DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9803                                 nameXMMReg(gregOfRM(modrm)));
9804      } else {
9805         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9806	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
9807         delta += 3+alen;
9808         DIP("cvtss2sd %s,%s\n", dis_buf,
9809                                 nameXMMReg(gregOfRM(modrm)));
9810      }
9811
9812      putXMMRegLane64F( gregOfRM(modrm), 0,
9813                        unop( Iop_F32toF64, mkexpr(f32lo) ) );
9814
9815      goto decode_success;
9816   }
9817
9818   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
9819      lo half xmm(G), and zero upper half, rounding towards zero */
9820   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
9821      IRTemp argV  = newTemp(Ity_V128);
9822      IRTemp rmode = newTemp(Ity_I32);
9823
9824      modrm = getIByte(delta+2);
9825      if (epartIsReg(modrm)) {
9826         assign( argV, getXMMReg(eregOfRM(modrm)) );
9827         delta += 2+1;
9828         DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9829                                  nameXMMReg(gregOfRM(modrm)));
9830      } else {
9831         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9832	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9833         delta += 2+alen;
9834         DIP("cvttpd2dq %s,%s\n", dis_buf,
9835                                  nameXMMReg(gregOfRM(modrm)) );
9836      }
9837
9838      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9839
9840      t0 = newTemp(Ity_F64);
9841      t1 = newTemp(Ity_F64);
9842      assign( t0, unop(Iop_ReinterpI64asF64,
9843                       unop(Iop_V128to64, mkexpr(argV))) );
9844      assign( t1, unop(Iop_ReinterpI64asF64,
9845                       unop(Iop_V128HIto64, mkexpr(argV))) );
9846
9847#     define CVT(_t)  binop( Iop_F64toI32S,                   \
9848                             mkexpr(rmode),                   \
9849                             mkexpr(_t) )
9850
9851      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
9852      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
9853      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9854      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9855
9856#     undef CVT
9857
9858      goto decode_success;
9859   }
9860
9861   /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
9862      xmm(G), rounding towards zero */
9863   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
9864      IRTemp argV  = newTemp(Ity_V128);
9865      IRTemp rmode = newTemp(Ity_I32);
9866      vassert(sz == 4);
9867
9868      modrm = getIByte(delta+3);
9869      if (epartIsReg(modrm)) {
9870         assign( argV, getXMMReg(eregOfRM(modrm)) );
9871         delta += 3+1;
9872         DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9873                                  nameXMMReg(gregOfRM(modrm)));
9874      } else {
9875         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9876	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9877         delta += 3+alen;
9878         DIP("cvttps2dq %s,%s\n", dis_buf,
9879                                  nameXMMReg(gregOfRM(modrm)) );
9880      }
9881
9882      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9883      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9884
9885      /* This is less than ideal.  If it turns out to be a performance
9886	 bottleneck it can be improved. */
9887#     define CVT(_t)                            \
9888        binop( Iop_F64toI32S,                   \
9889               mkexpr(rmode),                   \
9890               unop( Iop_F32toF64,              \
9891                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9892
9893      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
9894      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
9895      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9896      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9897
9898#     undef CVT
9899
9900      goto decode_success;
9901   }
9902
9903   /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
9904   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
9905      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
9906      goto decode_success;
9907   }
9908
9909   /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
9910   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
9911      vassert(sz == 4);
9912      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
9913      goto decode_success;
9914   }
9915
9916   /* 0F AE /5 = LFENCE -- flush pending operations to memory */
9917   /* 0F AE /6 = MFENCE -- flush pending operations to memory */
9918   if (insn[0] == 0x0F && insn[1] == 0xAE
9919       && epartIsReg(insn[2])
9920       && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
9921      vassert(sz == 4);
9922      delta += 3;
9923      /* Insert a memory fence.  It's sometimes important that these
9924         are carried through to the generated code. */
9925      stmt( IRStmt_MBE(Imbe_Fence) );
9926      DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
9927      goto decode_success;
9928   }
9929
9930   /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
9931   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
9932      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
9933      goto decode_success;
9934   }
9935
9936   /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
9937   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
9938      vassert(sz == 4);
9939      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
9940      goto decode_success;
9941   }
9942
9943   /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
9944   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
9945      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
9946      goto decode_success;
9947   }
9948
9949   /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
9950   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
9951      vassert(sz == 4);
9952      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
9953      goto decode_success;
9954   }
9955
9956   /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
9957   /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
9958   /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
9959   if (sz == 2 && insn[0] == 0x0F
9960       && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
9961      HChar* wot = insn[1]==0x28 ? "apd" :
9962                   insn[1]==0x10 ? "upd" : "dqa";
9963      modrm = getIByte(delta+2);
9964      if (epartIsReg(modrm)) {
9965         putXMMReg( gregOfRM(modrm),
9966                    getXMMReg( eregOfRM(modrm) ));
9967         DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
9968                                   nameXMMReg(gregOfRM(modrm)));
9969         delta += 2+1;
9970      } else {
9971         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9972         if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
9973            gen_SEGV_if_not_16_aligned( addr );
9974         putXMMReg( gregOfRM(modrm),
9975                    loadLE(Ity_V128, mkexpr(addr)) );
9976         DIP("mov%s %s,%s\n", wot, dis_buf,
9977                                   nameXMMReg(gregOfRM(modrm)));
9978         delta += 2+alen;
9979      }
9980      goto decode_success;
9981   }
9982
9983   /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
9984   /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
9985   if (sz == 2 && insn[0] == 0x0F
9986       && (insn[1] == 0x29 || insn[1] == 0x11)) {
9987      HChar* wot = insn[1]==0x29 ? "apd" : "upd";
9988      modrm = getIByte(delta+2);
9989      if (epartIsReg(modrm)) {
9990         /* fall through; awaiting test case */
9991      } else {
9992         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9993         if (insn[1] == 0x29/*movapd*/)
9994            gen_SEGV_if_not_16_aligned( addr );
9995         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
9996         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
9997                                   dis_buf );
9998         delta += 2+alen;
9999         goto decode_success;
10000      }
10001   }
10002
10003   /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
10004   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
10005      modrm = getIByte(delta+2);
10006      if (epartIsReg(modrm)) {
10007         delta += 2+1;
10008         putXMMReg(
10009            gregOfRM(modrm),
10010            unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
10011         );
10012         DIP("movd %s, %s\n",
10013             nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
10014      } else {
10015         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10016         delta += 2+alen;
10017         putXMMReg(
10018            gregOfRM(modrm),
10019            unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
10020         );
10021         DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
10022      }
10023      goto decode_success;
10024   }
10025
10026   /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
10027   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
10028      modrm = getIByte(delta+2);
10029      if (epartIsReg(modrm)) {
10030         delta += 2+1;
10031         putIReg( 4, eregOfRM(modrm),
10032                  getXMMRegLane32(gregOfRM(modrm), 0) );
10033         DIP("movd %s, %s\n",
10034             nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
10035      } else {
10036         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10037         delta += 2+alen;
10038         storeLE( mkexpr(addr),
10039                  getXMMRegLane32(gregOfRM(modrm), 0) );
10040         DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10041      }
10042      goto decode_success;
10043   }
10044
10045   /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
10046   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
10047      modrm = getIByte(delta+2);
10048      if (epartIsReg(modrm)) {
10049         delta += 2+1;
10050         putXMMReg( eregOfRM(modrm),
10051                    getXMMReg(gregOfRM(modrm)) );
10052         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10053                                nameXMMReg(eregOfRM(modrm)));
10054      } else {
10055         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10056         delta += 2+alen;
10057         gen_SEGV_if_not_16_aligned( addr );
10058         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10059         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10060      }
10061      goto decode_success;
10062   }
10063
10064   /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
10065   /* Unfortunately can't simply use the MOVDQA case since the
10066      prefix lengths are different (66 vs F3) */
10067   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
10068      vassert(sz == 4);
10069      modrm = getIByte(delta+3);
10070      if (epartIsReg(modrm)) {
10071         putXMMReg( gregOfRM(modrm),
10072                    getXMMReg( eregOfRM(modrm) ));
10073         DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10074                               nameXMMReg(gregOfRM(modrm)));
10075         delta += 3+1;
10076      } else {
10077         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10078         putXMMReg( gregOfRM(modrm),
10079                    loadLE(Ity_V128, mkexpr(addr)) );
10080         DIP("movdqu %s,%s\n", dis_buf,
10081                               nameXMMReg(gregOfRM(modrm)));
10082         delta += 3+alen;
10083      }
10084      goto decode_success;
10085   }
10086
10087   /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
10088   /* Unfortunately can't simply use the MOVDQA case since the
10089      prefix lengths are different (66 vs F3) */
10090   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
10091      vassert(sz == 4);
10092      modrm = getIByte(delta+3);
10093      if (epartIsReg(modrm)) {
10094         delta += 3+1;
10095         putXMMReg( eregOfRM(modrm),
10096                    getXMMReg(gregOfRM(modrm)) );
10097         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10098                                nameXMMReg(eregOfRM(modrm)));
10099      } else {
10100         addr = disAMode( &alen, sorb, delta+3, dis_buf );
10101         delta += 3+alen;
10102         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10103         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10104      }
10105      goto decode_success;
10106   }
10107
10108   /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
10109   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
10110      vassert(sz == 4);
10111      modrm = getIByte(delta+3);
10112      if (epartIsReg(modrm)) {
10113         do_MMX_preamble();
10114         putMMXReg( gregOfRM(modrm),
10115                    getXMMRegLane64( eregOfRM(modrm), 0 ));
10116         DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10117                                nameMMXReg(gregOfRM(modrm)));
10118         delta += 3+1;
10119         goto decode_success;
10120      } else {
10121         /* fall through, apparently no mem case for this insn */
10122      }
10123   }
10124
10125   /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
10126   /* These seems identical to MOVHPS.  This instruction encoding is
10127      completely crazy. */
10128   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
10129      modrm = getIByte(delta+2);
10130      if (epartIsReg(modrm)) {
10131         /* fall through; apparently reg-reg is not possible */
10132      } else {
10133         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10134         delta += 2+alen;
10135         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
10136                          loadLE(Ity_I64, mkexpr(addr)) );
10137         DIP("movhpd %s,%s\n", dis_buf,
10138                               nameXMMReg( gregOfRM(modrm) ));
10139         goto decode_success;
10140      }
10141   }
10142
10143   /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
10144   /* Again, this seems identical to MOVHPS. */
10145   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
10146      if (!epartIsReg(insn[2])) {
10147         delta += 2;
10148         addr = disAMode ( &alen, sorb, delta, dis_buf );
10149         delta += alen;
10150         storeLE( mkexpr(addr),
10151                  getXMMRegLane64( gregOfRM(insn[2]),
10152                                   1/*upper lane*/ ) );
10153         DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
10154                               dis_buf);
10155         goto decode_success;
10156      }
10157      /* else fall through */
10158   }
10159
10160   /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
10161   /* Identical to MOVLPS ? */
10162   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
10163      modrm = getIByte(delta+2);
10164      if (epartIsReg(modrm)) {
10165         /* fall through; apparently reg-reg is not possible */
10166      } else {
10167         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10168         delta += 2+alen;
10169         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
10170                          loadLE(Ity_I64, mkexpr(addr)) );
10171         DIP("movlpd %s, %s\n",
10172             dis_buf, nameXMMReg( gregOfRM(modrm) ));
10173         goto decode_success;
10174      }
10175   }
10176
10177   /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
10178   /* Identical to MOVLPS ? */
10179   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
10180      if (!epartIsReg(insn[2])) {
10181         delta += 2;
10182         addr = disAMode ( &alen, sorb, delta, dis_buf );
10183         delta += alen;
10184         storeLE( mkexpr(addr),
10185                  getXMMRegLane64( gregOfRM(insn[2]),
10186                                   0/*lower lane*/ ) );
10187         DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
10188                                dis_buf);
10189         goto decode_success;
10190      }
10191      /* else fall through */
10192   }
10193
10194   /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
10195      2 lowest bits of ireg(G) */
10196   if (insn[0] == 0x0F && insn[1] == 0x50) {
10197      modrm = getIByte(delta+2);
10198      if (sz == 2 && epartIsReg(modrm)) {
10199         Int src;
10200         t0 = newTemp(Ity_I32);
10201         t1 = newTemp(Ity_I32);
10202         delta += 2+1;
10203         src = eregOfRM(modrm);
10204         assign( t0, binop( Iop_And32,
10205                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
10206                            mkU32(1) ));
10207         assign( t1, binop( Iop_And32,
10208                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
10209                            mkU32(2) ));
10210         putIReg(4, gregOfRM(modrm),
10211                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
10212                 );
10213         DIP("movmskpd %s,%s\n", nameXMMReg(src),
10214                                 nameIReg(4, gregOfRM(modrm)));
10215         goto decode_success;
10216      }
10217      /* else fall through */
10218   }
10219
10220   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
10221   if (insn[0] == 0x0F && insn[1] == 0xF7) {
10222      modrm = getIByte(delta+2);
10223      if (sz == 2 && epartIsReg(modrm)) {
10224         IRTemp regD    = newTemp(Ity_V128);
10225         IRTemp mask    = newTemp(Ity_V128);
10226         IRTemp olddata = newTemp(Ity_V128);
10227         IRTemp newdata = newTemp(Ity_V128);
10228                addr    = newTemp(Ity_I32);
10229
10230         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
10231         assign( regD, getXMMReg( gregOfRM(modrm) ));
10232
10233         /* Unfortunately can't do the obvious thing with SarN8x16
10234            here since that can't be re-emitted as SSE2 code - no such
10235            insn. */
10236	 assign(
10237            mask,
10238            binop(Iop_64HLtoV128,
10239                  binop(Iop_SarN8x8,
10240                        getXMMRegLane64( eregOfRM(modrm), 1 ),
10241                        mkU8(7) ),
10242                  binop(Iop_SarN8x8,
10243                        getXMMRegLane64( eregOfRM(modrm), 0 ),
10244                        mkU8(7) ) ));
10245         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
10246         assign( newdata,
10247                 binop(Iop_OrV128,
10248                       binop(Iop_AndV128,
10249                             mkexpr(regD),
10250                             mkexpr(mask) ),
10251                       binop(Iop_AndV128,
10252                             mkexpr(olddata),
10253                             unop(Iop_NotV128, mkexpr(mask)))) );
10254         storeLE( mkexpr(addr), mkexpr(newdata) );
10255
10256         delta += 2+1;
10257         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
10258                                   nameXMMReg( gregOfRM(modrm) ) );
10259         goto decode_success;
10260      }
10261      /* else fall through */
10262   }
10263
10264   /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
10265   if (insn[0] == 0x0F && insn[1] == 0xE7) {
10266      modrm = getIByte(delta+2);
10267      if (sz == 2 && !epartIsReg(modrm)) {
10268         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10269         gen_SEGV_if_not_16_aligned( addr );
10270         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10271         DIP("movntdq %s,%s\n", dis_buf,
10272                                nameXMMReg(gregOfRM(modrm)));
10273         delta += 2+alen;
10274         goto decode_success;
10275      }
10276      /* else fall through */
10277   }
10278
10279   /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
10280   if (insn[0] == 0x0F && insn[1] == 0xC3) {
10281      vassert(sz == 4);
10282      modrm = getIByte(delta+2);
10283      if (!epartIsReg(modrm)) {
10284         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10285         storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
10286         DIP("movnti %s,%s\n", dis_buf,
10287                               nameIReg(4, gregOfRM(modrm)));
10288         delta += 2+alen;
10289         goto decode_success;
10290      }
10291      /* else fall through */
10292   }
10293
10294   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
10295      or lo half xmm).  */
10296   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
10297      modrm = getIByte(delta+2);
10298      if (epartIsReg(modrm)) {
10299         /* fall through, awaiting test case */
10300         /* dst: lo half copied, hi half zeroed */
10301      } else {
10302         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10303         storeLE( mkexpr(addr),
10304                  getXMMRegLane64( gregOfRM(modrm), 0 ));
10305         DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
10306         delta += 2+alen;
10307         goto decode_success;
10308      }
10309   }
10310
10311   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
10312      hi half). */
10313   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
10314      vassert(sz == 4);
10315      modrm = getIByte(delta+3);
10316      if (epartIsReg(modrm)) {
10317         do_MMX_preamble();
10318         putXMMReg( gregOfRM(modrm),
10319                    unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
10320         DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10321                                nameXMMReg(gregOfRM(modrm)));
10322         delta += 3+1;
10323         goto decode_success;
10324      } else {
10325         /* fall through, apparently no mem case for this insn */
10326      }
10327   }
10328
10329   /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
10330      G (lo half xmm).  Upper half of G is zeroed out. */
10331   /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
10332      G (lo half xmm).  If E is mem, upper half of G is zeroed out.
10333      If E is reg, upper half of G is unchanged. */
10334   if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
10335       || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
10336      vassert(sz == 4);
10337      modrm = getIByte(delta+3);
10338      if (epartIsReg(modrm)) {
10339         putXMMRegLane64( gregOfRM(modrm), 0,
10340                          getXMMRegLane64( eregOfRM(modrm), 0 ));
10341         if (insn[0] == 0xF3/*MOVQ*/) {
10342            /* zero bits 127:64 */
10343            putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10344         }
10345         DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10346                              nameXMMReg(gregOfRM(modrm)));
10347         delta += 3+1;
10348      } else {
10349         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10350         /* zero bits 127:64 */
10351         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10352         /* write bits 63:0 */
10353         putXMMRegLane64( gregOfRM(modrm), 0,
10354                          loadLE(Ity_I64, mkexpr(addr)) );
10355         DIP("movsd %s,%s\n", dis_buf,
10356                              nameXMMReg(gregOfRM(modrm)));
10357         delta += 3+alen;
10358      }
10359      goto decode_success;
10360   }
10361
10362   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
10363      or lo half xmm). */
10364   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
10365      vassert(sz == 4);
10366      modrm = getIByte(delta+3);
10367      if (epartIsReg(modrm)) {
10368         putXMMRegLane64( eregOfRM(modrm), 0,
10369                          getXMMRegLane64( gregOfRM(modrm), 0 ));
10370         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10371                              nameXMMReg(eregOfRM(modrm)));
10372         delta += 3+1;
10373      } else {
10374         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10375         storeLE( mkexpr(addr),
10376                  getXMMRegLane64(gregOfRM(modrm), 0) );
10377         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10378                              dis_buf);
10379         delta += 3+alen;
10380      }
10381      goto decode_success;
10382   }
10383
10384   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
10385   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
10386      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
10387      goto decode_success;
10388   }
10389
10390   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
10391   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
10392      vassert(sz == 4);
10393      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
10394      goto decode_success;
10395   }
10396
10397   /* 66 0F 56 = ORPD -- G = G and E */
10398   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
10399      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
10400      goto decode_success;
10401   }
10402
10403   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
10404   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
10405      Int    select;
10406      IRTemp sV = newTemp(Ity_V128);
10407      IRTemp dV = newTemp(Ity_V128);
10408      IRTemp s1 = newTemp(Ity_I64);
10409      IRTemp s0 = newTemp(Ity_I64);
10410      IRTemp d1 = newTemp(Ity_I64);
10411      IRTemp d0 = newTemp(Ity_I64);
10412
10413      modrm = insn[2];
10414      assign( dV, getXMMReg(gregOfRM(modrm)) );
10415
10416      if (epartIsReg(modrm)) {
10417         assign( sV, getXMMReg(eregOfRM(modrm)) );
10418         select = (Int)insn[3];
10419         delta += 2+2;
10420         DIP("shufpd $%d,%s,%s\n", select,
10421                                   nameXMMReg(eregOfRM(modrm)),
10422                                   nameXMMReg(gregOfRM(modrm)));
10423      } else {
10424         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10425         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10426         select = (Int)insn[2+alen];
10427         delta += 3+alen;
10428         DIP("shufpd $%d,%s,%s\n", select,
10429                                   dis_buf,
10430                                   nameXMMReg(gregOfRM(modrm)));
10431      }
10432
10433      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10434      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10435      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10436      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10437
10438#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
10439#     define SELS(n) mkexpr((n)==0 ? s0 : s1)
10440
10441      putXMMReg(
10442         gregOfRM(modrm),
10443         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
10444      );
10445
10446#     undef SELD
10447#     undef SELS
10448
10449      goto decode_success;
10450   }
10451
10452   /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
10453   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
10454      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
10455                                        "sqrtpd", Iop_Sqrt64Fx2 );
10456      goto decode_success;
10457   }
10458
10459   /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
10460   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
10461      vassert(sz == 4);
10462      delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
10463                                         "sqrtsd", Iop_Sqrt64F0x2 );
10464      goto decode_success;
10465   }
10466
10467   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
10468   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
10469      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
10470      goto decode_success;
10471   }
10472
10473   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
10474   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
10475      vassert(sz == 4);
10476      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
10477      goto decode_success;
10478   }
10479
10480   /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
10481   /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
10482   /* These just appear to be special cases of SHUFPS */
10483   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
10484      IRTemp s1 = newTemp(Ity_I64);
10485      IRTemp s0 = newTemp(Ity_I64);
10486      IRTemp d1 = newTemp(Ity_I64);
10487      IRTemp d0 = newTemp(Ity_I64);
10488      IRTemp sV = newTemp(Ity_V128);
10489      IRTemp dV = newTemp(Ity_V128);
10490      Bool   hi = toBool(insn[1] == 0x15);
10491
10492      modrm = insn[2];
10493      assign( dV, getXMMReg(gregOfRM(modrm)) );
10494
10495      if (epartIsReg(modrm)) {
10496         assign( sV, getXMMReg(eregOfRM(modrm)) );
10497         delta += 2+1;
10498         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
10499                                  nameXMMReg(eregOfRM(modrm)),
10500                                  nameXMMReg(gregOfRM(modrm)));
10501      } else {
10502         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10503         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10504         delta += 2+alen;
10505         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
10506                                  dis_buf,
10507                                  nameXMMReg(gregOfRM(modrm)));
10508      }
10509
10510      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10511      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10512      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10513      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10514
10515      if (hi) {
10516         putXMMReg( gregOfRM(modrm),
10517                    binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
10518      } else {
10519         putXMMReg( gregOfRM(modrm),
10520                    binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
10521      }
10522
10523      goto decode_success;
10524   }
10525
10526   /* 66 0F 57 = XORPD -- G = G and E */
10527   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
10528      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
10529      goto decode_success;
10530   }
10531
10532   /* 66 0F 6B = PACKSSDW */
10533   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
10534      delta = dis_SSEint_E_to_G( sorb, delta+2,
10535                                 "packssdw",
10536                                 Iop_QNarrowBin32Sto16Sx8, True );
10537      goto decode_success;
10538   }
10539
10540   /* 66 0F 63 = PACKSSWB */
10541   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
10542      delta = dis_SSEint_E_to_G( sorb, delta+2,
10543                                 "packsswb",
10544                                 Iop_QNarrowBin16Sto8Sx16, True );
10545      goto decode_success;
10546   }
10547
10548   /* 66 0F 67 = PACKUSWB */
10549   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
10550      delta = dis_SSEint_E_to_G( sorb, delta+2,
10551                                 "packuswb",
10552                                 Iop_QNarrowBin16Sto8Ux16, True );
10553      goto decode_success;
10554   }
10555
10556   /* 66 0F FC = PADDB */
10557   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
10558      delta = dis_SSEint_E_to_G( sorb, delta+2,
10559                                 "paddb", Iop_Add8x16, False );
10560      goto decode_success;
10561   }
10562
10563   /* 66 0F FE = PADDD */
10564   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
10565      delta = dis_SSEint_E_to_G( sorb, delta+2,
10566                                 "paddd", Iop_Add32x4, False );
10567      goto decode_success;
10568   }
10569
10570   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
10571   /* 0F D4 = PADDQ -- add 64x1 */
10572   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
10573      do_MMX_preamble();
10574      delta = dis_MMXop_regmem_to_reg (
10575                sorb, delta+2, insn[1], "paddq", False );
10576      goto decode_success;
10577   }
10578
10579   /* 66 0F D4 = PADDQ */
10580   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
10581      delta = dis_SSEint_E_to_G( sorb, delta+2,
10582                                 "paddq", Iop_Add64x2, False );
10583      goto decode_success;
10584   }
10585
10586   /* 66 0F FD = PADDW */
10587   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
10588      delta = dis_SSEint_E_to_G( sorb, delta+2,
10589                                 "paddw", Iop_Add16x8, False );
10590      goto decode_success;
10591   }
10592
10593   /* 66 0F EC = PADDSB */
10594   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
10595      delta = dis_SSEint_E_to_G( sorb, delta+2,
10596                                 "paddsb", Iop_QAdd8Sx16, False );
10597      goto decode_success;
10598   }
10599
10600   /* 66 0F ED = PADDSW */
10601   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
10602      delta = dis_SSEint_E_to_G( sorb, delta+2,
10603                                 "paddsw", Iop_QAdd16Sx8, False );
10604      goto decode_success;
10605   }
10606
10607   /* 66 0F DC = PADDUSB */
10608   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
10609      delta = dis_SSEint_E_to_G( sorb, delta+2,
10610                                 "paddusb", Iop_QAdd8Ux16, False );
10611      goto decode_success;
10612   }
10613
10614   /* 66 0F DD = PADDUSW */
10615   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
10616      delta = dis_SSEint_E_to_G( sorb, delta+2,
10617                                 "paddusw", Iop_QAdd16Ux8, False );
10618      goto decode_success;
10619   }
10620
10621   /* 66 0F DB = PAND */
10622   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
10623      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
10624      goto decode_success;
10625   }
10626
10627   /* 66 0F DF = PANDN */
10628   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
10629      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
10630      goto decode_success;
10631   }
10632
10633   /* 66 0F E0 = PAVGB */
10634   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
10635      delta = dis_SSEint_E_to_G( sorb, delta+2,
10636                                 "pavgb", Iop_Avg8Ux16, False );
10637      goto decode_success;
10638   }
10639
10640   /* 66 0F E3 = PAVGW */
10641   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
10642      delta = dis_SSEint_E_to_G( sorb, delta+2,
10643                                 "pavgw", Iop_Avg16Ux8, False );
10644      goto decode_success;
10645   }
10646
10647   /* 66 0F 74 = PCMPEQB */
10648   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
10649      delta = dis_SSEint_E_to_G( sorb, delta+2,
10650                                 "pcmpeqb", Iop_CmpEQ8x16, False );
10651      goto decode_success;
10652   }
10653
10654   /* 66 0F 76 = PCMPEQD */
10655   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
10656      delta = dis_SSEint_E_to_G( sorb, delta+2,
10657                                 "pcmpeqd", Iop_CmpEQ32x4, False );
10658      goto decode_success;
10659   }
10660
10661   /* 66 0F 75 = PCMPEQW */
10662   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
10663      delta = dis_SSEint_E_to_G( sorb, delta+2,
10664                                 "pcmpeqw", Iop_CmpEQ16x8, False );
10665      goto decode_success;
10666   }
10667
10668   /* 66 0F 64 = PCMPGTB */
10669   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
10670      delta = dis_SSEint_E_to_G( sorb, delta+2,
10671                                 "pcmpgtb", Iop_CmpGT8Sx16, False );
10672      goto decode_success;
10673   }
10674
10675   /* 66 0F 66 = PCMPGTD */
10676   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
10677      delta = dis_SSEint_E_to_G( sorb, delta+2,
10678                                 "pcmpgtd", Iop_CmpGT32Sx4, False );
10679      goto decode_success;
10680   }
10681
10682   /* 66 0F 65 = PCMPGTW */
10683   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
10684      delta = dis_SSEint_E_to_G( sorb, delta+2,
10685                                 "pcmpgtw", Iop_CmpGT16Sx8, False );
10686      goto decode_success;
10687   }
10688
10689   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
10690      zero-extend of it in ireg(G). */
10691   if (insn[0] == 0x0F && insn[1] == 0xC5) {
10692      modrm = insn[2];
10693      if (sz == 2 && epartIsReg(modrm)) {
10694         t5 = newTemp(Ity_V128);
10695         t4 = newTemp(Ity_I16);
10696         assign(t5, getXMMReg(eregOfRM(modrm)));
10697         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
10698         switch (insn[3] & 7) {
10699            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
10700            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
10701            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
10702            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
10703            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
10704            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
10705            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
10706            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
10707            default: vassert(0); /*NOTREACHED*/
10708         }
10709         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
10710         DIP("pextrw $%d,%s,%s\n",
10711             (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
10712                           nameIReg(4,gregOfRM(modrm)));
10713         delta += 4;
10714         goto decode_success;
10715      }
10716      /* else fall through */
10717   }
10718
10719   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
10720      put it into the specified lane of xmm(G). */
10721   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
10722      Int lane;
10723      t4 = newTemp(Ity_I16);
10724      modrm = insn[2];
10725
10726      if (epartIsReg(modrm)) {
10727         assign(t4, getIReg(2, eregOfRM(modrm)));
10728         delta += 3+1;
10729         lane = insn[3+1-1];
10730         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
10731                                   nameIReg(2,eregOfRM(modrm)),
10732                                   nameXMMReg(gregOfRM(modrm)));
10733      } else {
10734         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10735         delta += 3+alen;
10736         lane = insn[3+alen-1];
10737         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
10738         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
10739                                   dis_buf,
10740                                   nameXMMReg(gregOfRM(modrm)));
10741      }
10742
10743      putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
10744      goto decode_success;
10745   }
10746
10747   /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
10748      E(xmm or mem) to G(xmm) */
10749   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
10750      IRTemp s1V  = newTemp(Ity_V128);
10751      IRTemp s2V  = newTemp(Ity_V128);
10752      IRTemp dV   = newTemp(Ity_V128);
10753      IRTemp s1Hi = newTemp(Ity_I64);
10754      IRTemp s1Lo = newTemp(Ity_I64);
10755      IRTemp s2Hi = newTemp(Ity_I64);
10756      IRTemp s2Lo = newTemp(Ity_I64);
10757      IRTemp dHi  = newTemp(Ity_I64);
10758      IRTemp dLo  = newTemp(Ity_I64);
10759      modrm = insn[2];
10760      if (epartIsReg(modrm)) {
10761         assign( s1V, getXMMReg(eregOfRM(modrm)) );
10762         delta += 2+1;
10763         DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10764                                nameXMMReg(gregOfRM(modrm)));
10765      } else {
10766         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10767         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
10768         delta += 2+alen;
10769         DIP("pmaddwd %s,%s\n", dis_buf,
10770                                nameXMMReg(gregOfRM(modrm)));
10771      }
10772      assign( s2V, getXMMReg(gregOfRM(modrm)) );
10773      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
10774      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
10775      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
10776      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
10777      assign( dHi, mkIRExprCCall(
10778                      Ity_I64, 0/*regparms*/,
10779                      "x86g_calculate_mmx_pmaddwd",
10780                      &x86g_calculate_mmx_pmaddwd,
10781                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
10782                   ));
10783      assign( dLo, mkIRExprCCall(
10784                      Ity_I64, 0/*regparms*/,
10785                      "x86g_calculate_mmx_pmaddwd",
10786                      &x86g_calculate_mmx_pmaddwd,
10787                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
10788                   ));
10789      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
10790      putXMMReg(gregOfRM(modrm), mkexpr(dV));
10791      goto decode_success;
10792   }
10793
10794   /* 66 0F EE = PMAXSW -- 16x8 signed max */
10795   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
10796      delta = dis_SSEint_E_to_G( sorb, delta+2,
10797                                 "pmaxsw", Iop_Max16Sx8, False );
10798      goto decode_success;
10799   }
10800
10801   /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
10802   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
10803      delta = dis_SSEint_E_to_G( sorb, delta+2,
10804                                 "pmaxub", Iop_Max8Ux16, False );
10805      goto decode_success;
10806   }
10807
10808   /* 66 0F EA = PMINSW -- 16x8 signed min */
10809   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
10810      delta = dis_SSEint_E_to_G( sorb, delta+2,
10811                                 "pminsw", Iop_Min16Sx8, False );
10812      goto decode_success;
10813   }
10814
10815   /* 66 0F DA = PMINUB -- 8x16 unsigned min */
10816   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
10817      delta = dis_SSEint_E_to_G( sorb, delta+2,
10818                                 "pminub", Iop_Min8Ux16, False );
10819      goto decode_success;
10820   }
10821
10822   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
10823      xmm(G), turn them into a byte, and put zero-extend of it in
10824      ireg(G).  Doing this directly is just too cumbersome; give up
10825      therefore and call a helper. */
10826   /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
10827   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
10828      modrm = insn[2];
10829      if (epartIsReg(modrm)) {
10830         t0 = newTemp(Ity_I64);
10831         t1 = newTemp(Ity_I64);
10832         assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
10833         assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
10834         t5 = newTemp(Ity_I32);
10835         assign(t5, mkIRExprCCall(
10836                       Ity_I32, 0/*regparms*/,
10837                       "x86g_calculate_sse_pmovmskb",
10838                       &x86g_calculate_sse_pmovmskb,
10839                       mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
10840         putIReg(4, gregOfRM(modrm), mkexpr(t5));
10841         DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10842                                 nameIReg(4,gregOfRM(modrm)));
10843         delta += 3;
10844         goto decode_success;
10845      }
10846      /* else fall through */
10847   }
10848
10849   /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
10850   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
10851      delta = dis_SSEint_E_to_G( sorb, delta+2,
10852                                 "pmulhuw", Iop_MulHi16Ux8, False );
10853      goto decode_success;
10854   }
10855
10856   /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
10857   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
10858      delta = dis_SSEint_E_to_G( sorb, delta+2,
10859                                 "pmulhw", Iop_MulHi16Sx8, False );
10860      goto decode_success;
10861   }
10862
10863   /* 66 0F D5 = PMULHL -- 16x8 multiply */
10864   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
10865      delta = dis_SSEint_E_to_G( sorb, delta+2,
10866                                 "pmullw", Iop_Mul16x8, False );
10867      goto decode_success;
10868   }
10869
10870   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
10871   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
10872      0 to form 64-bit result */
10873   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
10874      IRTemp sV = newTemp(Ity_I64);
10875      IRTemp dV = newTemp(Ity_I64);
10876      t1 = newTemp(Ity_I32);
10877      t0 = newTemp(Ity_I32);
10878      modrm = insn[2];
10879
10880      do_MMX_preamble();
10881      assign( dV, getMMXReg(gregOfRM(modrm)) );
10882
10883      if (epartIsReg(modrm)) {
10884         assign( sV, getMMXReg(eregOfRM(modrm)) );
10885         delta += 2+1;
10886         DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10887                                nameMMXReg(gregOfRM(modrm)));
10888      } else {
10889         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10890         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
10891         delta += 2+alen;
10892         DIP("pmuludq %s,%s\n", dis_buf,
10893                                nameMMXReg(gregOfRM(modrm)));
10894      }
10895
10896      assign( t0, unop(Iop_64to32, mkexpr(dV)) );
10897      assign( t1, unop(Iop_64to32, mkexpr(sV)) );
10898      putMMXReg( gregOfRM(modrm),
10899                 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
10900      goto decode_success;
10901   }
10902
10903   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
10904      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
10905      half */
10906   /* This is a really poor translation -- could be improved if
10907      performance critical */
10908   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
10909      IRTemp sV, dV;
10910      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10911      sV = newTemp(Ity_V128);
10912      dV = newTemp(Ity_V128);
10913      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10914      t1 = newTemp(Ity_I64);
10915      t0 = newTemp(Ity_I64);
10916      modrm = insn[2];
10917      assign( dV, getXMMReg(gregOfRM(modrm)) );
10918
10919      if (epartIsReg(modrm)) {
10920         assign( sV, getXMMReg(eregOfRM(modrm)) );
10921         delta += 2+1;
10922         DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10923                                nameXMMReg(gregOfRM(modrm)));
10924      } else {
10925         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10926         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10927         delta += 2+alen;
10928         DIP("pmuludq %s,%s\n", dis_buf,
10929                                nameXMMReg(gregOfRM(modrm)));
10930      }
10931
10932      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
10933      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
10934
10935      assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
10936      putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
10937      assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
10938      putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
10939      goto decode_success;
10940   }
10941
10942   /* 66 0F EB = POR */
10943   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
10944      delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
10945      goto decode_success;
10946   }
10947
10948   /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
10949      from E(xmm or mem) to G(xmm) */
10950   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
10951      IRTemp s1V  = newTemp(Ity_V128);
10952      IRTemp s2V  = newTemp(Ity_V128);
10953      IRTemp dV   = newTemp(Ity_V128);
10954      IRTemp s1Hi = newTemp(Ity_I64);
10955      IRTemp s1Lo = newTemp(Ity_I64);
10956      IRTemp s2Hi = newTemp(Ity_I64);
10957      IRTemp s2Lo = newTemp(Ity_I64);
10958      IRTemp dHi  = newTemp(Ity_I64);
10959      IRTemp dLo  = newTemp(Ity_I64);
10960      modrm = insn[2];
10961      if (epartIsReg(modrm)) {
10962         assign( s1V, getXMMReg(eregOfRM(modrm)) );
10963         delta += 2+1;
10964         DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10965                               nameXMMReg(gregOfRM(modrm)));
10966      } else {
10967         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10968         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
10969         delta += 2+alen;
10970         DIP("psadbw %s,%s\n", dis_buf,
10971                               nameXMMReg(gregOfRM(modrm)));
10972      }
10973      assign( s2V, getXMMReg(gregOfRM(modrm)) );
10974      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
10975      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
10976      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
10977      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
10978      assign( dHi, mkIRExprCCall(
10979                      Ity_I64, 0/*regparms*/,
10980                      "x86g_calculate_mmx_psadbw",
10981                      &x86g_calculate_mmx_psadbw,
10982                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
10983                   ));
10984      assign( dLo, mkIRExprCCall(
10985                      Ity_I64, 0/*regparms*/,
10986                      "x86g_calculate_mmx_psadbw",
10987                      &x86g_calculate_mmx_psadbw,
10988                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
10989                   ));
10990      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
10991      putXMMReg(gregOfRM(modrm), mkexpr(dV));
10992      goto decode_success;
10993   }
10994
10995   /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
10996   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
10997      Int order;
10998      IRTemp sV, dV, s3, s2, s1, s0;
10999      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11000      sV = newTemp(Ity_V128);
11001      dV = newTemp(Ity_V128);
11002      modrm = insn[2];
11003      if (epartIsReg(modrm)) {
11004         assign( sV, getXMMReg(eregOfRM(modrm)) );
11005         order = (Int)insn[3];
11006         delta += 2+2;
11007         DIP("pshufd $%d,%s,%s\n", order,
11008                                   nameXMMReg(eregOfRM(modrm)),
11009                                   nameXMMReg(gregOfRM(modrm)));
11010      } else {
11011         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11012         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11013	 order = (Int)insn[2+alen];
11014         delta += 3+alen;
11015         DIP("pshufd $%d,%s,%s\n", order,
11016                                   dis_buf,
11017                                   nameXMMReg(gregOfRM(modrm)));
11018      }
11019      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11020
11021#     define SEL(n) \
11022                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11023      assign(dV,
11024	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
11025                           SEL((order>>2)&3), SEL((order>>0)&3) )
11026      );
11027      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11028#     undef SEL
11029      goto decode_success;
11030   }
11031
11032   /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
11033      mem) to G(xmm), and copy lower half */
11034   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
11035      Int order;
11036      IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
11037      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11038      sV   = newTemp(Ity_V128);
11039      dV   = newTemp(Ity_V128);
11040      sVhi = newTemp(Ity_I64);
11041      dVhi = newTemp(Ity_I64);
11042      modrm = insn[3];
11043      if (epartIsReg(modrm)) {
11044         assign( sV, getXMMReg(eregOfRM(modrm)) );
11045         order = (Int)insn[4];
11046         delta += 4+1;
11047         DIP("pshufhw $%d,%s,%s\n", order,
11048                                    nameXMMReg(eregOfRM(modrm)),
11049                                    nameXMMReg(gregOfRM(modrm)));
11050      } else {
11051         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11052         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11053	 order = (Int)insn[3+alen];
11054         delta += 4+alen;
11055         DIP("pshufhw $%d,%s,%s\n", order,
11056                                    dis_buf,
11057                                    nameXMMReg(gregOfRM(modrm)));
11058      }
11059      assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
11060      breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
11061
11062#     define SEL(n) \
11063                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11064      assign(dVhi,
11065	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11066                          SEL((order>>2)&3), SEL((order>>0)&3) )
11067      );
11068      assign(dV, binop( Iop_64HLtoV128,
11069                        mkexpr(dVhi),
11070                        unop(Iop_V128to64, mkexpr(sV))) );
11071      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11072#     undef SEL
11073      goto decode_success;
11074   }
11075
11076   /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
11077      mem) to G(xmm), and copy upper half */
11078   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
11079      Int order;
11080      IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
11081      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11082      sV   = newTemp(Ity_V128);
11083      dV   = newTemp(Ity_V128);
11084      sVlo = newTemp(Ity_I64);
11085      dVlo = newTemp(Ity_I64);
11086      modrm = insn[3];
11087      if (epartIsReg(modrm)) {
11088         assign( sV, getXMMReg(eregOfRM(modrm)) );
11089         order = (Int)insn[4];
11090         delta += 4+1;
11091         DIP("pshuflw $%d,%s,%s\n", order,
11092                                    nameXMMReg(eregOfRM(modrm)),
11093                                    nameXMMReg(gregOfRM(modrm)));
11094      } else {
11095         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11096         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11097	 order = (Int)insn[3+alen];
11098         delta += 4+alen;
11099         DIP("pshuflw $%d,%s,%s\n", order,
11100                                    dis_buf,
11101                                    nameXMMReg(gregOfRM(modrm)));
11102      }
11103      assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
11104      breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
11105
11106#     define SEL(n) \
11107                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11108      assign(dVlo,
11109	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11110                          SEL((order>>2)&3), SEL((order>>0)&3) )
11111      );
11112      assign(dV, binop( Iop_64HLtoV128,
11113                        unop(Iop_V128HIto64, mkexpr(sV)),
11114                        mkexpr(dVlo) ) );
11115      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11116#     undef SEL
11117      goto decode_success;
11118   }
11119
11120   /* 66 0F 72 /6 ib = PSLLD by immediate */
11121   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11122       && epartIsReg(insn[2])
11123       && gregOfRM(insn[2]) == 6) {
11124      delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
11125      goto decode_success;
11126   }
11127
11128   /* 66 0F F2 = PSLLD by E */
11129   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
11130      delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
11131      goto decode_success;
11132   }
11133
11134   /* 66 0F 73 /7 ib = PSLLDQ by immediate */
11135   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11136       && epartIsReg(insn[2])
11137       && gregOfRM(insn[2]) == 7) {
11138      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11139      Int    imm = (Int)insn[3];
11140      Int    reg = eregOfRM(insn[2]);
11141      DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
11142      vassert(imm >= 0 && imm <= 255);
11143      delta += 4;
11144
11145      sV    = newTemp(Ity_V128);
11146      dV    = newTemp(Ity_V128);
11147      hi64  = newTemp(Ity_I64);
11148      lo64  = newTemp(Ity_I64);
11149      hi64r = newTemp(Ity_I64);
11150      lo64r = newTemp(Ity_I64);
11151
11152      if (imm >= 16) {
11153         putXMMReg(reg, mkV128(0x0000));
11154         goto decode_success;
11155      }
11156
11157      assign( sV, getXMMReg(reg) );
11158      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11159      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
11160
11161      if (imm == 0) {
11162         assign( lo64r, mkexpr(lo64) );
11163         assign( hi64r, mkexpr(hi64) );
11164      }
11165      else
11166      if (imm == 8) {
11167         assign( lo64r, mkU64(0) );
11168         assign( hi64r, mkexpr(lo64) );
11169      }
11170      else
11171      if (imm > 8) {
11172         assign( lo64r, mkU64(0) );
11173         assign( hi64r, binop( Iop_Shl64,
11174                               mkexpr(lo64),
11175                               mkU8( 8*(imm-8) ) ));
11176      } else {
11177         assign( lo64r, binop( Iop_Shl64,
11178                               mkexpr(lo64),
11179                               mkU8(8 * imm) ));
11180         assign( hi64r,
11181                 binop( Iop_Or64,
11182                        binop(Iop_Shl64, mkexpr(hi64),
11183                                         mkU8(8 * imm)),
11184                        binop(Iop_Shr64, mkexpr(lo64),
11185                                         mkU8(8 * (8 - imm)) )
11186                      )
11187               );
11188      }
11189      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11190      putXMMReg(reg, mkexpr(dV));
11191      goto decode_success;
11192   }
11193
11194   /* 66 0F 73 /6 ib = PSLLQ by immediate */
11195   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11196       && epartIsReg(insn[2])
11197       && gregOfRM(insn[2]) == 6) {
11198      delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
11199      goto decode_success;
11200   }
11201
11202   /* 66 0F F3 = PSLLQ by E */
11203   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
11204      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
11205      goto decode_success;
11206   }
11207
11208   /* 66 0F 71 /6 ib = PSLLW by immediate */
11209   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11210       && epartIsReg(insn[2])
11211       && gregOfRM(insn[2]) == 6) {
11212      delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
11213      goto decode_success;
11214   }
11215
11216   /* 66 0F F1 = PSLLW by E */
11217   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
11218      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
11219      goto decode_success;
11220   }
11221
11222   /* 66 0F 72 /4 ib = PSRAD by immediate */
11223   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11224       && epartIsReg(insn[2])
11225       && gregOfRM(insn[2]) == 4) {
11226      delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
11227      goto decode_success;
11228   }
11229
11230   /* 66 0F E2 = PSRAD by E */
11231   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
11232      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
11233      goto decode_success;
11234   }
11235
11236   /* 66 0F 71 /4 ib = PSRAW by immediate */
11237   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11238       && epartIsReg(insn[2])
11239       && gregOfRM(insn[2]) == 4) {
11240      delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
11241      goto decode_success;
11242   }
11243
11244   /* 66 0F E1 = PSRAW by E */
11245   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
11246      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
11247      goto decode_success;
11248   }
11249
11250   /* 66 0F 72 /2 ib = PSRLD by immediate */
11251   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11252       && epartIsReg(insn[2])
11253       && gregOfRM(insn[2]) == 2) {
11254      delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
11255      goto decode_success;
11256   }
11257
11258   /* 66 0F D2 = PSRLD by E */
11259   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
11260      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
11261      goto decode_success;
11262   }
11263
11264   /* 66 0F 73 /3 ib = PSRLDQ by immediate */
11265   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11266       && epartIsReg(insn[2])
11267       && gregOfRM(insn[2]) == 3) {
11268      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11269      Int    imm = (Int)insn[3];
11270      Int    reg = eregOfRM(insn[2]);
11271      DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
11272      vassert(imm >= 0 && imm <= 255);
11273      delta += 4;
11274
11275      sV    = newTemp(Ity_V128);
11276      dV    = newTemp(Ity_V128);
11277      hi64  = newTemp(Ity_I64);
11278      lo64  = newTemp(Ity_I64);
11279      hi64r = newTemp(Ity_I64);
11280      lo64r = newTemp(Ity_I64);
11281
11282      if (imm >= 16) {
11283         putXMMReg(reg, mkV128(0x0000));
11284         goto decode_success;
11285      }
11286
11287      assign( sV, getXMMReg(reg) );
11288      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11289      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
11290
11291      if (imm == 0) {
11292         assign( lo64r, mkexpr(lo64) );
11293         assign( hi64r, mkexpr(hi64) );
11294      }
11295      else
11296      if (imm == 8) {
11297         assign( hi64r, mkU64(0) );
11298         assign( lo64r, mkexpr(hi64) );
11299      }
11300      else
11301      if (imm > 8) {
11302         assign( hi64r, mkU64(0) );
11303         assign( lo64r, binop( Iop_Shr64,
11304                               mkexpr(hi64),
11305                               mkU8( 8*(imm-8) ) ));
11306      } else {
11307         assign( hi64r, binop( Iop_Shr64,
11308                               mkexpr(hi64),
11309                               mkU8(8 * imm) ));
11310         assign( lo64r,
11311                 binop( Iop_Or64,
11312                        binop(Iop_Shr64, mkexpr(lo64),
11313                                         mkU8(8 * imm)),
11314                        binop(Iop_Shl64, mkexpr(hi64),
11315                                         mkU8(8 * (8 - imm)) )
11316                      )
11317               );
11318      }
11319
11320      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11321      putXMMReg(reg, mkexpr(dV));
11322      goto decode_success;
11323   }
11324
11325   /* 66 0F 73 /2 ib = PSRLQ by immediate */
11326   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11327       && epartIsReg(insn[2])
11328       && gregOfRM(insn[2]) == 2) {
11329      delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
11330      goto decode_success;
11331   }
11332
11333   /* 66 0F D3 = PSRLQ by E */
11334   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
11335      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
11336      goto decode_success;
11337   }
11338
11339   /* 66 0F 71 /2 ib = PSRLW by immediate */
11340   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11341       && epartIsReg(insn[2])
11342       && gregOfRM(insn[2]) == 2) {
11343      delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
11344      goto decode_success;
11345   }
11346
11347   /* 66 0F D1 = PSRLW by E */
11348   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
11349      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
11350      goto decode_success;
11351   }
11352
11353   /* 66 0F F8 = PSUBB */
11354   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
11355      delta = dis_SSEint_E_to_G( sorb, delta+2,
11356                                 "psubb", Iop_Sub8x16, False );
11357      goto decode_success;
11358   }
11359
11360   /* 66 0F FA = PSUBD */
11361   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
11362      delta = dis_SSEint_E_to_G( sorb, delta+2,
11363                                 "psubd", Iop_Sub32x4, False );
11364      goto decode_success;
11365   }
11366
11367   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
11368   /* 0F FB = PSUBQ -- sub 64x1 */
11369   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
11370      do_MMX_preamble();
11371      delta = dis_MMXop_regmem_to_reg (
11372                sorb, delta+2, insn[1], "psubq", False );
11373      goto decode_success;
11374   }
11375
11376   /* 66 0F FB = PSUBQ */
11377   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
11378      delta = dis_SSEint_E_to_G( sorb, delta+2,
11379                                 "psubq", Iop_Sub64x2, False );
11380      goto decode_success;
11381   }
11382
11383   /* 66 0F F9 = PSUBW */
11384   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
11385      delta = dis_SSEint_E_to_G( sorb, delta+2,
11386                                 "psubw", Iop_Sub16x8, False );
11387      goto decode_success;
11388   }
11389
11390   /* 66 0F E8 = PSUBSB */
11391   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
11392      delta = dis_SSEint_E_to_G( sorb, delta+2,
11393                                 "psubsb", Iop_QSub8Sx16, False );
11394      goto decode_success;
11395   }
11396
11397   /* 66 0F E9 = PSUBSW */
11398   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
11399      delta = dis_SSEint_E_to_G( sorb, delta+2,
11400                                 "psubsw", Iop_QSub16Sx8, False );
11401      goto decode_success;
11402   }
11403
11404   /* 66 0F D8 = PSUBSB */
11405   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
11406      delta = dis_SSEint_E_to_G( sorb, delta+2,
11407                                 "psubusb", Iop_QSub8Ux16, False );
11408      goto decode_success;
11409   }
11410
11411   /* 66 0F D9 = PSUBSW */
11412   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
11413      delta = dis_SSEint_E_to_G( sorb, delta+2,
11414                                 "psubusw", Iop_QSub16Ux8, False );
11415      goto decode_success;
11416   }
11417
11418   /* 66 0F 68 = PUNPCKHBW */
11419   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
11420      delta = dis_SSEint_E_to_G( sorb, delta+2,
11421                                 "punpckhbw",
11422                                 Iop_InterleaveHI8x16, True );
11423      goto decode_success;
11424   }
11425
11426   /* 66 0F 6A = PUNPCKHDQ */
11427   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
11428      delta = dis_SSEint_E_to_G( sorb, delta+2,
11429                                 "punpckhdq",
11430                                 Iop_InterleaveHI32x4, True );
11431      goto decode_success;
11432   }
11433
11434   /* 66 0F 6D = PUNPCKHQDQ */
11435   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
11436      delta = dis_SSEint_E_to_G( sorb, delta+2,
11437                                 "punpckhqdq",
11438                                 Iop_InterleaveHI64x2, True );
11439      goto decode_success;
11440   }
11441
11442   /* 66 0F 69 = PUNPCKHWD */
11443   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
11444      delta = dis_SSEint_E_to_G( sorb, delta+2,
11445                                 "punpckhwd",
11446                                 Iop_InterleaveHI16x8, True );
11447      goto decode_success;
11448   }
11449
11450   /* 66 0F 60 = PUNPCKLBW */
11451   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
11452      delta = dis_SSEint_E_to_G( sorb, delta+2,
11453                                 "punpcklbw",
11454                                 Iop_InterleaveLO8x16, True );
11455      goto decode_success;
11456   }
11457
11458   /* 66 0F 62 = PUNPCKLDQ */
11459   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
11460      delta = dis_SSEint_E_to_G( sorb, delta+2,
11461                                 "punpckldq",
11462                                 Iop_InterleaveLO32x4, True );
11463      goto decode_success;
11464   }
11465
11466   /* 66 0F 6C = PUNPCKLQDQ */
11467   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
11468      delta = dis_SSEint_E_to_G( sorb, delta+2,
11469                                 "punpcklqdq",
11470                                 Iop_InterleaveLO64x2, True );
11471      goto decode_success;
11472   }
11473
11474   /* 66 0F 61 = PUNPCKLWD */
11475   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
11476      delta = dis_SSEint_E_to_G( sorb, delta+2,
11477                                 "punpcklwd",
11478                                 Iop_InterleaveLO16x8, True );
11479      goto decode_success;
11480   }
11481
11482   /* 66 0F EF = PXOR */
11483   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
11484      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
11485      goto decode_success;
11486   }
11487
11488//--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
11489//--    if (insn[0] == 0x0F && insn[1] == 0xAE
11490//--        && (!epartIsReg(insn[2]))
11491//--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
11492//--       Bool store = gregOfRM(insn[2]) == 0;
11493//--       vg_assert(sz == 4);
11494//--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
11495//--       t1   = LOW24(pair);
11496//--       eip += 2+HI8(pair);
11497//--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
11498//--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
11499//--                   Lit16, (UShort)insn[2],
11500//--                   TempReg, t1 );
11501//--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
11502//--       goto decode_success;
11503//--    }
11504
11505   /* 0F AE /7 = CLFLUSH -- flush cache line */
11506   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
11507       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
11508
11509      /* This is something of a hack.  We need to know the size of the
11510         cache line containing addr.  Since we don't (easily), assume
11511         256 on the basis that no real cache would have a line that
11512         big.  It's safe to invalidate more stuff than we need, just
11513         inefficient. */
11514      UInt lineszB = 256;
11515
11516      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11517      delta += 2+alen;
11518
11519      /* Round addr down to the start of the containing block. */
11520      stmt( IRStmt_Put(
11521               OFFB_TISTART,
11522               binop( Iop_And32,
11523                      mkexpr(addr),
11524                      mkU32( ~(lineszB-1) ))) );
11525
11526      stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
11527
11528      irsb->jumpkind = Ijk_TInval;
11529      irsb->next     = mkU32(guest_EIP_bbstart+delta);
11530      dres.whatNext  = Dis_StopHere;
11531
11532      DIP("clflush %s\n", dis_buf);
11533      goto decode_success;
11534   }
11535
11536   /* ---------------------------------------------------- */
11537   /* --- end of the SSE2 decoder.                     --- */
11538   /* ---------------------------------------------------- */
11539
11540   /* ---------------------------------------------------- */
11541   /* --- start of the SSE3 decoder.                   --- */
11542   /* ---------------------------------------------------- */
11543
11544   /* Skip parts of the decoder which don't apply given the stated
11545      guest subarchitecture. */
11546   /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
11547   /* In fact this is highly bogus; we accept SSE3 insns even on a
11548      SSE2-only guest since they turn into IR which can be re-emitted
11549      successfully on an SSE2 host. */
11550   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
11551      goto after_sse_decoders; /* no SSE3 capabilities */
11552
11553   insn = (UChar*)&guest_code[delta];
11554
11555   /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
11556      duplicating some lanes (2:2:0:0). */
11557   /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
11558      duplicating some lanes (3:3:1:1). */
11559   if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
11560       && (insn[2] == 0x12 || insn[2] == 0x16)) {
11561      IRTemp s3, s2, s1, s0;
11562      IRTemp sV  = newTemp(Ity_V128);
11563      Bool   isH = insn[2] == 0x16;
11564      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11565
11566      modrm = insn[3];
11567      if (epartIsReg(modrm)) {
11568         assign( sV, getXMMReg( eregOfRM(modrm)) );
11569         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
11570                                  nameXMMReg(eregOfRM(modrm)),
11571                                  nameXMMReg(gregOfRM(modrm)));
11572         delta += 3+1;
11573      } else {
11574         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11575         gen_SEGV_if_not_16_aligned( addr );
11576         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11577         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
11578	     dis_buf,
11579             nameXMMReg(gregOfRM(modrm)));
11580         delta += 3+alen;
11581      }
11582
11583      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11584      putXMMReg( gregOfRM(modrm),
11585                 isH ? mk128from32s( s3, s3, s1, s1 )
11586                     : mk128from32s( s2, s2, s0, s0 ) );
11587      goto decode_success;
11588   }
11589
11590   /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
11591      duplicating some lanes (0:1:0:1). */
11592   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
11593      IRTemp sV = newTemp(Ity_V128);
11594      IRTemp d0 = newTemp(Ity_I64);
11595
11596      modrm = insn[3];
11597      if (epartIsReg(modrm)) {
11598         assign( sV, getXMMReg( eregOfRM(modrm)) );
11599         DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11600                                nameXMMReg(gregOfRM(modrm)));
11601         delta += 3+1;
11602         assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
11603      } else {
11604         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11605         assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
11606         DIP("movddup %s,%s\n", dis_buf,
11607                                nameXMMReg(gregOfRM(modrm)));
11608         delta += 3+alen;
11609      }
11610
11611      putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
11612      goto decode_success;
11613   }
11614
11615   /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
11616   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
11617      IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11618      IRTemp eV   = newTemp(Ity_V128);
11619      IRTemp gV   = newTemp(Ity_V128);
11620      IRTemp addV = newTemp(Ity_V128);
11621      IRTemp subV = newTemp(Ity_V128);
11622      a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11623
11624      modrm = insn[3];
11625      if (epartIsReg(modrm)) {
11626         assign( eV, getXMMReg( eregOfRM(modrm)) );
11627         DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11628                                 nameXMMReg(gregOfRM(modrm)));
11629         delta += 3+1;
11630      } else {
11631         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11632         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11633         DIP("addsubps %s,%s\n", dis_buf,
11634                                 nameXMMReg(gregOfRM(modrm)));
11635         delta += 3+alen;
11636      }
11637
11638      assign( gV, getXMMReg(gregOfRM(modrm)) );
11639
11640      assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
11641      assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
11642
11643      breakup128to32s( addV, &a3, &a2, &a1, &a0 );
11644      breakup128to32s( subV, &s3, &s2, &s1, &s0 );
11645
11646      putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
11647      goto decode_success;
11648   }
11649
11650   /* 66 0F D0 = ADDSUBPD -- 64x4 +/- from E (mem or xmm) to G (xmm). */
11651   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
11652      IRTemp eV   = newTemp(Ity_V128);
11653      IRTemp gV   = newTemp(Ity_V128);
11654      IRTemp addV = newTemp(Ity_V128);
11655      IRTemp subV = newTemp(Ity_V128);
11656      IRTemp a1     = newTemp(Ity_I64);
11657      IRTemp s0     = newTemp(Ity_I64);
11658
11659      modrm = insn[2];
11660      if (epartIsReg(modrm)) {
11661         assign( eV, getXMMReg( eregOfRM(modrm)) );
11662         DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11663                                 nameXMMReg(gregOfRM(modrm)));
11664         delta += 2+1;
11665      } else {
11666         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11667         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11668         DIP("addsubpd %s,%s\n", dis_buf,
11669                                 nameXMMReg(gregOfRM(modrm)));
11670         delta += 2+alen;
11671      }
11672
11673      assign( gV, getXMMReg(gregOfRM(modrm)) );
11674
11675      assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
11676      assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
11677
11678      assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
11679      assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
11680
11681      putXMMReg( gregOfRM(modrm),
11682                 binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
11683      goto decode_success;
11684   }
11685
11686   /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
11687   /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
11688   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
11689       && (insn[2] == 0x7C || insn[2] == 0x7D)) {
11690      IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
11691      IRTemp eV     = newTemp(Ity_V128);
11692      IRTemp gV     = newTemp(Ity_V128);
11693      IRTemp leftV  = newTemp(Ity_V128);
11694      IRTemp rightV = newTemp(Ity_V128);
11695      Bool   isAdd  = insn[2] == 0x7C;
11696      HChar* str    = isAdd ? "add" : "sub";
11697      e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
11698
11699      modrm = insn[3];
11700      if (epartIsReg(modrm)) {
11701         assign( eV, getXMMReg( eregOfRM(modrm)) );
11702         DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11703                                   nameXMMReg(gregOfRM(modrm)));
11704         delta += 3+1;
11705      } else {
11706         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11707         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11708         DIP("h%sps %s,%s\n", str, dis_buf,
11709                                   nameXMMReg(gregOfRM(modrm)));
11710         delta += 3+alen;
11711      }
11712
11713      assign( gV, getXMMReg(gregOfRM(modrm)) );
11714
11715      breakup128to32s( eV, &e3, &e2, &e1, &e0 );
11716      breakup128to32s( gV, &g3, &g2, &g1, &g0 );
11717
11718      assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
11719      assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
11720
11721      putXMMReg( gregOfRM(modrm),
11722                 binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
11723                       mkexpr(leftV), mkexpr(rightV) ) );
11724      goto decode_success;
11725   }
11726
11727   /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
11728   /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
11729   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
11730      IRTemp e1     = newTemp(Ity_I64);
11731      IRTemp e0     = newTemp(Ity_I64);
11732      IRTemp g1     = newTemp(Ity_I64);
11733      IRTemp g0     = newTemp(Ity_I64);
11734      IRTemp eV     = newTemp(Ity_V128);
11735      IRTemp gV     = newTemp(Ity_V128);
11736      IRTemp leftV  = newTemp(Ity_V128);
11737      IRTemp rightV = newTemp(Ity_V128);
11738      Bool   isAdd  = insn[1] == 0x7C;
11739      HChar* str    = isAdd ? "add" : "sub";
11740
11741      modrm = insn[2];
11742      if (epartIsReg(modrm)) {
11743         assign( eV, getXMMReg( eregOfRM(modrm)) );
11744         DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11745                                   nameXMMReg(gregOfRM(modrm)));
11746         delta += 2+1;
11747      } else {
11748         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11749         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11750         DIP("h%spd %s,%s\n", str, dis_buf,
11751                              nameXMMReg(gregOfRM(modrm)));
11752         delta += 2+alen;
11753      }
11754
11755      assign( gV, getXMMReg(gregOfRM(modrm)) );
11756
11757      assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
11758      assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
11759      assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
11760      assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
11761
11762      assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
11763      assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
11764
11765      putXMMReg( gregOfRM(modrm),
11766                 binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
11767                       mkexpr(leftV), mkexpr(rightV) ) );
11768      goto decode_success;
11769   }
11770
11771   /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
11772   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
11773      modrm = getIByte(delta+3);
11774      if (epartIsReg(modrm)) {
11775         goto decode_failure;
11776      } else {
11777         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11778         putXMMReg( gregOfRM(modrm),
11779                    loadLE(Ity_V128, mkexpr(addr)) );
11780         DIP("lddqu %s,%s\n", dis_buf,
11781                              nameXMMReg(gregOfRM(modrm)));
11782         delta += 3+alen;
11783      }
11784      goto decode_success;
11785   }
11786
11787   /* ---------------------------------------------------- */
11788   /* --- end of the SSE3 decoder.                     --- */
11789   /* ---------------------------------------------------- */
11790
11791   /* ---------------------------------------------------- */
11792   /* --- start of the SSSE3 decoder.                  --- */
11793   /* ---------------------------------------------------- */
11794
11795   /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
11796      Unsigned Bytes (MMX) */
11797   if (sz == 4
11798       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
11799      IRTemp sV        = newTemp(Ity_I64);
11800      IRTemp dV        = newTemp(Ity_I64);
11801      IRTemp sVoddsSX  = newTemp(Ity_I64);
11802      IRTemp sVevensSX = newTemp(Ity_I64);
11803      IRTemp dVoddsZX  = newTemp(Ity_I64);
11804      IRTemp dVevensZX = newTemp(Ity_I64);
11805
11806      modrm = insn[3];
11807      do_MMX_preamble();
11808      assign( dV, getMMXReg(gregOfRM(modrm)) );
11809
11810      if (epartIsReg(modrm)) {
11811         assign( sV, getMMXReg(eregOfRM(modrm)) );
11812         delta += 3+1;
11813         DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
11814                                  nameMMXReg(gregOfRM(modrm)));
11815      } else {
11816         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11817         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
11818         delta += 3+alen;
11819         DIP("pmaddubsw %s,%s\n", dis_buf,
11820                                  nameMMXReg(gregOfRM(modrm)));
11821      }
11822
11823      /* compute dV unsigned x sV signed */
11824      assign( sVoddsSX,
11825              binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
11826      assign( sVevensSX,
11827              binop(Iop_SarN16x4,
11828                    binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
11829                    mkU8(8)) );
11830      assign( dVoddsZX,
11831              binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
11832      assign( dVevensZX,
11833              binop(Iop_ShrN16x4,
11834                    binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
11835                    mkU8(8)) );
11836
11837      putMMXReg(
11838         gregOfRM(modrm),
11839         binop(Iop_QAdd16Sx4,
11840               binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
11841               binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
11842         )
11843      );
11844      goto decode_success;
11845   }
11846
11847   /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
11848      Unsigned Bytes (XMM) */
11849   if (sz == 2
11850       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
11851      IRTemp sV        = newTemp(Ity_V128);
11852      IRTemp dV        = newTemp(Ity_V128);
11853      IRTemp sVoddsSX  = newTemp(Ity_V128);
11854      IRTemp sVevensSX = newTemp(Ity_V128);
11855      IRTemp dVoddsZX  = newTemp(Ity_V128);
11856      IRTemp dVevensZX = newTemp(Ity_V128);
11857
11858      modrm = insn[3];
11859      assign( dV, getXMMReg(gregOfRM(modrm)) );
11860
11861      if (epartIsReg(modrm)) {
11862         assign( sV, getXMMReg(eregOfRM(modrm)) );
11863         delta += 3+1;
11864         DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11865                                  nameXMMReg(gregOfRM(modrm)));
11866      } else {
11867         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11868         gen_SEGV_if_not_16_aligned( addr );
11869         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11870         delta += 3+alen;
11871         DIP("pmaddubsw %s,%s\n", dis_buf,
11872                                  nameXMMReg(gregOfRM(modrm)));
11873      }
11874
11875      /* compute dV unsigned x sV signed */
11876      assign( sVoddsSX,
11877              binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
11878      assign( sVevensSX,
11879              binop(Iop_SarN16x8,
11880                    binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
11881                    mkU8(8)) );
11882      assign( dVoddsZX,
11883              binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
11884      assign( dVevensZX,
11885              binop(Iop_ShrN16x8,
11886                    binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
11887                    mkU8(8)) );
11888
11889      putXMMReg(
11890         gregOfRM(modrm),
11891         binop(Iop_QAdd16Sx8,
11892               binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
11893               binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
11894         )
11895      );
11896      goto decode_success;
11897   }
11898
11899   /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
11900   /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
11901      mmx) and G to G (mmx). */
11902   /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
11903      mmx) and G to G (mmx). */
11904   /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
11905      to G (mmx). */
11906   /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
11907      to G (mmx). */
11908   /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
11909      to G (mmx). */
11910   /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
11911      to G (mmx). */
11912
11913   if (sz == 4
11914       && insn[0] == 0x0F && insn[1] == 0x38
11915       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
11916           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
11917      HChar* str    = "???";
11918      IROp   opV64  = Iop_INVALID;
11919      IROp   opCatO = Iop_CatOddLanes16x4;
11920      IROp   opCatE = Iop_CatEvenLanes16x4;
11921      IRTemp sV     = newTemp(Ity_I64);
11922      IRTemp dV     = newTemp(Ity_I64);
11923
11924      modrm = insn[3];
11925
11926      switch (insn[2]) {
11927         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
11928         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
11929         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
11930         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
11931         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
11932         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
11933         default: vassert(0);
11934      }
11935      if (insn[2] == 0x02 || insn[2] == 0x06) {
11936         opCatO = Iop_InterleaveHI32x2;
11937         opCatE = Iop_InterleaveLO32x2;
11938      }
11939
11940      do_MMX_preamble();
11941      assign( dV, getMMXReg(gregOfRM(modrm)) );
11942
11943      if (epartIsReg(modrm)) {
11944         assign( sV, getMMXReg(eregOfRM(modrm)) );
11945         delta += 3+1;
11946         DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
11947                                  nameMMXReg(gregOfRM(modrm)));
11948      } else {
11949         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11950         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
11951         delta += 3+alen;
11952         DIP("ph%s %s,%s\n", str, dis_buf,
11953                                  nameMMXReg(gregOfRM(modrm)));
11954      }
11955
11956      putMMXReg(
11957         gregOfRM(modrm),
11958         binop(opV64,
11959               binop(opCatE,mkexpr(sV),mkexpr(dV)),
11960               binop(opCatO,mkexpr(sV),mkexpr(dV))
11961         )
11962      );
11963      goto decode_success;
11964   }
11965
11966   /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
11967      xmm) and G to G (xmm). */
11968   /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
11969      xmm) and G to G (xmm). */
11970   /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
11971      G to G (xmm). */
11972   /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
11973      G to G (xmm). */
11974   /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
11975      G to G (xmm). */
11976   /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
11977      G to G (xmm). */
11978
11979   if (sz == 2
11980       && insn[0] == 0x0F && insn[1] == 0x38
11981       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
11982           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
11983      HChar* str    = "???";
11984      IROp   opV64  = Iop_INVALID;
11985      IROp   opCatO = Iop_CatOddLanes16x4;
11986      IROp   opCatE = Iop_CatEvenLanes16x4;
11987      IRTemp sV     = newTemp(Ity_V128);
11988      IRTemp dV     = newTemp(Ity_V128);
11989      IRTemp sHi    = newTemp(Ity_I64);
11990      IRTemp sLo    = newTemp(Ity_I64);
11991      IRTemp dHi    = newTemp(Ity_I64);
11992      IRTemp dLo    = newTemp(Ity_I64);
11993
11994      modrm = insn[3];
11995
11996      switch (insn[2]) {
11997         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
11998         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
11999         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
12000         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
12001         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
12002         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
12003         default: vassert(0);
12004      }
12005      if (insn[2] == 0x02 || insn[2] == 0x06) {
12006         opCatO = Iop_InterleaveHI32x2;
12007         opCatE = Iop_InterleaveLO32x2;
12008      }
12009
12010      assign( dV, getXMMReg(gregOfRM(modrm)) );
12011
12012      if (epartIsReg(modrm)) {
12013         assign( sV, getXMMReg( eregOfRM(modrm)) );
12014         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12015                                  nameXMMReg(gregOfRM(modrm)));
12016         delta += 3+1;
12017      } else {
12018         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12019         gen_SEGV_if_not_16_aligned( addr );
12020         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12021         DIP("ph%s %s,%s\n", str, dis_buf,
12022                             nameXMMReg(gregOfRM(modrm)));
12023         delta += 3+alen;
12024      }
12025
12026      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12027      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12028      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12029      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12030
12031      /* This isn't a particularly efficient way to compute the
12032         result, but at least it avoids a proliferation of IROps,
12033         hence avoids complication all the backends. */
12034      putXMMReg(
12035         gregOfRM(modrm),
12036         binop(Iop_64HLtoV128,
12037               binop(opV64,
12038                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
12039                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
12040               ),
12041               binop(opV64,
12042                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
12043                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
12044               )
12045         )
12046      );
12047      goto decode_success;
12048   }
12049
12050   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
12051      (MMX) */
12052   if (sz == 4
12053       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12054      IRTemp sV = newTemp(Ity_I64);
12055      IRTemp dV = newTemp(Ity_I64);
12056
12057      modrm = insn[3];
12058      do_MMX_preamble();
12059      assign( dV, getMMXReg(gregOfRM(modrm)) );
12060
12061      if (epartIsReg(modrm)) {
12062         assign( sV, getMMXReg(eregOfRM(modrm)) );
12063         delta += 3+1;
12064         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12065                                 nameMMXReg(gregOfRM(modrm)));
12066      } else {
12067         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12068         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12069         delta += 3+alen;
12070         DIP("pmulhrsw %s,%s\n", dis_buf,
12071                                 nameMMXReg(gregOfRM(modrm)));
12072      }
12073
12074      putMMXReg(
12075         gregOfRM(modrm),
12076         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
12077      );
12078      goto decode_success;
12079   }
12080
12081   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
12082      Scale (XMM) */
12083   if (sz == 2
12084       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12085      IRTemp sV  = newTemp(Ity_V128);
12086      IRTemp dV  = newTemp(Ity_V128);
12087      IRTemp sHi = newTemp(Ity_I64);
12088      IRTemp sLo = newTemp(Ity_I64);
12089      IRTemp dHi = newTemp(Ity_I64);
12090      IRTemp dLo = newTemp(Ity_I64);
12091
12092      modrm = insn[3];
12093      assign( dV, getXMMReg(gregOfRM(modrm)) );
12094
12095      if (epartIsReg(modrm)) {
12096         assign( sV, getXMMReg(eregOfRM(modrm)) );
12097         delta += 3+1;
12098         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12099                                 nameXMMReg(gregOfRM(modrm)));
12100      } else {
12101         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12102         gen_SEGV_if_not_16_aligned( addr );
12103         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12104         delta += 3+alen;
12105         DIP("pmulhrsw %s,%s\n", dis_buf,
12106                                 nameXMMReg(gregOfRM(modrm)));
12107      }
12108
12109      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12110      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12111      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12112      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12113
12114      putXMMReg(
12115         gregOfRM(modrm),
12116         binop(Iop_64HLtoV128,
12117               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
12118               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
12119         )
12120      );
12121      goto decode_success;
12122   }
12123
12124   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
12125   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
12126   /* 0F 38 09 = PSIGND -- Packed Sign 32x2 (MMX) */
12127   if (sz == 4
12128       && insn[0] == 0x0F && insn[1] == 0x38
12129       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12130      IRTemp sV      = newTemp(Ity_I64);
12131      IRTemp dV      = newTemp(Ity_I64);
12132      HChar* str     = "???";
12133      Int    laneszB = 0;
12134
12135      switch (insn[2]) {
12136         case 0x08: laneszB = 1; str = "b"; break;
12137         case 0x09: laneszB = 2; str = "w"; break;
12138         case 0x0A: laneszB = 4; str = "d"; break;
12139         default: vassert(0);
12140      }
12141
12142      modrm = insn[3];
12143      do_MMX_preamble();
12144      assign( dV, getMMXReg(gregOfRM(modrm)) );
12145
12146      if (epartIsReg(modrm)) {
12147         assign( sV, getMMXReg(eregOfRM(modrm)) );
12148         delta += 3+1;
12149         DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12150                                     nameMMXReg(gregOfRM(modrm)));
12151      } else {
12152         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12153         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12154         delta += 3+alen;
12155         DIP("psign%s %s,%s\n", str, dis_buf,
12156                                     nameMMXReg(gregOfRM(modrm)));
12157      }
12158
12159      putMMXReg(
12160         gregOfRM(modrm),
12161         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
12162      );
12163      goto decode_success;
12164   }
12165
12166   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
12167   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
12168   /* 66 0F 38 09 = PSIGND -- Packed Sign 32x4 (XMM) */
12169   if (sz == 2
12170       && insn[0] == 0x0F && insn[1] == 0x38
12171       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12172      IRTemp sV      = newTemp(Ity_V128);
12173      IRTemp dV      = newTemp(Ity_V128);
12174      IRTemp sHi     = newTemp(Ity_I64);
12175      IRTemp sLo     = newTemp(Ity_I64);
12176      IRTemp dHi     = newTemp(Ity_I64);
12177      IRTemp dLo     = newTemp(Ity_I64);
12178      HChar* str     = "???";
12179      Int    laneszB = 0;
12180
12181      switch (insn[2]) {
12182         case 0x08: laneszB = 1; str = "b"; break;
12183         case 0x09: laneszB = 2; str = "w"; break;
12184         case 0x0A: laneszB = 4; str = "d"; break;
12185         default: vassert(0);
12186      }
12187
12188      modrm = insn[3];
12189      assign( dV, getXMMReg(gregOfRM(modrm)) );
12190
12191      if (epartIsReg(modrm)) {
12192         assign( sV, getXMMReg(eregOfRM(modrm)) );
12193         delta += 3+1;
12194         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12195                                     nameXMMReg(gregOfRM(modrm)));
12196      } else {
12197         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12198         gen_SEGV_if_not_16_aligned( addr );
12199         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12200         delta += 3+alen;
12201         DIP("psign%s %s,%s\n", str, dis_buf,
12202                                     nameXMMReg(gregOfRM(modrm)));
12203      }
12204
12205      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12206      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12207      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12208      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12209
12210      putXMMReg(
12211         gregOfRM(modrm),
12212         binop(Iop_64HLtoV128,
12213               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
12214               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
12215         )
12216      );
12217      goto decode_success;
12218   }
12219
12220   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
12221   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
12222   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
12223   if (sz == 4
12224       && insn[0] == 0x0F && insn[1] == 0x38
12225       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12226      IRTemp sV      = newTemp(Ity_I64);
12227      HChar* str     = "???";
12228      Int    laneszB = 0;
12229
12230      switch (insn[2]) {
12231         case 0x1C: laneszB = 1; str = "b"; break;
12232         case 0x1D: laneszB = 2; str = "w"; break;
12233         case 0x1E: laneszB = 4; str = "d"; break;
12234         default: vassert(0);
12235      }
12236
12237      modrm = insn[3];
12238      do_MMX_preamble();
12239
12240      if (epartIsReg(modrm)) {
12241         assign( sV, getMMXReg(eregOfRM(modrm)) );
12242         delta += 3+1;
12243         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12244                                    nameMMXReg(gregOfRM(modrm)));
12245      } else {
12246         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12247         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12248         delta += 3+alen;
12249         DIP("pabs%s %s,%s\n", str, dis_buf,
12250                                    nameMMXReg(gregOfRM(modrm)));
12251      }
12252
12253      putMMXReg(
12254         gregOfRM(modrm),
12255         dis_PABS_helper( mkexpr(sV), laneszB )
12256      );
12257      goto decode_success;
12258   }
12259
12260   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
12261   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
12262   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
12263   if (sz == 2
12264       && insn[0] == 0x0F && insn[1] == 0x38
12265       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12266      IRTemp sV      = newTemp(Ity_V128);
12267      IRTemp sHi     = newTemp(Ity_I64);
12268      IRTemp sLo     = newTemp(Ity_I64);
12269      HChar* str     = "???";
12270      Int    laneszB = 0;
12271
12272      switch (insn[2]) {
12273         case 0x1C: laneszB = 1; str = "b"; break;
12274         case 0x1D: laneszB = 2; str = "w"; break;
12275         case 0x1E: laneszB = 4; str = "d"; break;
12276         default: vassert(0);
12277      }
12278
12279      modrm = insn[3];
12280
12281      if (epartIsReg(modrm)) {
12282         assign( sV, getXMMReg(eregOfRM(modrm)) );
12283         delta += 3+1;
12284         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12285                                    nameXMMReg(gregOfRM(modrm)));
12286      } else {
12287         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12288         gen_SEGV_if_not_16_aligned( addr );
12289         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12290         delta += 3+alen;
12291         DIP("pabs%s %s,%s\n", str, dis_buf,
12292                                    nameXMMReg(gregOfRM(modrm)));
12293      }
12294
12295      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12296      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12297
12298      putXMMReg(
12299         gregOfRM(modrm),
12300         binop(Iop_64HLtoV128,
12301               dis_PABS_helper( mkexpr(sHi), laneszB ),
12302               dis_PABS_helper( mkexpr(sLo), laneszB )
12303         )
12304      );
12305      goto decode_success;
12306   }
12307
12308   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
12309   if (sz == 4
12310       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12311      IRTemp sV  = newTemp(Ity_I64);
12312      IRTemp dV  = newTemp(Ity_I64);
12313      IRTemp res = newTemp(Ity_I64);
12314
12315      modrm = insn[3];
12316      do_MMX_preamble();
12317      assign( dV, getMMXReg(gregOfRM(modrm)) );
12318
12319      if (epartIsReg(modrm)) {
12320         assign( sV, getMMXReg(eregOfRM(modrm)) );
12321         d32 = (UInt)insn[3+1];
12322         delta += 3+1+1;
12323         DIP("palignr $%d,%s,%s\n",  (Int)d32,
12324                                     nameMMXReg(eregOfRM(modrm)),
12325                                     nameMMXReg(gregOfRM(modrm)));
12326      } else {
12327         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12328         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12329         d32 = (UInt)insn[3+alen];
12330         delta += 3+alen+1;
12331         DIP("palignr $%d%s,%s\n", (Int)d32,
12332                                   dis_buf,
12333                                   nameMMXReg(gregOfRM(modrm)));
12334      }
12335
12336      if (d32 == 0) {
12337         assign( res, mkexpr(sV) );
12338      }
12339      else if (d32 >= 1 && d32 <= 7) {
12340         assign(res,
12341                binop(Iop_Or64,
12342                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
12343                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
12344                     )));
12345      }
12346      else if (d32 == 8) {
12347        assign( res, mkexpr(dV) );
12348      }
12349      else if (d32 >= 9 && d32 <= 15) {
12350         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
12351      }
12352      else if (d32 >= 16 && d32 <= 255) {
12353         assign( res, mkU64(0) );
12354      }
12355      else
12356         vassert(0);
12357
12358      putMMXReg( gregOfRM(modrm), mkexpr(res) );
12359      goto decode_success;
12360   }
12361
12362   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
12363   if (sz == 2
12364       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12365      IRTemp sV  = newTemp(Ity_V128);
12366      IRTemp dV  = newTemp(Ity_V128);
12367      IRTemp sHi = newTemp(Ity_I64);
12368      IRTemp sLo = newTemp(Ity_I64);
12369      IRTemp dHi = newTemp(Ity_I64);
12370      IRTemp dLo = newTemp(Ity_I64);
12371      IRTemp rHi = newTemp(Ity_I64);
12372      IRTemp rLo = newTemp(Ity_I64);
12373
12374      modrm = insn[3];
12375      assign( dV, getXMMReg(gregOfRM(modrm)) );
12376
12377      if (epartIsReg(modrm)) {
12378         assign( sV, getXMMReg(eregOfRM(modrm)) );
12379         d32 = (UInt)insn[3+1];
12380         delta += 3+1+1;
12381         DIP("palignr $%d,%s,%s\n", (Int)d32,
12382                                    nameXMMReg(eregOfRM(modrm)),
12383                                    nameXMMReg(gregOfRM(modrm)));
12384      } else {
12385         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12386         gen_SEGV_if_not_16_aligned( addr );
12387         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12388         d32 = (UInt)insn[3+alen];
12389         delta += 3+alen+1;
12390         DIP("palignr $%d,%s,%s\n", (Int)d32,
12391                                    dis_buf,
12392                                    nameXMMReg(gregOfRM(modrm)));
12393      }
12394
12395      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12396      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12397      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12398      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12399
12400      if (d32 == 0) {
12401         assign( rHi, mkexpr(sHi) );
12402         assign( rLo, mkexpr(sLo) );
12403      }
12404      else if (d32 >= 1 && d32 <= 7) {
12405         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
12406         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
12407      }
12408      else if (d32 == 8) {
12409         assign( rHi, mkexpr(dLo) );
12410         assign( rLo, mkexpr(sHi) );
12411      }
12412      else if (d32 >= 9 && d32 <= 15) {
12413         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
12414         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
12415      }
12416      else if (d32 == 16) {
12417         assign( rHi, mkexpr(dHi) );
12418         assign( rLo, mkexpr(dLo) );
12419      }
12420      else if (d32 >= 17 && d32 <= 23) {
12421         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
12422         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
12423      }
12424      else if (d32 == 24) {
12425         assign( rHi, mkU64(0) );
12426         assign( rLo, mkexpr(dHi) );
12427      }
12428      else if (d32 >= 25 && d32 <= 31) {
12429         assign( rHi, mkU64(0) );
12430         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
12431      }
12432      else if (d32 >= 32 && d32 <= 255) {
12433         assign( rHi, mkU64(0) );
12434         assign( rLo, mkU64(0) );
12435      }
12436      else
12437         vassert(0);
12438
12439      putXMMReg(
12440         gregOfRM(modrm),
12441         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12442      );
12443      goto decode_success;
12444   }
12445
12446   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
12447   if (sz == 4
12448       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12449      IRTemp sV      = newTemp(Ity_I64);
12450      IRTemp dV      = newTemp(Ity_I64);
12451
12452      modrm = insn[3];
12453      do_MMX_preamble();
12454      assign( dV, getMMXReg(gregOfRM(modrm)) );
12455
12456      if (epartIsReg(modrm)) {
12457         assign( sV, getMMXReg(eregOfRM(modrm)) );
12458         delta += 3+1;
12459         DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12460                               nameMMXReg(gregOfRM(modrm)));
12461      } else {
12462         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12463         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12464         delta += 3+alen;
12465         DIP("pshufb %s,%s\n", dis_buf,
12466                               nameMMXReg(gregOfRM(modrm)));
12467      }
12468
12469      putMMXReg(
12470         gregOfRM(modrm),
12471         binop(
12472            Iop_And64,
12473            /* permute the lanes */
12474            binop(
12475               Iop_Perm8x8,
12476               mkexpr(dV),
12477               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
12478            ),
12479            /* mask off lanes which have (index & 0x80) == 0x80 */
12480            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
12481         )
12482      );
12483      goto decode_success;
12484   }
12485
12486   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
12487   if (sz == 2
12488       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12489      IRTemp sV         = newTemp(Ity_V128);
12490      IRTemp dV         = newTemp(Ity_V128);
12491      IRTemp sHi        = newTemp(Ity_I64);
12492      IRTemp sLo        = newTemp(Ity_I64);
12493      IRTemp dHi        = newTemp(Ity_I64);
12494      IRTemp dLo        = newTemp(Ity_I64);
12495      IRTemp rHi        = newTemp(Ity_I64);
12496      IRTemp rLo        = newTemp(Ity_I64);
12497      IRTemp sevens     = newTemp(Ity_I64);
12498      IRTemp mask0x80hi = newTemp(Ity_I64);
12499      IRTemp mask0x80lo = newTemp(Ity_I64);
12500      IRTemp maskBit3hi = newTemp(Ity_I64);
12501      IRTemp maskBit3lo = newTemp(Ity_I64);
12502      IRTemp sAnd7hi    = newTemp(Ity_I64);
12503      IRTemp sAnd7lo    = newTemp(Ity_I64);
12504      IRTemp permdHi    = newTemp(Ity_I64);
12505      IRTemp permdLo    = newTemp(Ity_I64);
12506
12507      modrm = insn[3];
12508      assign( dV, getXMMReg(gregOfRM(modrm)) );
12509
12510      if (epartIsReg(modrm)) {
12511         assign( sV, getXMMReg(eregOfRM(modrm)) );
12512         delta += 3+1;
12513         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12514                               nameXMMReg(gregOfRM(modrm)));
12515      } else {
12516         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12517         gen_SEGV_if_not_16_aligned( addr );
12518         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12519         delta += 3+alen;
12520         DIP("pshufb %s,%s\n", dis_buf,
12521                               nameXMMReg(gregOfRM(modrm)));
12522      }
12523
12524      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12525      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12526      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12527      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12528
12529      assign( sevens, mkU64(0x0707070707070707ULL) );
12530
12531      /*
12532      mask0x80hi = Not(SarN8x8(sHi,7))
12533      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
12534      sAnd7hi    = And(sHi,sevens)
12535      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
12536                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
12537      rHi        = And(permdHi,mask0x80hi)
12538      */
12539      assign(
12540         mask0x80hi,
12541         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
12542
12543      assign(
12544         maskBit3hi,
12545         binop(Iop_SarN8x8,
12546               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
12547               mkU8(7)));
12548
12549      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
12550
12551      assign(
12552         permdHi,
12553         binop(
12554            Iop_Or64,
12555            binop(Iop_And64,
12556                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
12557                  mkexpr(maskBit3hi)),
12558            binop(Iop_And64,
12559                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
12560                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));
12561
12562      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
12563
12564      /* And the same for the lower half of the result.  What fun. */
12565
12566      assign(
12567         mask0x80lo,
12568         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
12569
12570      assign(
12571         maskBit3lo,
12572         binop(Iop_SarN8x8,
12573               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
12574               mkU8(7)));
12575
12576      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
12577
12578      assign(
12579         permdLo,
12580         binop(
12581            Iop_Or64,
12582            binop(Iop_And64,
12583                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
12584                  mkexpr(maskBit3lo)),
12585            binop(Iop_And64,
12586                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
12587                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));
12588
12589      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
12590
12591      putXMMReg(
12592         gregOfRM(modrm),
12593         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12594      );
12595      goto decode_success;
12596   }
12597
12598   /* ---------------------------------------------------- */
12599   /* --- end of the SSSE3 decoder.                    --- */
12600   /* ---------------------------------------------------- */
12601
12602   /* ---------------------------------------------------- */
12603   /* --- start of the SSE4 decoder                    --- */
12604   /* ---------------------------------------------------- */
12605
12606   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
12607      (Partial implementation only -- only deal with cases where
12608      the rounding mode is specified directly by the immediate byte.)
12609      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
12610      (Limitations ditto)
12611   */
12612   if (sz == 2
12613       && insn[0] == 0x0F && insn[1] == 0x3A
12614       && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
12615
12616      Bool   isD = insn[2] == 0x0B;
12617      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
12618      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
12619      Int    imm = 0;
12620
12621      modrm = insn[3];
12622
12623      if (epartIsReg(modrm)) {
12624         assign( src,
12625                 isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
12626                     : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
12627         imm = insn[3+1];
12628         if (imm & ~3) goto decode_failure;
12629         delta += 3+1+1;
12630         DIP( "rounds%c $%d,%s,%s\n",
12631              isD ? 'd' : 's',
12632              imm, nameXMMReg( eregOfRM(modrm) ),
12633                   nameXMMReg( gregOfRM(modrm) ) );
12634      } else {
12635         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12636         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
12637         imm = insn[3+alen];
12638         if (imm & ~3) goto decode_failure;
12639         delta += 3+alen+1;
12640         DIP( "roundsd $%d,%s,%s\n",
12641              imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
12642      }
12643
12644      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
12645         that encoding is the same as the encoding for IRRoundingMode,
12646         we can use that value directly in the IR as a rounding
12647         mode. */
12648      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
12649                  mkU32(imm & 3), mkexpr(src)) );
12650
12651      if (isD)
12652         putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
12653      else
12654         putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
12655
12656      goto decode_success;
12657   }
12658
12659   /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
12660      which we can only decode if we're sure this is an AMD cpu that
12661      supports LZCNT, since otherwise it's BSR, which behaves
12662      differently. */
12663   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
12664       && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
12665      vassert(sz == 2 || sz == 4);
12666      /*IRType*/ ty  = szToITy(sz);
12667      IRTemp     src = newTemp(ty);
12668      modrm = insn[3];
12669      if (epartIsReg(modrm)) {
12670         assign(src, getIReg(sz, eregOfRM(modrm)));
12671         delta += 3+1;
12672         DIP("lzcnt%c %s, %s\n", nameISize(sz),
12673             nameIReg(sz, eregOfRM(modrm)),
12674             nameIReg(sz, gregOfRM(modrm)));
12675      } else {
12676         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12677         assign(src, loadLE(ty, mkexpr(addr)));
12678         delta += 3+alen;
12679         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
12680             nameIReg(sz, gregOfRM(modrm)));
12681      }
12682
12683      IRTemp res = gen_LZCNT(ty, src);
12684      putIReg(sz, gregOfRM(modrm), mkexpr(res));
12685
12686      // Update flags.  This is pretty lame .. perhaps can do better
12687      // if this turns out to be performance critical.
12688      // O S A P are cleared.  Z is set if RESULT == 0.
12689      // C is set if SRC is zero.
12690      IRTemp src32 = newTemp(Ity_I32);
12691      IRTemp res32 = newTemp(Ity_I32);
12692      assign(src32, widenUto32(mkexpr(src)));
12693      assign(res32, widenUto32(mkexpr(res)));
12694
12695      IRTemp oszacp = newTemp(Ity_I32);
12696      assign(
12697         oszacp,
12698         binop(Iop_Or32,
12699               binop(Iop_Shl32,
12700                     unop(Iop_1Uto32,
12701                          binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
12702                     mkU8(X86G_CC_SHIFT_Z)),
12703               binop(Iop_Shl32,
12704                     unop(Iop_1Uto32,
12705                          binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
12706                     mkU8(X86G_CC_SHIFT_C))
12707         )
12708      );
12709
12710      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12711      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12712      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12713      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
12714
12715      goto decode_success;
12716   }
12717
12718   /* ---------------------------------------------------- */
12719   /* --- end of the SSE4 decoder                      --- */
12720   /* ---------------------------------------------------- */
12721
12722   after_sse_decoders:
12723
12724   /* ---------------------------------------------------- */
12725   /* --- deal with misc 0x67 pfxs (addr size override) -- */
12726   /* ---------------------------------------------------- */
12727
12728   /* 67 E3 = JCXZ (for JECXZ see below) */
12729   if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
12730      delta += 2;
12731      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
12732      delta ++;
12733      stmt( IRStmt_Exit(
12734               binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
12735               Ijk_Boring,
12736               IRConst_U32(d32)
12737            ));
12738       DIP("jcxz 0x%x\n", d32);
12739       goto decode_success;
12740   }
12741
12742   /* ---------------------------------------------------- */
12743   /* --- start of the baseline insn decoder            -- */
12744   /* ---------------------------------------------------- */
12745
12746   /* Get the primary opcode. */
12747   opc = getIByte(delta); delta++;
12748
12749   /* We get here if the current insn isn't SSE, or this CPU doesn't
12750      support SSE. */
12751
12752   switch (opc) {
12753
12754   /* ------------------------ Control flow --------------- */
12755
12756   case 0xC2: /* RET imm16 */
12757      d32 = getUDisp16(delta);
12758      delta += 2;
12759      dis_ret(d32);
12760      dres.whatNext = Dis_StopHere;
12761      DIP("ret %d\n", (Int)d32);
12762      break;
12763   case 0xC3: /* RET */
12764      dis_ret(0);
12765      dres.whatNext = Dis_StopHere;
12766      DIP("ret\n");
12767      break;
12768
12769   case 0xCF: /* IRET */
12770      /* Note, this is an extremely kludgey and limited implementation
12771         of iret.  All it really does is:
12772            popl %EIP; popl %CS; popl %EFLAGS.
12773         %CS is set but ignored (as it is in (eg) popw %cs)". */
12774      t1 = newTemp(Ity_I32); /* ESP */
12775      t2 = newTemp(Ity_I32); /* new EIP */
12776      t3 = newTemp(Ity_I32); /* new CS */
12777      t4 = newTemp(Ity_I32); /* new EFLAGS */
12778      assign(t1, getIReg(4,R_ESP));
12779      assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
12780      assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
12781      assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
12782      /* Get stuff off stack */
12783      putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
12784      /* set %CS (which is ignored anyway) */
12785      putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
12786      /* set %EFLAGS */
12787      set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
12788      /* goto new EIP value */
12789      jmp_treg(Ijk_Ret,t2);
12790      dres.whatNext = Dis_StopHere;
12791      DIP("iret (very kludgey)\n");
12792      break;
12793
12794   case 0xE8: /* CALL J4 */
12795      d32 = getUDisp32(delta); delta += 4;
12796      d32 += (guest_EIP_bbstart+delta);
12797      /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
12798      if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
12799                                         && getIByte(delta) <= 0x5F) {
12800         /* Specially treat the position-independent-code idiom
12801                 call X
12802              X: popl %reg
12803            as
12804                 movl %eip, %reg.
12805            since this generates better code, but for no other reason. */
12806         Int archReg = getIByte(delta) - 0x58;
12807         /* vex_printf("-- fPIC thingy\n"); */
12808         putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
12809         delta++; /* Step over the POP */
12810         DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
12811      } else {
12812         /* The normal sequence for a call. */
12813         t1 = newTemp(Ity_I32);
12814         assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
12815         putIReg(4, R_ESP, mkexpr(t1));
12816         storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
12817         if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
12818            /* follow into the call target. */
12819            dres.whatNext   = Dis_ResteerU;
12820            dres.continueAt = (Addr64)(Addr32)d32;
12821         } else {
12822            jmp_lit(Ijk_Call,d32);
12823            dres.whatNext = Dis_StopHere;
12824         }
12825         DIP("call 0x%x\n",d32);
12826      }
12827      break;
12828
12829//--    case 0xC8: /* ENTER */
12830//--       d32 = getUDisp16(eip); eip += 2;
12831//--       abyte = getIByte(delta); delta++;
12832//--
12833//--       vg_assert(sz == 4);
12834//--       vg_assert(abyte == 0);
12835//--
12836//--       t1 = newTemp(cb); t2 = newTemp(cb);
12837//--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
12838//--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
12839//--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
12840//--       uLiteral(cb, sz);
12841//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
12842//--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
12843//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
12844//--       if (d32) {
12845//--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
12846//--          uLiteral(cb, d32);
12847//--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
12848//--       }
12849//--       DIP("enter 0x%x, 0x%x", d32, abyte);
12850//--       break;
12851
12852   case 0xC9: /* LEAVE */
12853      vassert(sz == 4);
12854      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
12855      assign(t1, getIReg(4,R_EBP));
12856      /* First PUT ESP looks redundant, but need it because ESP must
12857         always be up-to-date for Memcheck to work... */
12858      putIReg(4, R_ESP, mkexpr(t1));
12859      assign(t2, loadLE(Ity_I32,mkexpr(t1)));
12860      putIReg(4, R_EBP, mkexpr(t2));
12861      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
12862      DIP("leave\n");
12863      break;
12864
12865   /* ---------------- Misc weird-ass insns --------------- */
12866
12867   case 0x27: /* DAA */
12868   case 0x2F: /* DAS */
12869   case 0x37: /* AAA */
12870   case 0x3F: /* AAS */
12871      /* An ugly implementation for some ugly instructions.  Oh
12872	 well. */
12873      if (sz != 4) goto decode_failure;
12874      t1 = newTemp(Ity_I32);
12875      t2 = newTemp(Ity_I32);
12876      /* Make up a 32-bit value (t1), with the old value of AX in the
12877         bottom 16 bits, and the old OSZACP bitmask in the upper 16
12878         bits. */
12879      assign(t1,
12880             binop(Iop_16HLto32,
12881                   unop(Iop_32to16,
12882                        mk_x86g_calculate_eflags_all()),
12883                   getIReg(2, R_EAX)
12884            ));
12885      /* Call the helper fn, to get a new AX and OSZACP value, and
12886         poke both back into the guest state.  Also pass the helper
12887         the actual opcode so it knows which of the 4 instructions it
12888         is doing the computation for. */
12889      vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
12890      assign(t2,
12891              mkIRExprCCall(
12892                 Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
12893                 &x86g_calculate_daa_das_aaa_aas,
12894                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
12895            ));
12896     putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
12897
12898     stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12899     stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12900     stmt( IRStmt_Put( OFFB_CC_DEP1,
12901                       binop(Iop_And32,
12902                             binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
12903                             mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
12904                                    | X86G_CC_MASK_A | X86G_CC_MASK_Z
12905                                    | X86G_CC_MASK_S| X86G_CC_MASK_O )
12906                            )
12907                      )
12908         );
12909     /* Set NDEP even though it isn't used.  This makes redundant-PUT
12910        elimination of previous stores to this field work better. */
12911     stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12912     switch (opc) {
12913        case 0x27: DIP("daa\n"); break;
12914        case 0x2F: DIP("das\n"); break;
12915        case 0x37: DIP("aaa\n"); break;
12916        case 0x3F: DIP("aas\n"); break;
12917        default: vassert(0);
12918     }
12919     break;
12920
12921   case 0xD4: /* AAM */
12922   case 0xD5: /* AAD */
12923      d32 = getIByte(delta); delta++;
12924      if (sz != 4 || d32 != 10) goto decode_failure;
12925      t1 = newTemp(Ity_I32);
12926      t2 = newTemp(Ity_I32);
12927      /* Make up a 32-bit value (t1), with the old value of AX in the
12928         bottom 16 bits, and the old OSZACP bitmask in the upper 16
12929         bits. */
12930      assign(t1,
12931             binop(Iop_16HLto32,
12932                   unop(Iop_32to16,
12933                        mk_x86g_calculate_eflags_all()),
12934                   getIReg(2, R_EAX)
12935            ));
12936      /* Call the helper fn, to get a new AX and OSZACP value, and
12937         poke both back into the guest state.  Also pass the helper
12938         the actual opcode so it knows which of the 2 instructions it
12939         is doing the computation for. */
12940      assign(t2,
12941              mkIRExprCCall(
12942                 Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
12943                 &x86g_calculate_aad_aam,
12944                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
12945            ));
12946      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
12947
12948      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12949      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12950      stmt( IRStmt_Put( OFFB_CC_DEP1,
12951                        binop(Iop_And32,
12952                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
12953                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
12954                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
12955                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
12956                             )
12957                       )
12958          );
12959      /* Set NDEP even though it isn't used.  This makes
12960         redundant-PUT elimination of previous stores to this field
12961         work better. */
12962      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12963
12964      DIP(opc == 0xD4 ? "aam\n" : "aad\n");
12965      break;
12966
12967   /* ------------------------ CWD/CDQ -------------------- */
12968
12969   case 0x98: /* CBW */
12970      if (sz == 4) {
12971         putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
12972         DIP("cwde\n");
12973      } else {
12974         vassert(sz == 2);
12975         putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
12976         DIP("cbw\n");
12977      }
12978      break;
12979
12980   case 0x99: /* CWD/CDQ */
12981      ty = szToITy(sz);
12982      putIReg(sz, R_EDX,
12983                  binop(mkSizedOp(ty,Iop_Sar8),
12984                        getIReg(sz, R_EAX),
12985                        mkU8(sz == 2 ? 15 : 31)) );
12986      DIP(sz == 2 ? "cwdq\n" : "cdqq\n");
12987      break;
12988
12989   /* ------------------------ FPU ops -------------------- */
12990
12991   case 0x9E: /* SAHF */
12992      codegen_SAHF();
12993      DIP("sahf\n");
12994      break;
12995
12996   case 0x9F: /* LAHF */
12997      codegen_LAHF();
12998      DIP("lahf\n");
12999      break;
13000
13001   case 0x9B: /* FWAIT */
13002      /* ignore? */
13003      DIP("fwait\n");
13004      break;
13005
13006   case 0xD8:
13007   case 0xD9:
13008   case 0xDA:
13009   case 0xDB:
13010   case 0xDC:
13011   case 0xDD:
13012   case 0xDE:
13013   case 0xDF: {
13014      Int  delta0    = delta;
13015      Bool decode_OK = False;
13016      delta = dis_FPU ( &decode_OK, sorb, delta );
13017      if (!decode_OK) {
13018         delta = delta0;
13019         goto decode_failure;
13020      }
13021      break;
13022   }
13023
13024   /* ------------------------ INC & DEC ------------------ */
13025
13026   case 0x40: /* INC eAX */
13027   case 0x41: /* INC eCX */
13028   case 0x42: /* INC eDX */
13029   case 0x43: /* INC eBX */
13030   case 0x44: /* INC eSP */
13031   case 0x45: /* INC eBP */
13032   case 0x46: /* INC eSI */
13033   case 0x47: /* INC eDI */
13034      vassert(sz == 2 || sz == 4);
13035      ty = szToITy(sz);
13036      t1 = newTemp(ty);
13037      assign( t1, binop(mkSizedOp(ty,Iop_Add8),
13038                        getIReg(sz, (UInt)(opc - 0x40)),
13039                        mkU(ty,1)) );
13040      setFlags_INC_DEC( True, t1, ty );
13041      putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
13042      DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
13043      break;
13044
13045   case 0x48: /* DEC eAX */
13046   case 0x49: /* DEC eCX */
13047   case 0x4A: /* DEC eDX */
13048   case 0x4B: /* DEC eBX */
13049   case 0x4C: /* DEC eSP */
13050   case 0x4D: /* DEC eBP */
13051   case 0x4E: /* DEC eSI */
13052   case 0x4F: /* DEC eDI */
13053      vassert(sz == 2 || sz == 4);
13054      ty = szToITy(sz);
13055      t1 = newTemp(ty);
13056      assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
13057                        getIReg(sz, (UInt)(opc - 0x48)),
13058                        mkU(ty,1)) );
13059      setFlags_INC_DEC( False, t1, ty );
13060      putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
13061      DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
13062      break;
13063
13064   /* ------------------------ INT ------------------------ */
13065
13066   case 0xCC: /* INT 3 */
13067      jmp_lit(Ijk_SigTRAP,((Addr32)guest_EIP_bbstart)+delta);
13068      dres.whatNext = Dis_StopHere;
13069      DIP("int $0x3\n");
13070      break;
13071
13072   case 0xCD: /* INT imm8 */
13073      d32 = getIByte(delta); delta++;
13074
13075      /* For any of the cases where we emit a jump (that is, for all
13076         currently handled cases), it's important that all ArchRegs
13077         carry their up-to-date value at this point.  So we declare an
13078         end-of-block here, which forces any TempRegs caching ArchRegs
13079         to be flushed. */
13080
13081      /* Handle int $0x40 .. $0x43 by synthesising a segfault and a
13082         restart of this instruction (hence the "-2" two lines below,
13083         to get the restart EIP to be this instruction.  This is
13084         probably Linux-specific and it would be more correct to only
13085         do this if the VexAbiInfo says that is what we should do. */
13086      if (d32 >= 0x40 && d32 <= 0x43) {
13087         jmp_lit(Ijk_SigSEGV,((Addr32)guest_EIP_bbstart)+delta-2);
13088         dres.whatNext = Dis_StopHere;
13089         DIP("int $0x%x\n", (Int)d32);
13090         break;
13091      }
13092
13093      /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
13094         (darwin syscalls).  As part of this, note where we are, so we
13095         can back up the guest to this point if the syscall needs to
13096         be restarted. */
13097      if (d32 == 0x80) {
13098         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13099                           mkU32(guest_EIP_curr_instr) ) );
13100         jmp_lit(Ijk_Sys_int128,((Addr32)guest_EIP_bbstart)+delta);
13101         dres.whatNext = Dis_StopHere;
13102         DIP("int $0x80\n");
13103         break;
13104      }
13105      if (d32 == 0x81) {
13106         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13107                           mkU32(guest_EIP_curr_instr) ) );
13108         jmp_lit(Ijk_Sys_int129,((Addr32)guest_EIP_bbstart)+delta);
13109         dres.whatNext = Dis_StopHere;
13110         DIP("int $0x81\n");
13111         break;
13112      }
13113      if (d32 == 0x82) {
13114         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13115                           mkU32(guest_EIP_curr_instr) ) );
13116         jmp_lit(Ijk_Sys_int130,((Addr32)guest_EIP_bbstart)+delta);
13117         dres.whatNext = Dis_StopHere;
13118         DIP("int $0x82\n");
13119         break;
13120      }
13121
13122      /* none of the above */
13123      goto decode_failure;
13124
13125   /* ------------------------ Jcond, byte offset --------- */
13126
13127   case 0xEB: /* Jb (jump, byte offset) */
13128      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13129      delta++;
13130      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13131         dres.whatNext   = Dis_ResteerU;
13132         dres.continueAt = (Addr64)(Addr32)d32;
13133      } else {
13134         jmp_lit(Ijk_Boring,d32);
13135         dres.whatNext = Dis_StopHere;
13136      }
13137      DIP("jmp-8 0x%x\n", d32);
13138      break;
13139
13140   case 0xE9: /* Jv (jump, 16/32 offset) */
13141      vassert(sz == 4); /* JRS added 2004 July 11 */
13142      d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
13143      delta += sz;
13144      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13145         dres.whatNext   = Dis_ResteerU;
13146         dres.continueAt = (Addr64)(Addr32)d32;
13147      } else {
13148         jmp_lit(Ijk_Boring,d32);
13149         dres.whatNext = Dis_StopHere;
13150      }
13151      DIP("jmp 0x%x\n", d32);
13152      break;
13153
13154   case 0x70:
13155   case 0x71:
13156   case 0x72: /* JBb/JNAEb (jump below) */
13157   case 0x73: /* JNBb/JAEb (jump not below) */
13158   case 0x74: /* JZb/JEb (jump zero) */
13159   case 0x75: /* JNZb/JNEb (jump not zero) */
13160   case 0x76: /* JBEb/JNAb (jump below or equal) */
13161   case 0x77: /* JNBEb/JAb (jump not below or equal) */
13162   case 0x78: /* JSb (jump negative) */
13163   case 0x79: /* JSb (jump not negative) */
13164   case 0x7A: /* JP (jump parity even) */
13165   case 0x7B: /* JNP/JPO (jump parity odd) */
13166   case 0x7C: /* JLb/JNGEb (jump less) */
13167   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
13168   case 0x7E: /* JLEb/JNGb (jump less or equal) */
13169   case 0x7F: /* JGb/JNLEb (jump greater) */
13170    { Int    jmpDelta;
13171      HChar* comment  = "";
13172      jmpDelta = (Int)getSDisp8(delta);
13173      vassert(-128 <= jmpDelta && jmpDelta < 128);
13174      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
13175      delta++;
13176      if (resteerCisOk
13177          && vex_control.guest_chase_cond
13178          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13179          && jmpDelta < 0
13180          && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13181         /* Speculation: assume this backward branch is taken.  So we
13182            need to emit a side-exit to the insn following this one,
13183            on the negation of the condition, and continue at the
13184            branch target address (d32).  If we wind up back at the
13185            first instruction of the trace, just stop; it's better to
13186            let the IR loop unroller handle that case. */
13187         stmt( IRStmt_Exit(
13188                  mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
13189                  Ijk_Boring,
13190                  IRConst_U32(guest_EIP_bbstart+delta) ) );
13191         dres.whatNext   = Dis_ResteerC;
13192         dres.continueAt = (Addr64)(Addr32)d32;
13193         comment = "(assumed taken)";
13194      }
13195      else
13196      if (resteerCisOk
13197          && vex_control.guest_chase_cond
13198          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13199          && jmpDelta >= 0
13200          && resteerOkFn( callback_opaque,
13201                          (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
13202         /* Speculation: assume this forward branch is not taken.  So
13203            we need to emit a side-exit to d32 (the dest) and continue
13204            disassembling at the insn immediately following this
13205            one. */
13206         stmt( IRStmt_Exit(
13207                  mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
13208                  Ijk_Boring,
13209                  IRConst_U32(d32) ) );
13210         dres.whatNext   = Dis_ResteerC;
13211         dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
13212         comment = "(assumed not taken)";
13213      }
13214      else {
13215         /* Conservative default translation - end the block at this
13216            point. */
13217         jcc_01( (X86Condcode)(opc - 0x70),
13218                 (Addr32)(guest_EIP_bbstart+delta), d32);
13219         dres.whatNext = Dis_StopHere;
13220      }
13221      DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
13222      break;
13223    }
13224
13225   case 0xE3: /* JECXZ (for JCXZ see above) */
13226      if (sz != 4) goto decode_failure;
13227      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13228      delta ++;
13229      stmt( IRStmt_Exit(
13230               binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
13231            Ijk_Boring,
13232            IRConst_U32(d32)
13233          ));
13234      DIP("jecxz 0x%x\n", d32);
13235      break;
13236
13237   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
13238   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
13239   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
13240    { /* Again, the docs say this uses ECX/CX as a count depending on
13241         the address size override, not the operand one.  Since we
13242         don't handle address size overrides, I guess that means
13243         ECX. */
13244      IRExpr* zbit  = NULL;
13245      IRExpr* count = NULL;
13246      IRExpr* cond  = NULL;
13247      HChar*  xtra  = NULL;
13248
13249      if (sz != 4) goto decode_failure;
13250      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13251      delta++;
13252      putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
13253
13254      count = getIReg(4,R_ECX);
13255      cond = binop(Iop_CmpNE32, count, mkU32(0));
13256      switch (opc) {
13257         case 0xE2:
13258            xtra = "";
13259            break;
13260         case 0xE1:
13261            xtra = "e";
13262            zbit = mk_x86g_calculate_condition( X86CondZ );
13263	    cond = mkAnd1(cond, zbit);
13264            break;
13265         case 0xE0:
13266            xtra = "ne";
13267            zbit = mk_x86g_calculate_condition( X86CondNZ );
13268	    cond = mkAnd1(cond, zbit);
13269            break;
13270         default:
13271	    vassert(0);
13272      }
13273      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32)) );
13274
13275      DIP("loop%s 0x%x\n", xtra, d32);
13276      break;
13277    }
13278
13279   /* ------------------------ IMUL ----------------------- */
13280
13281   case 0x69: /* IMUL Iv, Ev, Gv */
13282      delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
13283      break;
13284   case 0x6B: /* IMUL Ib, Ev, Gv */
13285      delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
13286      break;
13287
13288   /* ------------------------ MOV ------------------------ */
13289
13290   case 0x88: /* MOV Gb,Eb */
13291      delta = dis_mov_G_E(sorb, 1, delta);
13292      break;
13293
13294   case 0x89: /* MOV Gv,Ev */
13295      delta = dis_mov_G_E(sorb, sz, delta);
13296      break;
13297
13298   case 0x8A: /* MOV Eb,Gb */
13299      delta = dis_mov_E_G(sorb, 1, delta);
13300      break;
13301
13302   case 0x8B: /* MOV Ev,Gv */
13303      delta = dis_mov_E_G(sorb, sz, delta);
13304      break;
13305
13306   case 0x8D: /* LEA M,Gv */
13307      if (sz != 4)
13308         goto decode_failure;
13309      modrm = getIByte(delta);
13310      if (epartIsReg(modrm))
13311         goto decode_failure;
13312      /* NOTE!  this is the one place where a segment override prefix
13313         has no effect on the address calculation.  Therefore we pass
13314         zero instead of sorb here. */
13315      addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
13316      delta += alen;
13317      putIReg(sz, gregOfRM(modrm), mkexpr(addr));
13318      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
13319                            nameIReg(sz,gregOfRM(modrm)));
13320      break;
13321
13322   case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
13323      delta = dis_mov_Sw_Ew(sorb, sz, delta);
13324      break;
13325
13326   case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
13327      delta = dis_mov_Ew_Sw(sorb, delta);
13328      break;
13329
13330   case 0xA0: /* MOV Ob,AL */
13331      sz = 1;
13332      /* Fall through ... */
13333   case 0xA1: /* MOV Ov,eAX */
13334      d32 = getUDisp32(delta); delta += 4;
13335      ty = szToITy(sz);
13336      addr = newTemp(Ity_I32);
13337      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13338      putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
13339      DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
13340                                d32, nameIReg(sz,R_EAX));
13341      break;
13342
13343   case 0xA2: /* MOV Ob,AL */
13344      sz = 1;
13345      /* Fall through ... */
13346   case 0xA3: /* MOV eAX,Ov */
13347      d32 = getUDisp32(delta); delta += 4;
13348      ty = szToITy(sz);
13349      addr = newTemp(Ity_I32);
13350      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13351      storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
13352      DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
13353                                sorbTxt(sorb), d32);
13354      break;
13355
13356   case 0xB0: /* MOV imm,AL */
13357   case 0xB1: /* MOV imm,CL */
13358   case 0xB2: /* MOV imm,DL */
13359   case 0xB3: /* MOV imm,BL */
13360   case 0xB4: /* MOV imm,AH */
13361   case 0xB5: /* MOV imm,CH */
13362   case 0xB6: /* MOV imm,DH */
13363   case 0xB7: /* MOV imm,BH */
13364      d32 = getIByte(delta); delta += 1;
13365      putIReg(1, opc-0xB0, mkU8(d32));
13366      DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
13367      break;
13368
13369   case 0xB8: /* MOV imm,eAX */
13370   case 0xB9: /* MOV imm,eCX */
13371   case 0xBA: /* MOV imm,eDX */
13372   case 0xBB: /* MOV imm,eBX */
13373   case 0xBC: /* MOV imm,eSP */
13374   case 0xBD: /* MOV imm,eBP */
13375   case 0xBE: /* MOV imm,eSI */
13376   case 0xBF: /* MOV imm,eDI */
13377      d32 = getUDisp(sz,delta); delta += sz;
13378      putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
13379      DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
13380      break;
13381
13382   case 0xC6: /* MOV Ib,Eb */
13383      sz = 1;
13384      goto do_Mov_I_E;
13385   case 0xC7: /* MOV Iv,Ev */
13386      goto do_Mov_I_E;
13387
13388   do_Mov_I_E:
13389      modrm = getIByte(delta);
13390      if (epartIsReg(modrm)) {
13391         delta++; /* mod/rm byte */
13392         d32 = getUDisp(sz,delta); delta += sz;
13393         putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
13394         DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
13395                                  nameIReg(sz,eregOfRM(modrm)));
13396      } else {
13397         addr = disAMode ( &alen, sorb, delta, dis_buf );
13398         delta += alen;
13399         d32 = getUDisp(sz,delta); delta += sz;
13400         storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
13401         DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
13402      }
13403      break;
13404
13405   /* ------------------------ opl imm, A ----------------- */
13406
13407   case 0x04: /* ADD Ib, AL */
13408      delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
13409      break;
13410   case 0x05: /* ADD Iv, eAX */
13411      delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
13412      break;
13413
13414   case 0x0C: /* OR Ib, AL */
13415      delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
13416      break;
13417   case 0x0D: /* OR Iv, eAX */
13418      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
13419      break;
13420
13421   case 0x14: /* ADC Ib, AL */
13422      delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
13423      break;
13424   case 0x15: /* ADC Iv, eAX */
13425      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
13426      break;
13427
13428   case 0x1C: /* SBB Ib, AL */
13429      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
13430      break;
13431   case 0x1D: /* SBB Iv, eAX */
13432      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
13433      break;
13434
13435   case 0x24: /* AND Ib, AL */
13436      delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
13437      break;
13438   case 0x25: /* AND Iv, eAX */
13439      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
13440      break;
13441
13442   case 0x2C: /* SUB Ib, AL */
13443      delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
13444      break;
13445   case 0x2D: /* SUB Iv, eAX */
13446      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
13447      break;
13448
13449   case 0x34: /* XOR Ib, AL */
13450      delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
13451      break;
13452   case 0x35: /* XOR Iv, eAX */
13453      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
13454      break;
13455
13456   case 0x3C: /* CMP Ib, AL */
13457      delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
13458      break;
13459   case 0x3D: /* CMP Iv, eAX */
13460      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
13461      break;
13462
13463   case 0xA8: /* TEST Ib, AL */
13464      delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
13465      break;
13466   case 0xA9: /* TEST Iv, eAX */
13467      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
13468      break;
13469
13470   /* ------------------------ opl Ev, Gv ----------------- */
13471
13472   case 0x02: /* ADD Eb,Gb */
13473      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
13474      break;
13475   case 0x03: /* ADD Ev,Gv */
13476      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
13477      break;
13478
13479   case 0x0A: /* OR Eb,Gb */
13480      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
13481      break;
13482   case 0x0B: /* OR Ev,Gv */
13483      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
13484      break;
13485
13486   case 0x12: /* ADC Eb,Gb */
13487      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
13488      break;
13489   case 0x13: /* ADC Ev,Gv */
13490      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
13491      break;
13492
13493   case 0x1A: /* SBB Eb,Gb */
13494      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
13495      break;
13496   case 0x1B: /* SBB Ev,Gv */
13497      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
13498      break;
13499
13500   case 0x22: /* AND Eb,Gb */
13501      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
13502      break;
13503   case 0x23: /* AND Ev,Gv */
13504      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
13505      break;
13506
13507   case 0x2A: /* SUB Eb,Gb */
13508      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
13509      break;
13510   case 0x2B: /* SUB Ev,Gv */
13511      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
13512      break;
13513
13514   case 0x32: /* XOR Eb,Gb */
13515      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
13516      break;
13517   case 0x33: /* XOR Ev,Gv */
13518      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
13519      break;
13520
13521   case 0x3A: /* CMP Eb,Gb */
13522      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
13523      break;
13524   case 0x3B: /* CMP Ev,Gv */
13525      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
13526      break;
13527
13528   case 0x84: /* TEST Eb,Gb */
13529      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
13530      break;
13531   case 0x85: /* TEST Ev,Gv */
13532      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
13533      break;
13534
13535   /* ------------------------ opl Gv, Ev ----------------- */
13536
13537   case 0x00: /* ADD Gb,Eb */
13538      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13539                            Iop_Add8, True, 1, delta, "add" );
13540      break;
13541   case 0x01: /* ADD Gv,Ev */
13542      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13543                            Iop_Add8, True, sz, delta, "add" );
13544      break;
13545
13546   case 0x08: /* OR Gb,Eb */
13547      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13548                            Iop_Or8, True, 1, delta, "or" );
13549      break;
13550   case 0x09: /* OR Gv,Ev */
13551      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13552                            Iop_Or8, True, sz, delta, "or" );
13553      break;
13554
13555   case 0x10: /* ADC Gb,Eb */
13556      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13557                            Iop_Add8, True, 1, delta, "adc" );
13558      break;
13559   case 0x11: /* ADC Gv,Ev */
13560      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13561                            Iop_Add8, True, sz, delta, "adc" );
13562      break;
13563
13564   case 0x18: /* SBB Gb,Eb */
13565      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13566                            Iop_Sub8, True, 1, delta, "sbb" );
13567      break;
13568   case 0x19: /* SBB Gv,Ev */
13569      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13570                            Iop_Sub8, True, sz, delta, "sbb" );
13571      break;
13572
13573   case 0x20: /* AND Gb,Eb */
13574      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13575                            Iop_And8, True, 1, delta, "and" );
13576      break;
13577   case 0x21: /* AND Gv,Ev */
13578      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13579                            Iop_And8, True, sz, delta, "and" );
13580      break;
13581
13582   case 0x28: /* SUB Gb,Eb */
13583      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13584                            Iop_Sub8, True, 1, delta, "sub" );
13585      break;
13586   case 0x29: /* SUB Gv,Ev */
13587      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13588                            Iop_Sub8, True, sz, delta, "sub" );
13589      break;
13590
13591   case 0x30: /* XOR Gb,Eb */
13592      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13593                            Iop_Xor8, True, 1, delta, "xor" );
13594      break;
13595   case 0x31: /* XOR Gv,Ev */
13596      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13597                            Iop_Xor8, True, sz, delta, "xor" );
13598      break;
13599
13600   case 0x38: /* CMP Gb,Eb */
13601      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13602                            Iop_Sub8, False, 1, delta, "cmp" );
13603      break;
13604   case 0x39: /* CMP Gv,Ev */
13605      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13606                            Iop_Sub8, False, sz, delta, "cmp" );
13607      break;
13608
13609   /* ------------------------ POP ------------------------ */
13610
13611   case 0x58: /* POP eAX */
13612   case 0x59: /* POP eCX */
13613   case 0x5A: /* POP eDX */
13614   case 0x5B: /* POP eBX */
13615   case 0x5D: /* POP eBP */
13616   case 0x5E: /* POP eSI */
13617   case 0x5F: /* POP eDI */
13618   case 0x5C: /* POP eSP */
13619      vassert(sz == 2 || sz == 4);
13620      t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
13621      assign(t2, getIReg(4, R_ESP));
13622      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
13623      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
13624      putIReg(sz, opc-0x58, mkexpr(t1));
13625      DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
13626      break;
13627
13628   case 0x9D: /* POPF */
13629      vassert(sz == 2 || sz == 4);
13630      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
13631      assign(t2, getIReg(4, R_ESP));
13632      assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
13633      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
13634
13635      /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
13636	 value in t1. */
13637      set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
13638                                 ((Addr32)guest_EIP_bbstart)+delta );
13639
13640      DIP("popf%c\n", nameISize(sz));
13641      break;
13642
13643   case 0x61: /* POPA */
13644      /* This is almost certainly wrong for sz==2.  So ... */
13645      if (sz != 4) goto decode_failure;
13646
13647      /* t5 is the old %ESP value. */
13648      t5 = newTemp(Ity_I32);
13649      assign( t5, getIReg(4, R_ESP) );
13650
13651      /* Reload all the registers, except %esp. */
13652      putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
13653      putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
13654      putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
13655      putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
13656      /* ignore saved %ESP */
13657      putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
13658      putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
13659      putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
13660
13661      /* and move %ESP back up */
13662      putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
13663
13664      DIP("popa%c\n", nameISize(sz));
13665      break;
13666
13667   case 0x8F: /* POPL/POPW m32 */
13668     { Int    len;
13669       UChar  rm = getIByte(delta);
13670
13671       /* make sure this instruction is correct POP */
13672       if (epartIsReg(rm) || gregOfRM(rm) != 0)
13673          goto decode_failure;
13674       /* and has correct size */
13675       if (sz != 4 && sz != 2)
13676          goto decode_failure;
13677       ty = szToITy(sz);
13678
13679       t1 = newTemp(Ity_I32); /* stack address */
13680       t3 = newTemp(ty); /* data */
13681       /* set t1 to ESP: t1 = ESP */
13682       assign( t1, getIReg(4, R_ESP) );
13683       /* load M[ESP] to virtual register t3: t3 = M[t1] */
13684       assign( t3, loadLE(ty, mkexpr(t1)) );
13685
13686       /* increase ESP; must be done before the STORE.  Intel manual says:
13687            If the ESP register is used as a base register for addressing
13688            a destination operand in memory, the POP instruction computes
13689            the effective address of the operand after it increments the
13690            ESP register.
13691       */
13692       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
13693
13694       /* resolve MODR/M */
13695       addr = disAMode ( &len, sorb, delta, dis_buf);
13696       storeLE( mkexpr(addr), mkexpr(t3) );
13697
13698       DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
13699
13700       delta += len;
13701       break;
13702     }
13703
13704   case 0x1F: /* POP %DS */
13705      dis_pop_segreg( R_DS, sz ); break;
13706   case 0x07: /* POP %ES */
13707      dis_pop_segreg( R_ES, sz ); break;
13708   case 0x17: /* POP %SS */
13709      dis_pop_segreg( R_SS, sz ); break;
13710
13711   /* ------------------------ PUSH ----------------------- */
13712
13713   case 0x50: /* PUSH eAX */
13714   case 0x51: /* PUSH eCX */
13715   case 0x52: /* PUSH eDX */
13716   case 0x53: /* PUSH eBX */
13717   case 0x55: /* PUSH eBP */
13718   case 0x56: /* PUSH eSI */
13719   case 0x57: /* PUSH eDI */
13720   case 0x54: /* PUSH eSP */
13721      /* This is the Right Way, in that the value to be pushed is
13722         established before %esp is changed, so that pushl %esp
13723         correctly pushes the old value. */
13724      vassert(sz == 2 || sz == 4);
13725      ty = sz==2 ? Ity_I16 : Ity_I32;
13726      t1 = newTemp(ty); t2 = newTemp(Ity_I32);
13727      assign(t1, getIReg(sz, opc-0x50));
13728      assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
13729      putIReg(4, R_ESP, mkexpr(t2) );
13730      storeLE(mkexpr(t2),mkexpr(t1));
13731      DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
13732      break;
13733
13734
13735   case 0x68: /* PUSH Iv */
13736      d32 = getUDisp(sz,delta); delta += sz;
13737      goto do_push_I;
13738   case 0x6A: /* PUSH Ib, sign-extended to sz */
13739      d32 = getSDisp8(delta); delta += 1;
13740      goto do_push_I;
13741   do_push_I:
13742      ty = szToITy(sz);
13743      t1 = newTemp(Ity_I32); t2 = newTemp(ty);
13744      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
13745      putIReg(4, R_ESP, mkexpr(t1) );
13746      /* stop mkU16 asserting if d32 is a negative 16-bit number
13747         (bug #132813) */
13748      if (ty == Ity_I16)
13749         d32 &= 0xFFFF;
13750      storeLE( mkexpr(t1), mkU(ty,d32) );
13751      DIP("push%c $0x%x\n", nameISize(sz), d32);
13752      break;
13753
13754   case 0x9C: /* PUSHF */ {
13755      vassert(sz == 2 || sz == 4);
13756
13757      t1 = newTemp(Ity_I32);
13758      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
13759      putIReg(4, R_ESP, mkexpr(t1) );
13760
13761      /* Calculate OSZACP, and patch in fixed fields as per
13762         Intel docs.
13763         - bit 1 is always 1
13764         - bit 9 is Interrupt Enable (should always be 1 in user mode?)
13765      */
13766      t2 = newTemp(Ity_I32);
13767      assign( t2, binop(Iop_Or32,
13768                        mk_x86g_calculate_eflags_all(),
13769                        mkU32( (1<<1)|(1<<9) ) ));
13770
13771      /* Patch in the D flag.  This can simply be a copy of bit 10 of
13772         baseBlock[OFFB_DFLAG]. */
13773      t3 = newTemp(Ity_I32);
13774      assign( t3, binop(Iop_Or32,
13775                        mkexpr(t2),
13776                        binop(Iop_And32,
13777                              IRExpr_Get(OFFB_DFLAG,Ity_I32),
13778                              mkU32(1<<10)))
13779            );
13780
13781      /* And patch in the ID flag. */
13782      t4 = newTemp(Ity_I32);
13783      assign( t4, binop(Iop_Or32,
13784                        mkexpr(t3),
13785                        binop(Iop_And32,
13786                              binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
13787                                               mkU8(21)),
13788                              mkU32(1<<21)))
13789            );
13790
13791      /* And patch in the AC flag. */
13792      t5 = newTemp(Ity_I32);
13793      assign( t5, binop(Iop_Or32,
13794                        mkexpr(t4),
13795                        binop(Iop_And32,
13796                              binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
13797                                               mkU8(18)),
13798                              mkU32(1<<18)))
13799            );
13800
13801      /* if sz==2, the stored value needs to be narrowed. */
13802      if (sz == 2)
13803        storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
13804      else
13805        storeLE( mkexpr(t1), mkexpr(t5) );
13806
13807      DIP("pushf%c\n", nameISize(sz));
13808      break;
13809   }
13810
13811   case 0x60: /* PUSHA */
13812      /* This is almost certainly wrong for sz==2.  So ... */
13813      if (sz != 4) goto decode_failure;
13814
13815      /* This is the Right Way, in that the value to be pushed is
13816         established before %esp is changed, so that pusha
13817         correctly pushes the old %esp value.  New value of %esp is
13818         pushed at start. */
13819      /* t0 is the %ESP value we're going to push. */
13820      t0 = newTemp(Ity_I32);
13821      assign( t0, getIReg(4, R_ESP) );
13822
13823      /* t5 will be the new %ESP value. */
13824      t5 = newTemp(Ity_I32);
13825      assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
13826
13827      /* Update guest state before prodding memory. */
13828      putIReg(4, R_ESP, mkexpr(t5));
13829
13830      /* Dump all the registers. */
13831      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
13832      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
13833      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
13834      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
13835      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
13836      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
13837      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
13838      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
13839
13840      DIP("pusha%c\n", nameISize(sz));
13841      break;
13842
13843   case 0x0E: /* PUSH %CS */
13844      dis_push_segreg( R_CS, sz ); break;
13845   case 0x1E: /* PUSH %DS */
13846      dis_push_segreg( R_DS, sz ); break;
13847   case 0x06: /* PUSH %ES */
13848      dis_push_segreg( R_ES, sz ); break;
13849   case 0x16: /* PUSH %SS */
13850      dis_push_segreg( R_SS, sz ); break;
13851
13852   /* ------------------------ SCAS et al ----------------- */
13853
13854   case 0xA4: /* MOVS, no REP prefix */
13855   case 0xA5:
13856      if (sorb != 0)
13857         goto decode_failure; /* else dis_string_op asserts */
13858      dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
13859      break;
13860
13861  case 0xA6: /* CMPSb, no REP prefix */
13862  case 0xA7:
13863      if (sorb != 0)
13864         goto decode_failure; /* else dis_string_op asserts */
13865      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
13866      break;
13867
13868   case 0xAA: /* STOS, no REP prefix */
13869   case 0xAB:
13870      if (sorb != 0)
13871         goto decode_failure; /* else dis_string_op asserts */
13872      dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
13873      break;
13874
13875   case 0xAC: /* LODS, no REP prefix */
13876   case 0xAD:
13877      if (sorb != 0)
13878         goto decode_failure; /* else dis_string_op asserts */
13879      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
13880      break;
13881
13882   case 0xAE: /* SCAS, no REP prefix */
13883   case 0xAF:
13884      if (sorb != 0)
13885         goto decode_failure; /* else dis_string_op asserts */
13886      dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
13887      break;
13888
13889
13890   case 0xFC: /* CLD */
13891      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
13892      DIP("cld\n");
13893      break;
13894
13895   case 0xFD: /* STD */
13896      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
13897      DIP("std\n");
13898      break;
13899
13900   case 0xF8: /* CLC */
13901   case 0xF9: /* STC */
13902   case 0xF5: /* CMC */
13903      t0 = newTemp(Ity_I32);
13904      t1 = newTemp(Ity_I32);
13905      assign( t0, mk_x86g_calculate_eflags_all() );
13906      switch (opc) {
13907         case 0xF8:
13908            assign( t1, binop(Iop_And32, mkexpr(t0),
13909                                         mkU32(~X86G_CC_MASK_C)));
13910            DIP("clc\n");
13911            break;
13912         case 0xF9:
13913            assign( t1, binop(Iop_Or32, mkexpr(t0),
13914                                        mkU32(X86G_CC_MASK_C)));
13915            DIP("stc\n");
13916            break;
13917         case 0xF5:
13918            assign( t1, binop(Iop_Xor32, mkexpr(t0),
13919                                         mkU32(X86G_CC_MASK_C)));
13920            DIP("cmc\n");
13921            break;
13922         default:
13923            vpanic("disInstr(x86)(clc/stc/cmc)");
13924      }
13925      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13926      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13927      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
13928      /* Set NDEP even though it isn't used.  This makes redundant-PUT
13929         elimination of previous stores to this field work better. */
13930      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13931      break;
13932
13933   case 0xD6: /* SALC */
13934      t0 = newTemp(Ity_I32);
13935      t1 = newTemp(Ity_I32);
13936      assign( t0,  binop(Iop_And32,
13937                         mk_x86g_calculate_eflags_c(),
13938                         mkU32(1)) );
13939      assign( t1, binop(Iop_Sar32,
13940                        binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
13941                        mkU8(31)) );
13942      putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
13943      DIP("salc\n");
13944      break;
13945
13946   /* REPNE prefix insn */
13947   case 0xF2: {
13948      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
13949      if (sorb != 0) goto decode_failure;
13950      abyte = getIByte(delta); delta++;
13951
13952      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
13953      dres.whatNext = Dis_StopHere;
13954
13955      switch (abyte) {
13956      /* According to the Intel manual, "repne movs" should never occur, but
13957       * in practice it has happened, so allow for it here... */
13958      case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
13959      case 0xA5:
13960         dis_REP_op ( X86CondNZ, dis_MOVS, sz, eip_orig,
13961                                 guest_EIP_bbstart+delta, "repne movs" );
13962         break;
13963
13964      case 0xA6: sz = 1;   /* REPNE CMP<sz> */
13965      case 0xA7:
13966         dis_REP_op ( X86CondNZ, dis_CMPS, sz, eip_orig,
13967                                 guest_EIP_bbstart+delta, "repne cmps" );
13968         break;
13969
13970      case 0xAA: sz = 1;   /* REPNE STOS<sz> */
13971      case 0xAB:
13972         dis_REP_op ( X86CondNZ, dis_STOS, sz, eip_orig,
13973                                 guest_EIP_bbstart+delta, "repne stos" );
13974         break;
13975
13976      case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
13977      case 0xAF:
13978         dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
13979                                 guest_EIP_bbstart+delta, "repne scas" );
13980         break;
13981
13982      default:
13983         goto decode_failure;
13984      }
13985      break;
13986   }
13987
13988   /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
13989      for the rest, it means REP) */
13990   case 0xF3: {
13991      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
13992      if (sorb != 0) goto decode_failure;
13993      abyte = getIByte(delta); delta++;
13994
13995      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
13996      dres.whatNext = Dis_StopHere;
13997
13998      switch (abyte) {
13999      case 0xA4: sz = 1;   /* REP MOVS<sz> */
14000      case 0xA5:
14001         dis_REP_op ( X86CondAlways, dis_MOVS, sz, eip_orig,
14002                                     guest_EIP_bbstart+delta, "rep movs" );
14003         break;
14004
14005      case 0xA6: sz = 1;   /* REPE CMP<sz> */
14006      case 0xA7:
14007         dis_REP_op ( X86CondZ, dis_CMPS, sz, eip_orig,
14008                                guest_EIP_bbstart+delta, "repe cmps" );
14009         break;
14010
14011      case 0xAA: sz = 1;   /* REP STOS<sz> */
14012      case 0xAB:
14013         dis_REP_op ( X86CondAlways, dis_STOS, sz, eip_orig,
14014                                     guest_EIP_bbstart+delta, "rep stos" );
14015         break;
14016
14017      case 0xAC: sz = 1;   /* REP LODS<sz> */
14018      case 0xAD:
14019         dis_REP_op ( X86CondAlways, dis_LODS, sz, eip_orig,
14020                                     guest_EIP_bbstart+delta, "rep lods" );
14021         break;
14022
14023      case 0xAE: sz = 1;   /* REPE SCAS<sz> */
14024      case 0xAF:
14025         dis_REP_op ( X86CondZ, dis_SCAS, sz, eip_orig,
14026                                guest_EIP_bbstart+delta, "repe scas" );
14027         break;
14028
14029      case 0x90:           /* REP NOP (PAUSE) */
14030         /* a hint to the P4 re spin-wait loop */
14031         DIP("rep nop (P4 pause)\n");
14032         /* "observe" the hint.  The Vex client needs to be careful not
14033            to cause very long delays as a result, though. */
14034         jmp_lit(Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
14035         dres.whatNext = Dis_StopHere;
14036         break;
14037
14038      case 0xC3:           /* REP RET -- same as normal ret? */
14039         dis_ret(0);
14040         dres.whatNext = Dis_StopHere;
14041         DIP("rep ret\n");
14042         break;
14043
14044      default:
14045         goto decode_failure;
14046      }
14047      break;
14048   }
14049
14050   /* ------------------------ XCHG ----------------------- */
14051
14052   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
14053      prefix; hence it must be translated with an IRCAS (at least, the
14054      memory variant). */
14055   case 0x86: /* XCHG Gb,Eb */
14056      sz = 1;
14057      /* Fall through ... */
14058   case 0x87: /* XCHG Gv,Ev */
14059      modrm = getIByte(delta);
14060      ty = szToITy(sz);
14061      t1 = newTemp(ty); t2 = newTemp(ty);
14062      if (epartIsReg(modrm)) {
14063         assign(t1, getIReg(sz, eregOfRM(modrm)));
14064         assign(t2, getIReg(sz, gregOfRM(modrm)));
14065         putIReg(sz, gregOfRM(modrm), mkexpr(t1));
14066         putIReg(sz, eregOfRM(modrm), mkexpr(t2));
14067         delta++;
14068         DIP("xchg%c %s, %s\n",
14069             nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
14070                            nameIReg(sz,eregOfRM(modrm)));
14071      } else {
14072         *expect_CAS = True;
14073         addr = disAMode ( &alen, sorb, delta, dis_buf );
14074         assign( t1, loadLE(ty,mkexpr(addr)) );
14075         assign( t2, getIReg(sz,gregOfRM(modrm)) );
14076         casLE( mkexpr(addr),
14077                mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
14078         putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
14079         delta += alen;
14080         DIP("xchg%c %s, %s\n", nameISize(sz),
14081                                nameIReg(sz,gregOfRM(modrm)), dis_buf);
14082      }
14083      break;
14084
14085   case 0x90: /* XCHG eAX,eAX */
14086      DIP("nop\n");
14087      break;
14088   case 0x91: /* XCHG eAX,eCX */
14089   case 0x92: /* XCHG eAX,eDX */
14090   case 0x93: /* XCHG eAX,eBX */
14091   case 0x94: /* XCHG eAX,eSP */
14092   case 0x95: /* XCHG eAX,eBP */
14093   case 0x96: /* XCHG eAX,eSI */
14094   case 0x97: /* XCHG eAX,eDI */
14095      codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
14096      break;
14097
14098   /* ------------------------ XLAT ----------------------- */
14099
14100   case 0xD7: /* XLAT */
14101      if (sz != 4) goto decode_failure; /* sz == 2 is also allowed (0x66) */
14102      putIReg(
14103         1,
14104         R_EAX/*AL*/,
14105         loadLE(Ity_I8,
14106                handleSegOverride(
14107                   sorb,
14108                   binop(Iop_Add32,
14109                         getIReg(4, R_EBX),
14110                         unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
14111
14112      DIP("xlat%c [ebx]\n", nameISize(sz));
14113      break;
14114
14115   /* ------------------------ IN / OUT ----------------------- */
14116
14117   case 0xE4: /* IN imm8, AL */
14118      sz = 1;
14119      t1 = newTemp(Ity_I32);
14120      abyte = getIByte(delta); delta++;
14121      assign(t1, mkU32( abyte & 0xFF ));
14122      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
14123      goto do_IN;
14124   case 0xE5: /* IN imm8, eAX */
14125      vassert(sz == 2 || sz == 4);
14126      t1 = newTemp(Ity_I32);
14127      abyte = getIByte(delta); delta++;
14128      assign(t1, mkU32( abyte & 0xFF ));
14129      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
14130      goto do_IN;
14131   case 0xEC: /* IN %DX, AL */
14132      sz = 1;
14133      t1 = newTemp(Ity_I32);
14134      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14135      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14136                                         nameIReg(sz,R_EAX));
14137      goto do_IN;
14138   case 0xED: /* IN %DX, eAX */
14139      vassert(sz == 2 || sz == 4);
14140      t1 = newTemp(Ity_I32);
14141      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14142      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14143                                         nameIReg(sz,R_EAX));
14144      goto do_IN;
14145   do_IN: {
14146      /* At this point, sz indicates the width, and t1 is a 32-bit
14147         value giving port number. */
14148      IRDirty* d;
14149      vassert(sz == 1 || sz == 2 || sz == 4);
14150      ty = szToITy(sz);
14151      t2 = newTemp(Ity_I32);
14152      d = unsafeIRDirty_1_N(
14153             t2,
14154             0/*regparms*/,
14155             "x86g_dirtyhelper_IN",
14156             &x86g_dirtyhelper_IN,
14157             mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
14158          );
14159      /* do the call, dumping the result in t2. */
14160      stmt( IRStmt_Dirty(d) );
14161      putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
14162      break;
14163   }
14164
14165   case 0xE6: /* OUT AL, imm8 */
14166      sz = 1;
14167      t1 = newTemp(Ity_I32);
14168      abyte = getIByte(delta); delta++;
14169      assign( t1, mkU32( abyte & 0xFF ) );
14170      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
14171      goto do_OUT;
14172   case 0xE7: /* OUT eAX, imm8 */
14173      vassert(sz == 2 || sz == 4);
14174      t1 = newTemp(Ity_I32);
14175      abyte = getIByte(delta); delta++;
14176      assign( t1, mkU32( abyte & 0xFF ) );
14177      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
14178      goto do_OUT;
14179   case 0xEE: /* OUT AL, %DX */
14180      sz = 1;
14181      t1 = newTemp(Ity_I32);
14182      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14183      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14184                                          nameIReg(2,R_EDX));
14185      goto do_OUT;
14186   case 0xEF: /* OUT eAX, %DX */
14187      vassert(sz == 2 || sz == 4);
14188      t1 = newTemp(Ity_I32);
14189      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14190      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14191                                          nameIReg(2,R_EDX));
14192      goto do_OUT;
14193   do_OUT: {
14194      /* At this point, sz indicates the width, and t1 is a 32-bit
14195         value giving port number. */
14196      IRDirty* d;
14197      vassert(sz == 1 || sz == 2 || sz == 4);
14198      ty = szToITy(sz);
14199      d = unsafeIRDirty_0_N(
14200             0/*regparms*/,
14201             "x86g_dirtyhelper_OUT",
14202             &x86g_dirtyhelper_OUT,
14203             mkIRExprVec_3( mkexpr(t1),
14204                            widenUto32( getIReg(sz, R_EAX) ),
14205                            mkU32(sz) )
14206          );
14207      stmt( IRStmt_Dirty(d) );
14208      break;
14209   }
14210
14211   /* ------------------------ (Grp1 extensions) ---------- */
14212
14213   case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
14214                 case 0x80, but only in 32-bit mode. */
14215      /* fallthru */
14216   case 0x80: /* Grp1 Ib,Eb */
14217      modrm = getIByte(delta);
14218      am_sz = lengthAMode(delta);
14219      sz    = 1;
14220      d_sz  = 1;
14221      d32   = getUChar(delta + am_sz);
14222      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14223      break;
14224
14225   case 0x81: /* Grp1 Iv,Ev */
14226      modrm = getIByte(delta);
14227      am_sz = lengthAMode(delta);
14228      d_sz  = sz;
14229      d32   = getUDisp(d_sz, delta + am_sz);
14230      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14231      break;
14232
14233   case 0x83: /* Grp1 Ib,Ev */
14234      modrm = getIByte(delta);
14235      am_sz = lengthAMode(delta);
14236      d_sz  = 1;
14237      d32   = getSDisp8(delta + am_sz);
14238      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14239      break;
14240
14241   /* ------------------------ (Grp2 extensions) ---------- */
14242
14243   case 0xC0: { /* Grp2 Ib,Eb */
14244      Bool decode_OK = True;
14245      modrm = getIByte(delta);
14246      am_sz = lengthAMode(delta);
14247      d_sz  = 1;
14248      d32   = getUChar(delta + am_sz);
14249      sz    = 1;
14250      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14251                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14252      if (!decode_OK)
14253         goto decode_failure;
14254      break;
14255   }
14256   case 0xC1: { /* Grp2 Ib,Ev */
14257      Bool decode_OK = True;
14258      modrm = getIByte(delta);
14259      am_sz = lengthAMode(delta);
14260      d_sz  = 1;
14261      d32   = getUChar(delta + am_sz);
14262      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14263                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14264      if (!decode_OK)
14265         goto decode_failure;
14266      break;
14267   }
14268   case 0xD0: { /* Grp2 1,Eb */
14269      Bool decode_OK = True;
14270      modrm = getIByte(delta);
14271      am_sz = lengthAMode(delta);
14272      d_sz  = 0;
14273      d32   = 1;
14274      sz    = 1;
14275      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14276                         mkU8(d32), NULL, &decode_OK );
14277      if (!decode_OK)
14278         goto decode_failure;
14279      break;
14280   }
14281   case 0xD1: { /* Grp2 1,Ev */
14282      Bool decode_OK = True;
14283      modrm = getUChar(delta);
14284      am_sz = lengthAMode(delta);
14285      d_sz  = 0;
14286      d32   = 1;
14287      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14288                         mkU8(d32), NULL, &decode_OK );
14289      if (!decode_OK)
14290         goto decode_failure;
14291      break;
14292   }
14293   case 0xD2: { /* Grp2 CL,Eb */
14294      Bool decode_OK = True;
14295      modrm = getUChar(delta);
14296      am_sz = lengthAMode(delta);
14297      d_sz  = 0;
14298      sz    = 1;
14299      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14300                         getIReg(1,R_ECX), "%cl", &decode_OK );
14301      if (!decode_OK)
14302         goto decode_failure;
14303      break;
14304   }
14305   case 0xD3: { /* Grp2 CL,Ev */
14306      Bool decode_OK = True;
14307      modrm = getIByte(delta);
14308      am_sz = lengthAMode(delta);
14309      d_sz  = 0;
14310      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14311                         getIReg(1,R_ECX), "%cl", &decode_OK );
14312      if (!decode_OK)
14313         goto decode_failure;
14314      break;
14315   }
14316
14317   /* ------------------------ (Grp3 extensions) ---------- */
14318
14319   case 0xF6: { /* Grp3 Eb */
14320      Bool decode_OK = True;
14321      delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
14322      if (!decode_OK)
14323         goto decode_failure;
14324      break;
14325   }
14326   case 0xF7: { /* Grp3 Ev */
14327      Bool decode_OK = True;
14328      delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
14329      if (!decode_OK)
14330         goto decode_failure;
14331      break;
14332   }
14333
14334   /* ------------------------ (Grp4 extensions) ---------- */
14335
14336   case 0xFE: { /* Grp4 Eb */
14337      Bool decode_OK = True;
14338      delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
14339      if (!decode_OK)
14340         goto decode_failure;
14341      break;
14342   }
14343
14344   /* ------------------------ (Grp5 extensions) ---------- */
14345
14346   case 0xFF: { /* Grp5 Ev */
14347      Bool decode_OK = True;
14348      delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
14349      if (!decode_OK)
14350         goto decode_failure;
14351      break;
14352   }
14353
14354   /* ------------------------ Escapes to 2-byte opcodes -- */
14355
14356   case 0x0F: {
14357      opc = getIByte(delta); delta++;
14358      switch (opc) {
14359
14360      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
14361
14362      case 0xBA: { /* Grp8 Ib,Ev */
14363         Bool decode_OK = False;
14364         modrm = getUChar(delta);
14365         am_sz = lengthAMode(delta);
14366         d32   = getSDisp8(delta + am_sz);
14367         delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
14368                                am_sz, sz, d32, &decode_OK );
14369         if (!decode_OK)
14370            goto decode_failure;
14371         break;
14372      }
14373
14374      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
14375
14376      case 0xBC: /* BSF Gv,Ev */
14377         delta = dis_bs_E_G ( sorb, sz, delta, True );
14378         break;
14379      case 0xBD: /* BSR Gv,Ev */
14380         delta = dis_bs_E_G ( sorb, sz, delta, False );
14381         break;
14382
14383      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
14384
14385      case 0xC8: /* BSWAP %eax */
14386      case 0xC9:
14387      case 0xCA:
14388      case 0xCB:
14389      case 0xCC:
14390      case 0xCD:
14391      case 0xCE:
14392      case 0xCF: /* BSWAP %edi */
14393         /* AFAICS from the Intel docs, this only exists at size 4. */
14394         vassert(sz == 4);
14395         t1 = newTemp(Ity_I32);
14396         t2 = newTemp(Ity_I32);
14397         assign( t1, getIReg(4, opc-0xC8) );
14398
14399         assign( t2,
14400            binop(Iop_Or32,
14401               binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
14402            binop(Iop_Or32,
14403               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
14404                                mkU32(0x00FF0000)),
14405            binop(Iop_Or32,
14406               binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
14407                                mkU32(0x0000FF00)),
14408               binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
14409                                mkU32(0x000000FF) )
14410            )))
14411         );
14412
14413         putIReg(4, opc-0xC8, mkexpr(t2));
14414         DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
14415         break;
14416
14417      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
14418
14419      case 0xA3: /* BT Gv,Ev */
14420         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
14421         break;
14422      case 0xB3: /* BTR Gv,Ev */
14423         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
14424         break;
14425      case 0xAB: /* BTS Gv,Ev */
14426         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
14427         break;
14428      case 0xBB: /* BTC Gv,Ev */
14429         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
14430         break;
14431
14432      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
14433
14434      case 0x40:
14435      case 0x41:
14436      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
14437      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
14438      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
14439      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
14440      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
14441      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
14442      case 0x48: /* CMOVSb (cmov negative) */
14443      case 0x49: /* CMOVSb (cmov not negative) */
14444      case 0x4A: /* CMOVP (cmov parity even) */
14445      case 0x4B: /* CMOVNP (cmov parity odd) */
14446      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
14447      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
14448      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
14449      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
14450         delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
14451         break;
14452
14453      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
14454
14455      case 0xB0: /* CMPXCHG Gb,Eb */
14456         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
14457         break;
14458      case 0xB1: /* CMPXCHG Gv,Ev */
14459         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
14460         break;
14461
14462      case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
14463         IRTemp expdHi    = newTemp(Ity_I32);
14464         IRTemp expdLo    = newTemp(Ity_I32);
14465         IRTemp dataHi    = newTemp(Ity_I32);
14466         IRTemp dataLo    = newTemp(Ity_I32);
14467         IRTemp oldHi     = newTemp(Ity_I32);
14468         IRTemp oldLo     = newTemp(Ity_I32);
14469         IRTemp flags_old = newTemp(Ity_I32);
14470         IRTemp flags_new = newTemp(Ity_I32);
14471         IRTemp success   = newTemp(Ity_I1);
14472
14473         /* Translate this using a DCAS, even if there is no LOCK
14474            prefix.  Life is too short to bother with generating two
14475            different translations for the with/without-LOCK-prefix
14476            cases. */
14477         *expect_CAS = True;
14478
14479	 /* Decode, and generate address. */
14480         if (sz != 4) goto decode_failure;
14481         modrm = getIByte(delta);
14482         if (epartIsReg(modrm)) goto decode_failure;
14483         if (gregOfRM(modrm) != 1) goto decode_failure;
14484         addr = disAMode ( &alen, sorb, delta, dis_buf );
14485         delta += alen;
14486
14487         /* Get the expected and new values. */
14488         assign( expdHi, getIReg(4,R_EDX) );
14489         assign( expdLo, getIReg(4,R_EAX) );
14490         assign( dataHi, getIReg(4,R_ECX) );
14491         assign( dataLo, getIReg(4,R_EBX) );
14492
14493         /* Do the DCAS */
14494         stmt( IRStmt_CAS(
14495                  mkIRCAS( oldHi, oldLo,
14496                           Iend_LE, mkexpr(addr),
14497                           mkexpr(expdHi), mkexpr(expdLo),
14498                           mkexpr(dataHi), mkexpr(dataLo)
14499               )));
14500
14501         /* success when oldHi:oldLo == expdHi:expdLo */
14502         assign( success,
14503                 binop(Iop_CasCmpEQ32,
14504                       binop(Iop_Or32,
14505                             binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
14506                             binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
14507                       ),
14508                       mkU32(0)
14509                 ));
14510
14511         /* If the DCAS is successful, that is to say oldHi:oldLo ==
14512            expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
14513            which is where they came from originally.  Both the actual
14514            contents of these two regs, and any shadow values, are
14515            unchanged.  If the DCAS fails then we're putting into
14516            EDX:EAX the value seen in memory. */
14517         putIReg(4, R_EDX,
14518                    IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
14519                                  mkexpr(oldHi),
14520                                  mkexpr(expdHi)
14521                ));
14522         putIReg(4, R_EAX,
14523                    IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
14524                                  mkexpr(oldLo),
14525                                  mkexpr(expdLo)
14526                ));
14527
14528         /* Copy the success bit into the Z flag and leave the others
14529            unchanged */
14530         assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
14531         assign(
14532            flags_new,
14533            binop(Iop_Or32,
14534                  binop(Iop_And32, mkexpr(flags_old),
14535                                   mkU32(~X86G_CC_MASK_Z)),
14536                  binop(Iop_Shl32,
14537                        binop(Iop_And32,
14538                              unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
14539                        mkU8(X86G_CC_SHIFT_Z)) ));
14540
14541         stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
14542         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
14543         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
14544         /* Set NDEP even though it isn't used.  This makes
14545            redundant-PUT elimination of previous stores to this field
14546            work better. */
14547         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
14548
14549         /* Sheesh.  Aren't you glad it was me and not you that had to
14550	    write and validate all this grunge? */
14551
14552	 DIP("cmpxchg8b %s\n", dis_buf);
14553	 break;
14554      }
14555
14556      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
14557
14558      case 0xA2: { /* CPUID */
14559         /* Uses dirty helper:
14560               void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
14561            declared to mod eax, wr ebx, ecx, edx
14562         */
14563         IRDirty* d     = NULL;
14564         HChar*   fName = NULL;
14565         void*    fAddr = NULL;
14566         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
14567            fName = "x86g_dirtyhelper_CPUID_sse2";
14568            fAddr = &x86g_dirtyhelper_CPUID_sse2;
14569         }
14570         else
14571         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
14572            fName = "x86g_dirtyhelper_CPUID_sse1";
14573            fAddr = &x86g_dirtyhelper_CPUID_sse1;
14574         }
14575         else
14576         if (archinfo->hwcaps == 0/*no SSE*/) {
14577            fName = "x86g_dirtyhelper_CPUID_sse0";
14578            fAddr = &x86g_dirtyhelper_CPUID_sse0;
14579         } else
14580            vpanic("disInstr(x86)(cpuid)");
14581
14582         vassert(fName); vassert(fAddr);
14583         d = unsafeIRDirty_0_N ( 0/*regparms*/,
14584                                 fName, fAddr, mkIRExprVec_0() );
14585         /* declare guest state effects */
14586         d->needsBBP = True;
14587         d->nFxState = 4;
14588         d->fxState[0].fx     = Ifx_Modify;
14589         d->fxState[0].offset = OFFB_EAX;
14590         d->fxState[0].size   = 4;
14591         d->fxState[1].fx     = Ifx_Write;
14592         d->fxState[1].offset = OFFB_EBX;
14593         d->fxState[1].size   = 4;
14594         d->fxState[2].fx     = Ifx_Modify;
14595         d->fxState[2].offset = OFFB_ECX;
14596         d->fxState[2].size   = 4;
14597         d->fxState[3].fx     = Ifx_Write;
14598         d->fxState[3].offset = OFFB_EDX;
14599         d->fxState[3].size   = 4;
14600         /* execute the dirty call, side-effecting guest state */
14601         stmt( IRStmt_Dirty(d) );
14602         /* CPUID is a serialising insn.  So, just in case someone is
14603            using it as a memory fence ... */
14604         stmt( IRStmt_MBE(Imbe_Fence) );
14605         DIP("cpuid\n");
14606         break;
14607      }
14608
14609//--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
14610//--             goto decode_failure;
14611//--
14612//--          t1 = newTemp(cb);
14613//--          t2 = newTemp(cb);
14614//--          t3 = newTemp(cb);
14615//--          t4 = newTemp(cb);
14616//--          uInstr0(cb, CALLM_S, 0);
14617//--
14618//--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
14619//--          uInstr1(cb, PUSH,  4, TempReg, t1);
14620//--
14621//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
14622//--          uLiteral(cb, 0);
14623//--          uInstr1(cb, PUSH,  4, TempReg, t2);
14624//--
14625//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
14626//--          uLiteral(cb, 0);
14627//--          uInstr1(cb, PUSH,  4, TempReg, t3);
14628//--
14629//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
14630//--          uLiteral(cb, 0);
14631//--          uInstr1(cb, PUSH,  4, TempReg, t4);
14632//--
14633//--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
14634//--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
14635//--
14636//--          uInstr1(cb, POP,   4, TempReg, t4);
14637//--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
14638//--
14639//--          uInstr1(cb, POP,   4, TempReg, t3);
14640//--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
14641//--
14642//--          uInstr1(cb, POP,   4, TempReg, t2);
14643//--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
14644//--
14645//--          uInstr1(cb, POP,   4, TempReg, t1);
14646//--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
14647//--
14648//--          uInstr0(cb, CALLM_E, 0);
14649//--          DIP("cpuid\n");
14650//--          break;
14651//--
14652      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
14653
14654      case 0xB6: /* MOVZXb Eb,Gv */
14655         if (sz != 2 && sz != 4)
14656            goto decode_failure;
14657         delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
14658         break;
14659
14660      case 0xB7: /* MOVZXw Ew,Gv */
14661         if (sz != 4)
14662            goto decode_failure;
14663         delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
14664         break;
14665
14666      case 0xBE: /* MOVSXb Eb,Gv */
14667         if (sz != 2 && sz != 4)
14668            goto decode_failure;
14669         delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
14670         break;
14671
14672      case 0xBF: /* MOVSXw Ew,Gv */
14673         if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
14674            goto decode_failure;
14675         delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
14676         break;
14677
14678//--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
14679//--
14680//--       case 0xC3: /* MOVNTI Gv,Ev */
14681//--          vg_assert(sz == 4);
14682//--          modrm = getUChar(eip);
14683//--          vg_assert(!epartIsReg(modrm));
14684//--          t1 = newTemp(cb);
14685//--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
14686//--          pair = disAMode ( cb, sorb, eip, dis_buf );
14687//--          t2 = LOW24(pair);
14688//--          eip += HI8(pair);
14689//--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
14690//--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
14691//--          break;
14692
14693      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
14694
14695      case 0xAF: /* IMUL Ev, Gv */
14696         delta = dis_mul_E_G ( sorb, sz, delta );
14697         break;
14698
14699      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
14700
14701      case 0x1F:
14702         modrm = getUChar(delta);
14703         if (epartIsReg(modrm)) goto decode_failure;
14704         addr = disAMode ( &alen, sorb, delta, dis_buf );
14705         delta += alen;
14706         DIP("nop%c %s\n", nameISize(sz), dis_buf);
14707         break;
14708
14709      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
14710      case 0x80:
14711      case 0x81:
14712      case 0x82: /* JBb/JNAEb (jump below) */
14713      case 0x83: /* JNBb/JAEb (jump not below) */
14714      case 0x84: /* JZb/JEb (jump zero) */
14715      case 0x85: /* JNZb/JNEb (jump not zero) */
14716      case 0x86: /* JBEb/JNAb (jump below or equal) */
14717      case 0x87: /* JNBEb/JAb (jump not below or equal) */
14718      case 0x88: /* JSb (jump negative) */
14719      case 0x89: /* JSb (jump not negative) */
14720      case 0x8A: /* JP (jump parity even) */
14721      case 0x8B: /* JNP/JPO (jump parity odd) */
14722      case 0x8C: /* JLb/JNGEb (jump less) */
14723      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
14724      case 0x8E: /* JLEb/JNGb (jump less or equal) */
14725      case 0x8F: /* JGb/JNLEb (jump greater) */
14726       { Int    jmpDelta;
14727         HChar* comment  = "";
14728         jmpDelta = (Int)getUDisp32(delta);
14729         d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
14730         delta += 4;
14731         if (resteerCisOk
14732             && vex_control.guest_chase_cond
14733             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
14734             && jmpDelta < 0
14735             && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
14736            /* Speculation: assume this backward branch is taken.  So
14737               we need to emit a side-exit to the insn following this
14738               one, on the negation of the condition, and continue at
14739               the branch target address (d32).  If we wind up back at
14740               the first instruction of the trace, just stop; it's
14741               better to let the IR loop unroller handle that case.*/
14742            stmt( IRStmt_Exit(
14743                     mk_x86g_calculate_condition((X86Condcode)
14744                                                 (1 ^ (opc - 0x80))),
14745                     Ijk_Boring,
14746                     IRConst_U32(guest_EIP_bbstart+delta) ) );
14747            dres.whatNext   = Dis_ResteerC;
14748            dres.continueAt = (Addr64)(Addr32)d32;
14749            comment = "(assumed taken)";
14750         }
14751         else
14752         if (resteerCisOk
14753             && vex_control.guest_chase_cond
14754             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
14755             && jmpDelta >= 0
14756             && resteerOkFn( callback_opaque,
14757                             (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
14758            /* Speculation: assume this forward branch is not taken.
14759               So we need to emit a side-exit to d32 (the dest) and
14760               continue disassembling at the insn immediately
14761               following this one. */
14762            stmt( IRStmt_Exit(
14763                     mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
14764                     Ijk_Boring,
14765                     IRConst_U32(d32) ) );
14766            dres.whatNext   = Dis_ResteerC;
14767            dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
14768            comment = "(assumed not taken)";
14769         }
14770         else {
14771            /* Conservative default translation - end the block at
14772               this point. */
14773            jcc_01( (X86Condcode)(opc - 0x80),
14774                    (Addr32)(guest_EIP_bbstart+delta), d32);
14775            dres.whatNext = Dis_StopHere;
14776         }
14777         DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
14778         break;
14779       }
14780
14781      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
14782      case 0x31: { /* RDTSC */
14783         IRTemp   val  = newTemp(Ity_I64);
14784         IRExpr** args = mkIRExprVec_0();
14785         IRDirty* d    = unsafeIRDirty_1_N (
14786                            val,
14787                            0/*regparms*/,
14788                            "x86g_dirtyhelper_RDTSC",
14789                            &x86g_dirtyhelper_RDTSC,
14790                            args
14791                         );
14792         /* execute the dirty call, dumping the result in val. */
14793         stmt( IRStmt_Dirty(d) );
14794         putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
14795         putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
14796         DIP("rdtsc\n");
14797         break;
14798      }
14799
14800      /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
14801
14802      case 0xA1: /* POP %FS */
14803         dis_pop_segreg( R_FS, sz ); break;
14804      case 0xA9: /* POP %GS */
14805         dis_pop_segreg( R_GS, sz ); break;
14806
14807      case 0xA0: /* PUSH %FS */
14808         dis_push_segreg( R_FS, sz ); break;
14809      case 0xA8: /* PUSH %GS */
14810         dis_push_segreg( R_GS, sz ); break;
14811
14812      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
14813      case 0x90:
14814      case 0x91:
14815      case 0x92: /* set-Bb/set-NAEb (jump below) */
14816      case 0x93: /* set-NBb/set-AEb (jump not below) */
14817      case 0x94: /* set-Zb/set-Eb (jump zero) */
14818      case 0x95: /* set-NZb/set-NEb (jump not zero) */
14819      case 0x96: /* set-BEb/set-NAb (jump below or equal) */
14820      case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
14821      case 0x98: /* set-Sb (jump negative) */
14822      case 0x99: /* set-Sb (jump not negative) */
14823      case 0x9A: /* set-P (jump parity even) */
14824      case 0x9B: /* set-NP (jump parity odd) */
14825      case 0x9C: /* set-Lb/set-NGEb (jump less) */
14826      case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
14827      case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
14828      case 0x9F: /* set-Gb/set-NLEb (jump greater) */
14829         t1 = newTemp(Ity_I8);
14830         assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
14831         modrm = getIByte(delta);
14832         if (epartIsReg(modrm)) {
14833            delta++;
14834            putIReg(1, eregOfRM(modrm), mkexpr(t1));
14835            DIP("set%s %s\n", name_X86Condcode(opc-0x90),
14836                              nameIReg(1,eregOfRM(modrm)));
14837         } else {
14838           addr = disAMode ( &alen, sorb, delta, dis_buf );
14839           delta += alen;
14840           storeLE( mkexpr(addr), mkexpr(t1) );
14841           DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
14842         }
14843         break;
14844
14845      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
14846
14847      case 0xA4: /* SHLDv imm8,Gv,Ev */
14848         modrm = getIByte(delta);
14849         d32   = delta + lengthAMode(delta);
14850         vex_sprintf(dis_buf, "$%d", getIByte(d32));
14851         delta = dis_SHLRD_Gv_Ev (
14852                  sorb, delta, modrm, sz,
14853                  mkU8(getIByte(d32)), True, /* literal */
14854                  dis_buf, True );
14855         break;
14856      case 0xA5: /* SHLDv %cl,Gv,Ev */
14857         modrm = getIByte(delta);
14858         delta = dis_SHLRD_Gv_Ev (
14859                    sorb, delta, modrm, sz,
14860                    getIReg(1,R_ECX), False, /* not literal */
14861                    "%cl", True );
14862         break;
14863
14864      case 0xAC: /* SHRDv imm8,Gv,Ev */
14865         modrm = getIByte(delta);
14866         d32   = delta + lengthAMode(delta);
14867         vex_sprintf(dis_buf, "$%d", getIByte(d32));
14868         delta = dis_SHLRD_Gv_Ev (
14869                    sorb, delta, modrm, sz,
14870                    mkU8(getIByte(d32)), True, /* literal */
14871                    dis_buf, False );
14872         break;
14873      case 0xAD: /* SHRDv %cl,Gv,Ev */
14874         modrm = getIByte(delta);
14875         delta = dis_SHLRD_Gv_Ev (
14876                    sorb, delta, modrm, sz,
14877                    getIReg(1,R_ECX), False, /* not literal */
14878                    "%cl", False );
14879         break;
14880
14881      /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
14882
14883      case 0x34:
14884         /* Simple implementation needing a long explaination.
14885
14886            sysenter is a kind of syscall entry.  The key thing here
14887            is that the return address is not known -- that is
14888            something that is beyond Vex's knowledge.  So this IR
14889            forces a return to the scheduler, which can do what it
14890            likes to simulate the systenter, but it MUST set this
14891            thread's guest_EIP field with the continuation address
14892            before resuming execution.  If that doesn't happen, the
14893            thread will jump to address zero, which is probably
14894            fatal.
14895         */
14896
14897         /* Note where we are, so we can back up the guest to this
14898            point if the syscall needs to be restarted. */
14899         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
14900                           mkU32(guest_EIP_curr_instr) ) );
14901         jmp_lit(Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
14902         dres.whatNext = Dis_StopHere;
14903         DIP("sysenter");
14904         break;
14905
14906      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
14907
14908      case 0xC0: { /* XADD Gb,Eb */
14909         Bool decodeOK;
14910         delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
14911         if (!decodeOK) goto decode_failure;
14912         break;
14913      }
14914      case 0xC1: { /* XADD Gv,Ev */
14915         Bool decodeOK;
14916         delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
14917         if (!decodeOK) goto decode_failure;
14918         break;
14919      }
14920
14921      /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
14922
14923      case 0x71:
14924      case 0x72:
14925      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
14926
14927      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
14928      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
14929      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
14930      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
14931
14932      case 0xFC:
14933      case 0xFD:
14934      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
14935
14936      case 0xEC:
14937      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
14938
14939      case 0xDC:
14940      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
14941
14942      case 0xF8:
14943      case 0xF9:
14944      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
14945
14946      case 0xE8:
14947      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
14948
14949      case 0xD8:
14950      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
14951
14952      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
14953      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
14954
14955      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
14956
14957      case 0x74:
14958      case 0x75:
14959      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
14960
14961      case 0x64:
14962      case 0x65:
14963      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
14964
14965      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
14966      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
14967      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
14968
14969      case 0x68:
14970      case 0x69:
14971      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
14972
14973      case 0x60:
14974      case 0x61:
14975      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
14976
14977      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
14978      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
14979      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
14980      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
14981
14982      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
14983      case 0xF2:
14984      case 0xF3:
14985
14986      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
14987      case 0xD2:
14988      case 0xD3:
14989
14990      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
14991      case 0xE2:
14992      {
14993         Int  delta0    = delta-1;
14994         Bool decode_OK = False;
14995
14996         /* If sz==2 this is SSE, and we assume sse idec has
14997            already spotted those cases by now. */
14998         if (sz != 4)
14999            goto decode_failure;
15000
15001         delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
15002         if (!decode_OK) {
15003            delta = delta0;
15004            goto decode_failure;
15005         }
15006         break;
15007      }
15008
15009      case 0x0E: /* FEMMS */
15010      case 0x77: /* EMMS */
15011         if (sz != 4)
15012            goto decode_failure;
15013         do_EMMS_preamble();
15014         DIP("{f}emms\n");
15015         break;
15016
15017      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
15018      case 0x01: /* 0F 01 /0 -- SGDT */
15019                 /* 0F 01 /1 -- SIDT */
15020      {
15021          /* This is really revolting, but ... since each processor
15022             (core) only has one IDT and one GDT, just let the guest
15023             see it (pass-through semantics).  I can't see any way to
15024             construct a faked-up value, so don't bother to try. */
15025         modrm = getUChar(delta);
15026         addr = disAMode ( &alen, sorb, delta, dis_buf );
15027         delta += alen;
15028         if (epartIsReg(modrm)) goto decode_failure;
15029         if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
15030            goto decode_failure;
15031         switch (gregOfRM(modrm)) {
15032            case 0: DIP("sgdt %s\n", dis_buf); break;
15033            case 1: DIP("sidt %s\n", dis_buf); break;
15034            default: vassert(0); /*NOTREACHED*/
15035         }
15036
15037         IRDirty* d = unsafeIRDirty_0_N (
15038                          0/*regparms*/,
15039                          "x86g_dirtyhelper_SxDT",
15040                          &x86g_dirtyhelper_SxDT,
15041                          mkIRExprVec_2( mkexpr(addr),
15042                                         mkU32(gregOfRM(modrm)) )
15043                      );
15044         /* declare we're writing memory */
15045         d->mFx   = Ifx_Write;
15046         d->mAddr = mkexpr(addr);
15047         d->mSize = 6;
15048         stmt( IRStmt_Dirty(d) );
15049         break;
15050      }
15051
15052      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
15053
15054      default:
15055         goto decode_failure;
15056   } /* switch (opc) for the 2-byte opcodes */
15057   goto decode_success;
15058   } /* case 0x0F: of primary opcode */
15059
15060   /* ------------------------ ??? ------------------------ */
15061
15062  default:
15063  decode_failure:
15064   /* All decode failures end up here. */
15065   vex_printf("vex x86->IR: unhandled instruction bytes: "
15066              "0x%x 0x%x 0x%x 0x%x\n",
15067              (Int)getIByte(delta_start+0),
15068              (Int)getIByte(delta_start+1),
15069              (Int)getIByte(delta_start+2),
15070              (Int)getIByte(delta_start+3) );
15071
15072   /* Tell the dispatcher that this insn cannot be decoded, and so has
15073      not been executed, and (is currently) the next to be executed.
15074      EIP should be up-to-date since it made so at the start of each
15075      insn, but nevertheless be paranoid and update it again right
15076      now. */
15077   stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
15078   jmp_lit(Ijk_NoDecode, guest_EIP_curr_instr);
15079   dres.whatNext = Dis_StopHere;
15080   dres.len = 0;
15081   /* We also need to say that a CAS is not expected now, regardless
15082      of what it might have been set to at the start of the function,
15083      since the IR that we've emitted just above (to synthesis a
15084      SIGILL) does not involve any CAS, and presumably no other IR has
15085      been emitted for this (non-decoded) insn. */
15086   *expect_CAS = False;
15087   return dres;
15088
15089   } /* switch (opc) for the main (primary) opcode switch. */
15090
15091  decode_success:
15092   /* All decode successes end up here. */
15093   DIP("\n");
15094   dres.len = delta - delta_start;
15095   return dres;
15096}
15097
15098#undef DIP
15099#undef DIS
15100
15101
15102/*------------------------------------------------------------*/
15103/*--- Top-level fn                                         ---*/
15104/*------------------------------------------------------------*/
15105
15106/* Disassemble a single instruction into IR.  The instruction
15107   is located in host memory at &guest_code[delta]. */
15108
15109DisResult disInstr_X86 ( IRSB*        irsb_IN,
15110                         Bool         put_IP,
15111                         Bool         (*resteerOkFn) ( void*, Addr64 ),
15112                         Bool         resteerCisOk,
15113                         void*        callback_opaque,
15114                         UChar*       guest_code_IN,
15115                         Long         delta,
15116                         Addr64       guest_IP,
15117                         VexArch      guest_arch,
15118                         VexArchInfo* archinfo,
15119                         VexAbiInfo*  abiinfo,
15120                         Bool         host_bigendian_IN )
15121{
15122   Int       i, x1, x2;
15123   Bool      expect_CAS, has_CAS;
15124   DisResult dres;
15125
15126   /* Set globals (see top of this file) */
15127   vassert(guest_arch == VexArchX86);
15128   guest_code           = guest_code_IN;
15129   irsb                 = irsb_IN;
15130   host_is_bigendian    = host_bigendian_IN;
15131   guest_EIP_curr_instr = (Addr32)guest_IP;
15132   guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
15133
15134   x1 = irsb_IN->stmts_used;
15135   expect_CAS = False;
15136   dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
15137                             resteerCisOk,
15138                             callback_opaque,
15139                             delta, archinfo, abiinfo );
15140   x2 = irsb_IN->stmts_used;
15141   vassert(x2 >= x1);
15142
15143   /* See comment at the top of disInstr_X86_WRK for meaning of
15144      expect_CAS.  Here, we (sanity-)check for the presence/absence of
15145      IRCAS as directed by the returned expect_CAS value. */
15146   has_CAS = False;
15147   for (i = x1; i < x2; i++) {
15148      if (irsb_IN->stmts[i]->tag == Ist_CAS)
15149         has_CAS = True;
15150   }
15151
15152   if (expect_CAS != has_CAS) {
15153      /* inconsistency detected.  re-disassemble the instruction so as
15154         to generate a useful error message; then assert. */
15155      vex_traceflags |= VEX_TRACE_FE;
15156      dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
15157                                resteerCisOk,
15158                                callback_opaque,
15159                                delta, archinfo, abiinfo );
15160      for (i = x1; i < x2; i++) {
15161         vex_printf("\t\t");
15162         ppIRStmt(irsb_IN->stmts[i]);
15163         vex_printf("\n");
15164      }
15165      /* Failure of this assertion is serious and denotes a bug in
15166         disInstr. */
15167      vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
15168   }
15169
15170   return dres;
15171}
15172
15173
15174/*--------------------------------------------------------------------*/
15175/*--- end                                         guest_x86_toIR.c ---*/
15176/*--------------------------------------------------------------------*/
15177