1
2/*--------------------------------------------------------------------*/
3/*--- begin                                       guest_x86_toIR.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2015 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36/* Translates x86 code to IR. */
37
38/* TODO:
39
40   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
41   to ensure a 32-bit value is being written.
42
43   FUCOMI(P): what happens to A and S flags?  Currently are forced
44      to zero.
45
46   x87 FP Limitations:
47
48   * all arithmetic done at 64 bits
49
50   * no FP exceptions, except for handling stack over/underflow
51
52   * FP rounding mode observed only for float->int conversions
53     and int->float conversions which could lose accuracy, and
54     for float-to-float rounding.  For all other operations,
55     round-to-nearest is used, regardless.
56
57   * some of the FCOM cases could do with testing -- not convinced
58     that the args are the right way round.
59
60   * FSAVE does not re-initialise the FPU; it should do
61
62   * FINIT not only initialises the FPU environment, it also
63     zeroes all the FP registers.  It should leave the registers
64     unchanged.
65
66   SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
67   per Intel docs this bit has no meaning anyway.  Since PUSHF is the
68   only way to observe eflags[1], a proper fix would be to make that
69   bit be set by PUSHF.
70
71   The state of %eflags.AC (alignment check, bit 18) is recorded by
72   the simulation (viz, if you set it with popf then a pushf produces
73   the value you set it to), but it is otherwise ignored.  In
74   particular, setting it to 1 does NOT cause alignment checking to
75   happen.  Programs that set it to 1 and then rely on the resulting
76   SIGBUSs to inform them of misaligned accesses will not work.
77
78   Implementation of sysenter is necessarily partial.  sysenter is a
79   kind of system call entry.  When doing a sysenter, the return
80   address is not known -- that is something that is beyond Vex's
81   knowledge.  So the generated IR forces a return to the scheduler,
82   which can do what it likes to simulate the systenter, but it MUST
83   set this thread's guest_EIP field with the continuation address
84   before resuming execution.  If that doesn't happen, the thread will
85   jump to address zero, which is probably fatal.
86
87   This module uses global variables and so is not MT-safe (if that
88   should ever become relevant).
89
90   The delta values are 32-bit ints, not 64-bit ints.  That means
91   this module may not work right if run on a 64-bit host.  That should
92   be fixed properly, really -- if anyone ever wants to use Vex to
93   translate x86 code for execution on a 64-bit host.
94
95   casLE (implementation of lock-prefixed insns) and rep-prefixed
96   insns: the side-exit back to the start of the insn is done with
97   Ijk_Boring.  This is quite wrong, it should be done with
98   Ijk_NoRedir, since otherwise the side exit, which is intended to
99   restart the instruction for whatever reason, could go somewhere
100   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
101   no-redir jumps performance critical, at least for rep-prefixed
102   instructions, since all iterations thereof would involve such a
103   jump.  It's not such a big deal with casLE since the side exit is
104   only taken if the CAS fails, that is, the location is contended,
105   which is relatively unlikely.
106
107   XXXX: Nov 2009: handling of SWP on ARM suffers from the same
108   problem.
109
110   Note also, the test for CAS success vs failure is done using
111   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
112   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
113   shouldn't definedness-check these comparisons.  See
114   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
115   background/rationale.
116*/
117
118/* Performance holes:
119
120   - fcom ; fstsw %ax ; sahf
121     sahf does not update the O flag (sigh) and so O needs to
122     be computed.  This is done expensively; it would be better
123     to have a calculate_eflags_o helper.
124
125   - emwarns; some FP codes can generate huge numbers of these
126     if the fpucw is changed in an inner loop.  It would be
127     better for the guest state to have an emwarn-enable reg
128     which can be set zero or nonzero.  If it is zero, emwarns
129     are not flagged, and instead control just flows all the
130     way through bbs as usual.
131*/
132
133/* "Special" instructions.
134
135   This instruction decoder can decode three special instructions
136   which mean nothing natively (are no-ops as far as regs/mem are
137   concerned) but have meaning for supporting Valgrind.  A special
138   instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
139   C1C713 (in the standard interpretation, that means: roll $3, %edi;
140   roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
141   one of the following 3 are allowed (standard interpretation in
142   parentheses):
143
144      87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
145      87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
146      87D2 (xchgl %edx,%edx)   call-noredir *%EAX
147      87FF (xchgl %edi,%edi)   IR injection
148
149   Any other bytes following the 12-byte preamble are illegal and
150   constitute a failure in instruction decoding.  This all assumes
151   that the preamble will never occur except in specific code
152   fragments designed for Valgrind to catch.
153
154   No prefixes may precede a "Special" instruction.
155*/
156
157/* LOCK prefixed instructions.  These are translated using IR-level
158   CAS statements (IRCAS) and are believed to preserve atomicity, even
159   from the point of view of some other process racing against a
160   simulated one (presumably they communicate via a shared memory
161   segment).
162
163   Handlers which are aware of LOCK prefixes are:
164      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
165      dis_cmpxchg_G_E  (cmpxchg)
166      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
167      dis_Grp3         (not, neg)
168      dis_Grp4         (inc, dec)
169      dis_Grp5         (inc, dec)
170      dis_Grp8_Imm     (bts, btc, btr)
171      dis_bt_G_E       (bts, btc, btr)
172      dis_xadd_G_E     (xadd)
173*/
174
175
176#include "libvex_basictypes.h"
177#include "libvex_ir.h"
178#include "libvex.h"
179#include "libvex_guest_x86.h"
180
181#include "main_util.h"
182#include "main_globals.h"
183#include "guest_generic_bb_to_IR.h"
184#include "guest_generic_x87.h"
185#include "guest_x86_defs.h"
186
187
188/*------------------------------------------------------------*/
189/*--- Globals                                              ---*/
190/*------------------------------------------------------------*/
191
192/* These are set at the start of the translation of an insn, right
193   down in disInstr_X86, so that we don't have to pass them around
194   endlessly.  They are all constant during the translation of any
195   given insn. */
196
197/* We need to know this to do sub-register accesses correctly. */
198static VexEndness host_endness;
199
200/* Pointer to the guest code area (points to start of BB, not to the
201   insn being processed). */
202static const UChar* guest_code;
203
204/* The guest address corresponding to guest_code[0]. */
205static Addr32 guest_EIP_bbstart;
206
207/* The guest address for the instruction currently being
208   translated. */
209static Addr32 guest_EIP_curr_instr;
210
211/* The IRSB* into which we're generating code. */
212static IRSB* irsb;
213
214
215/*------------------------------------------------------------*/
216/*--- Debugging output                                     ---*/
217/*------------------------------------------------------------*/
218
219#define DIP(format, args...)           \
220   if (vex_traceflags & VEX_TRACE_FE)  \
221      vex_printf(format, ## args)
222
223#define DIS(buf, format, args...)      \
224   if (vex_traceflags & VEX_TRACE_FE)  \
225      vex_sprintf(buf, format, ## args)
226
227
228/*------------------------------------------------------------*/
229/*--- Offsets of various parts of the x86 guest state.     ---*/
230/*------------------------------------------------------------*/
231
232#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
233#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
234#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
235#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
236#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
237#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
238#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
239#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
240
241#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
242
243#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
244#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
245#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
246#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
247
248#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
249#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
250#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
251#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
252#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
253#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
254#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
255#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
256
257#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
258#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
259#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
260#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
261#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
262#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
263#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
264#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
265
266#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
267#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
268#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
269#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
270#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
271#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
272#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
273#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
274#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
275
276#define OFFB_EMNOTE    offsetof(VexGuestX86State,guest_EMNOTE)
277
278#define OFFB_CMSTART   offsetof(VexGuestX86State,guest_CMSTART)
279#define OFFB_CMLEN     offsetof(VexGuestX86State,guest_CMLEN)
280#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
281
282#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
283
284
285/*------------------------------------------------------------*/
286/*--- Helper bits and pieces for deconstructing the        ---*/
287/*--- x86 insn stream.                                     ---*/
288/*------------------------------------------------------------*/
289
290/* This is the Intel register encoding -- integer regs. */
291#define R_EAX 0
292#define R_ECX 1
293#define R_EDX 2
294#define R_EBX 3
295#define R_ESP 4
296#define R_EBP 5
297#define R_ESI 6
298#define R_EDI 7
299
300#define R_AL (0+R_EAX)
301#define R_AH (4+R_EAX)
302
303/* This is the Intel register encoding -- segment regs. */
304#define R_ES 0
305#define R_CS 1
306#define R_SS 2
307#define R_DS 3
308#define R_FS 4
309#define R_GS 5
310
311
312/* Add a statement to the list held by "irbb". */
313static void stmt ( IRStmt* st )
314{
315   addStmtToIRSB( irsb, st );
316}
317
318/* Generate a new temporary of the given type. */
319static IRTemp newTemp ( IRType ty )
320{
321   vassert(isPlausibleIRType(ty));
322   return newIRTemp( irsb->tyenv, ty );
323}
324
325/* Various simple conversions */
326
327static UInt extend_s_8to32( UInt x )
328{
329   return (UInt)((Int)(x << 24) >> 24);
330}
331
332static UInt extend_s_16to32 ( UInt x )
333{
334  return (UInt)((Int)(x << 16) >> 16);
335}
336
337/* Fetch a byte from the guest insn stream. */
338static UChar getIByte ( Int delta )
339{
340   return guest_code[delta];
341}
342
343/* Extract the reg field from a modRM byte. */
344static Int gregOfRM ( UChar mod_reg_rm )
345{
346   return (Int)( (mod_reg_rm >> 3) & 7 );
347}
348
349/* Figure out whether the mod and rm parts of a modRM byte refer to a
350   register or memory.  If so, the byte will have the form 11XXXYYY,
351   where YYY is the register number. */
352static Bool epartIsReg ( UChar mod_reg_rm )
353{
354   return toBool(0xC0 == (mod_reg_rm & 0xC0));
355}
356
357/* ... and extract the register number ... */
358static Int eregOfRM ( UChar mod_reg_rm )
359{
360   return (Int)(mod_reg_rm & 0x7);
361}
362
363/* Get a 8/16/32-bit unsigned value out of the insn stream. */
364
365static UChar getUChar ( Int delta )
366{
367   UChar v = guest_code[delta+0];
368   return toUChar(v);
369}
370
371static UInt getUDisp16 ( Int delta )
372{
373   UInt v = guest_code[delta+1]; v <<= 8;
374   v |= guest_code[delta+0];
375   return v & 0xFFFF;
376}
377
378static UInt getUDisp32 ( Int delta )
379{
380   UInt v = guest_code[delta+3]; v <<= 8;
381   v |= guest_code[delta+2]; v <<= 8;
382   v |= guest_code[delta+1]; v <<= 8;
383   v |= guest_code[delta+0];
384   return v;
385}
386
387static UInt getUDisp ( Int size, Int delta )
388{
389   switch (size) {
390      case 4: return getUDisp32(delta);
391      case 2: return getUDisp16(delta);
392      case 1: return (UInt)getUChar(delta);
393      default: vpanic("getUDisp(x86)");
394   }
395   return 0; /*notreached*/
396}
397
398
399/* Get a byte value out of the insn stream and sign-extend to 32
400   bits. */
401static UInt getSDisp8 ( Int delta )
402{
403   return extend_s_8to32( (UInt) (guest_code[delta]) );
404}
405
406static UInt getSDisp16 ( Int delta0 )
407{
408   const UChar* eip = &guest_code[delta0];
409   UInt d = *eip++;
410   d |= ((*eip++) << 8);
411   return extend_s_16to32(d);
412}
413
414static UInt getSDisp ( Int size, Int delta )
415{
416   switch (size) {
417      case 4: return getUDisp32(delta);
418      case 2: return getSDisp16(delta);
419      case 1: return getSDisp8(delta);
420      default: vpanic("getSDisp(x86)");
421  }
422  return 0; /*notreached*/
423}
424
425
426/*------------------------------------------------------------*/
427/*--- Helpers for constructing IR.                         ---*/
428/*------------------------------------------------------------*/
429
430/* Create a 1/2/4 byte read of an x86 integer registers.  For 16/8 bit
431   register references, we need to take the host endianness into
432   account.  Supplied value is 0 .. 7 and in the Intel instruction
433   encoding. */
434
435static IRType szToITy ( Int n )
436{
437   switch (n) {
438      case 1: return Ity_I8;
439      case 2: return Ity_I16;
440      case 4: return Ity_I32;
441      default: vpanic("szToITy(x86)");
442   }
443}
444
445/* On a little-endian host, less significant bits of the guest
446   registers are at lower addresses.  Therefore, if a reference to a
447   register low half has the safe guest state offset as a reference to
448   the full register.
449*/
450static Int integerGuestRegOffset ( Int sz, UInt archreg )
451{
452   vassert(archreg < 8);
453
454   /* Correct for little-endian host only. */
455   vassert(host_endness == VexEndnessLE);
456
457   if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
458      switch (archreg) {
459         case R_EAX: return OFFB_EAX;
460         case R_EBX: return OFFB_EBX;
461         case R_ECX: return OFFB_ECX;
462         case R_EDX: return OFFB_EDX;
463         case R_ESI: return OFFB_ESI;
464         case R_EDI: return OFFB_EDI;
465         case R_ESP: return OFFB_ESP;
466         case R_EBP: return OFFB_EBP;
467         default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
468      }
469   }
470
471   vassert(archreg >= 4 && archreg < 8 && sz == 1);
472   switch (archreg-4) {
473      case R_EAX: return 1+ OFFB_EAX;
474      case R_EBX: return 1+ OFFB_EBX;
475      case R_ECX: return 1+ OFFB_ECX;
476      case R_EDX: return 1+ OFFB_EDX;
477      default: vpanic("integerGuestRegOffset(x86,le)(1h)");
478   }
479
480   /* NOTREACHED */
481   vpanic("integerGuestRegOffset(x86,le)");
482}
483
484static Int segmentGuestRegOffset ( UInt sreg )
485{
486   switch (sreg) {
487      case R_ES: return OFFB_ES;
488      case R_CS: return OFFB_CS;
489      case R_SS: return OFFB_SS;
490      case R_DS: return OFFB_DS;
491      case R_FS: return OFFB_FS;
492      case R_GS: return OFFB_GS;
493      default: vpanic("segmentGuestRegOffset(x86)");
494   }
495}
496
497static Int xmmGuestRegOffset ( UInt xmmreg )
498{
499   switch (xmmreg) {
500      case 0: return OFFB_XMM0;
501      case 1: return OFFB_XMM1;
502      case 2: return OFFB_XMM2;
503      case 3: return OFFB_XMM3;
504      case 4: return OFFB_XMM4;
505      case 5: return OFFB_XMM5;
506      case 6: return OFFB_XMM6;
507      case 7: return OFFB_XMM7;
508      default: vpanic("xmmGuestRegOffset");
509   }
510}
511
512/* Lanes of vector registers are always numbered from zero being the
513   least significant lane (rightmost in the register).  */
514
515static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
516{
517   /* Correct for little-endian host only. */
518   vassert(host_endness == VexEndnessLE);
519   vassert(laneno >= 0 && laneno < 8);
520   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
521}
522
523static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
524{
525   /* Correct for little-endian host only. */
526   vassert(host_endness == VexEndnessLE);
527   vassert(laneno >= 0 && laneno < 4);
528   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
529}
530
531static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
532{
533   /* Correct for little-endian host only. */
534   vassert(host_endness == VexEndnessLE);
535   vassert(laneno >= 0 && laneno < 2);
536   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
537}
538
539static IRExpr* getIReg ( Int sz, UInt archreg )
540{
541   vassert(sz == 1 || sz == 2 || sz == 4);
542   vassert(archreg < 8);
543   return IRExpr_Get( integerGuestRegOffset(sz,archreg),
544                      szToITy(sz) );
545}
546
547/* Ditto, but write to a reg instead. */
548static void putIReg ( Int sz, UInt archreg, IRExpr* e )
549{
550   IRType ty = typeOfIRExpr(irsb->tyenv, e);
551   switch (sz) {
552      case 1: vassert(ty == Ity_I8); break;
553      case 2: vassert(ty == Ity_I16); break;
554      case 4: vassert(ty == Ity_I32); break;
555      default: vpanic("putIReg(x86)");
556   }
557   vassert(archreg < 8);
558   stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
559}
560
561static IRExpr* getSReg ( UInt sreg )
562{
563   return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
564}
565
566static void putSReg ( UInt sreg, IRExpr* e )
567{
568   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
569   stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
570}
571
572static IRExpr* getXMMReg ( UInt xmmreg )
573{
574   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
575}
576
577static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
578{
579   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
580}
581
582static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
583{
584   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
585}
586
587static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
588{
589   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
590}
591
592static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
593{
594   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
595}
596
597static void putXMMReg ( UInt xmmreg, IRExpr* e )
598{
599   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
600   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
601}
602
603static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
604{
605   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
606   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
607}
608
609static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
610{
611   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
612   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
613}
614
615static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
616{
617   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
618   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
619}
620
621static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
622{
623   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
624   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
625}
626
627static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
628{
629   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
630   stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
631}
632
633static void assign ( IRTemp dst, IRExpr* e )
634{
635   stmt( IRStmt_WrTmp(dst, e) );
636}
637
638static void storeLE ( IRExpr* addr, IRExpr* data )
639{
640   stmt( IRStmt_Store(Iend_LE, addr, data) );
641}
642
643static IRExpr* unop ( IROp op, IRExpr* a )
644{
645   return IRExpr_Unop(op, a);
646}
647
648static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
649{
650   return IRExpr_Binop(op, a1, a2);
651}
652
653static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
654{
655   return IRExpr_Triop(op, a1, a2, a3);
656}
657
658static IRExpr* mkexpr ( IRTemp tmp )
659{
660   return IRExpr_RdTmp(tmp);
661}
662
663static IRExpr* mkU8 ( UInt i )
664{
665   vassert(i < 256);
666   return IRExpr_Const(IRConst_U8( (UChar)i ));
667}
668
669static IRExpr* mkU16 ( UInt i )
670{
671   vassert(i < 65536);
672   return IRExpr_Const(IRConst_U16( (UShort)i ));
673}
674
675static IRExpr* mkU32 ( UInt i )
676{
677   return IRExpr_Const(IRConst_U32(i));
678}
679
680static IRExpr* mkU64 ( ULong i )
681{
682   return IRExpr_Const(IRConst_U64(i));
683}
684
685static IRExpr* mkU ( IRType ty, UInt i )
686{
687   if (ty == Ity_I8)  return mkU8(i);
688   if (ty == Ity_I16) return mkU16(i);
689   if (ty == Ity_I32) return mkU32(i);
690   /* If this panics, it usually means you passed a size (1,2,4)
691      value as the IRType, rather than a real IRType. */
692   vpanic("mkU(x86)");
693}
694
695static IRExpr* mkV128 ( UShort mask )
696{
697   return IRExpr_Const(IRConst_V128(mask));
698}
699
700static IRExpr* loadLE ( IRType ty, IRExpr* addr )
701{
702   return IRExpr_Load(Iend_LE, ty, addr);
703}
704
705static IROp mkSizedOp ( IRType ty, IROp op8 )
706{
707   Int adj;
708   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
709   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
710           || op8 == Iop_Mul8
711           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
712           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
713           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
714           || op8 == Iop_CasCmpNE8
715           || op8 == Iop_ExpCmpNE8
716           || op8 == Iop_Not8);
717   adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
718   return adj + op8;
719}
720
721static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
722{
723   if (szSmall == 1 && szBig == 4) {
724      return signd ? Iop_8Sto32 : Iop_8Uto32;
725   }
726   if (szSmall == 1 && szBig == 2) {
727      return signd ? Iop_8Sto16 : Iop_8Uto16;
728   }
729   if (szSmall == 2 && szBig == 4) {
730      return signd ? Iop_16Sto32 : Iop_16Uto32;
731   }
732   vpanic("mkWidenOp(x86,guest)");
733}
734
735static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
736{
737   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
738   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
739   return unop(Iop_32to1,
740               binop(Iop_And32,
741                     unop(Iop_1Uto32,x),
742                     unop(Iop_1Uto32,y)));
743}
744
745/* Generate a compare-and-swap operation, operating on memory at
746   'addr'.  The expected value is 'expVal' and the new value is
747   'newVal'.  If the operation fails, then transfer control (with a
748   no-redir jump (XXX no -- see comment at top of this file)) to
749   'restart_point', which is presumably the address of the guest
750   instruction again -- retrying, essentially. */
751static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
752                    Addr32 restart_point )
753{
754   IRCAS* cas;
755   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
756   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
757   IRTemp oldTmp = newTemp(tyE);
758   IRTemp expTmp = newTemp(tyE);
759   vassert(tyE == tyN);
760   vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
761   assign(expTmp, expVal);
762   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
763                  NULL, mkexpr(expTmp), NULL, newVal );
764   stmt( IRStmt_CAS(cas) );
765   stmt( IRStmt_Exit(
766            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
767                   mkexpr(oldTmp), mkexpr(expTmp) ),
768            Ijk_Boring, /*Ijk_NoRedir*/
769            IRConst_U32( restart_point ),
770            OFFB_EIP
771         ));
772}
773
774
775/*------------------------------------------------------------*/
776/*--- Helpers for %eflags.                                 ---*/
777/*------------------------------------------------------------*/
778
779/* -------------- Evaluating the flags-thunk. -------------- */
780
781/* Build IR to calculate all the eflags from stored
782   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
783   Ity_I32. */
784static IRExpr* mk_x86g_calculate_eflags_all ( void )
785{
786   IRExpr** args
787      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
788                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
789                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
790                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
791   IRExpr* call
792      = mkIRExprCCall(
793           Ity_I32,
794           0/*regparm*/,
795           "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
796           args
797        );
798   /* Exclude OP and NDEP from definedness checking.  We're only
799      interested in DEP1 and DEP2. */
800   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
801   return call;
802}
803
804/* Build IR to calculate some particular condition from stored
805   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
806   Ity_Bit. */
807static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
808{
809   IRExpr** args
810      = mkIRExprVec_5( mkU32(cond),
811                       IRExpr_Get(OFFB_CC_OP,  Ity_I32),
812                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
813                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
814                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
815   IRExpr* call
816      = mkIRExprCCall(
817           Ity_I32,
818           0/*regparm*/,
819           "x86g_calculate_condition", &x86g_calculate_condition,
820           args
821        );
822   /* Exclude the requested condition, OP and NDEP from definedness
823      checking.  We're only interested in DEP1 and DEP2. */
824   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
825   return unop(Iop_32to1, call);
826}
827
828/* Build IR to calculate just the carry flag from stored
829   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
830static IRExpr* mk_x86g_calculate_eflags_c ( void )
831{
832   IRExpr** args
833      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
834                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
835                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
836                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
837   IRExpr* call
838      = mkIRExprCCall(
839           Ity_I32,
840           3/*regparm*/,
841           "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
842           args
843        );
844   /* Exclude OP and NDEP from definedness checking.  We're only
845      interested in DEP1 and DEP2. */
846   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
847   return call;
848}
849
850
851/* -------------- Building the flags-thunk. -------------- */
852
853/* The machinery in this section builds the flag-thunk following a
854   flag-setting operation.  Hence the various setFlags_* functions.
855*/
856
857static Bool isAddSub ( IROp op8 )
858{
859   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
860}
861
862static Bool isLogic ( IROp op8 )
863{
864   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
865}
866
867/* U-widen 8/16/32 bit int expr to 32. */
868static IRExpr* widenUto32 ( IRExpr* e )
869{
870   switch (typeOfIRExpr(irsb->tyenv,e)) {
871      case Ity_I32: return e;
872      case Ity_I16: return unop(Iop_16Uto32,e);
873      case Ity_I8:  return unop(Iop_8Uto32,e);
874      default: vpanic("widenUto32");
875   }
876}
877
878/* S-widen 8/16/32 bit int expr to 32. */
879static IRExpr* widenSto32 ( IRExpr* e )
880{
881   switch (typeOfIRExpr(irsb->tyenv,e)) {
882      case Ity_I32: return e;
883      case Ity_I16: return unop(Iop_16Sto32,e);
884      case Ity_I8:  return unop(Iop_8Sto32,e);
885      default: vpanic("widenSto32");
886   }
887}
888
889/* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
890   of these combinations make sense. */
891static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
892{
893   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
894   if (src_ty == dst_ty)
895      return e;
896   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
897      return unop(Iop_32to16, e);
898   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
899      return unop(Iop_32to8, e);
900
901   vex_printf("\nsrc, dst tys are: ");
902   ppIRType(src_ty);
903   vex_printf(", ");
904   ppIRType(dst_ty);
905   vex_printf("\n");
906   vpanic("narrowTo(x86)");
907}
908
909
910/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
911   auto-sized up to the real op. */
912
913static
914void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
915{
916   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
917
918   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
919
920   switch (op8) {
921      case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
922      case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
923      default:       ppIROp(op8);
924                     vpanic("setFlags_DEP1_DEP2(x86)");
925   }
926   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
927   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
928   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
929   /* Set NDEP even though it isn't used.  This makes redundant-PUT
930      elimination of previous stores to this field work better. */
931   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
932}
933
934
935/* Set the OP and DEP1 fields only, and write zero to DEP2. */
936
937static
938void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
939{
940   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
941
942   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
943
944   switch (op8) {
945      case Iop_Or8:
946      case Iop_And8:
947      case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
948      default:       ppIROp(op8);
949                     vpanic("setFlags_DEP1(x86)");
950   }
951   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
952   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
953   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
954   /* Set NDEP even though it isn't used.  This makes redundant-PUT
955      elimination of previous stores to this field work better. */
956   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
957}
958
959
960/* For shift operations, we put in the result and the undershifted
961   result.  Except if the shift amount is zero, the thunk is left
962   unchanged. */
963
964static void setFlags_DEP1_DEP2_shift ( IROp    op32,
965                                       IRTemp  res,
966                                       IRTemp  resUS,
967                                       IRType  ty,
968                                       IRTemp  guard )
969{
970   Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
971
972   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
973   vassert(guard);
974
975   /* Both kinds of right shifts are handled by the same thunk
976      operation. */
977   switch (op32) {
978      case Iop_Shr32:
979      case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
980      case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
981      default:        ppIROp(op32);
982                      vpanic("setFlags_DEP1_DEP2_shift(x86)");
983   }
984
985   /* guard :: Ity_I8.  We need to convert it to I1. */
986   IRTemp guardB = newTemp(Ity_I1);
987   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
988
989   /* DEP1 contains the result, DEP2 contains the undershifted value. */
990   stmt( IRStmt_Put( OFFB_CC_OP,
991                     IRExpr_ITE( mkexpr(guardB),
992                                 mkU32(ccOp),
993                                 IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
994   stmt( IRStmt_Put( OFFB_CC_DEP1,
995                     IRExpr_ITE( mkexpr(guardB),
996                                 widenUto32(mkexpr(res)),
997                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
998   stmt( IRStmt_Put( OFFB_CC_DEP2,
999                     IRExpr_ITE( mkexpr(guardB),
1000                                 widenUto32(mkexpr(resUS)),
1001                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
1002   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1003      elimination of previous stores to this field work better. */
1004   stmt( IRStmt_Put( OFFB_CC_NDEP,
1005                     IRExpr_ITE( mkexpr(guardB),
1006                                 mkU32(0),
1007                                 IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
1008}
1009
1010
1011/* For the inc/dec case, we store in DEP1 the result value and in NDEP
1012   the former value of the carry flag, which unfortunately we have to
1013   compute. */
1014
1015static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
1016{
1017   Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
1018
1019   ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
1020   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
1021
1022   /* This has to come first, because calculating the C flag
1023      may require reading all four thunk fields. */
1024   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
1025   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
1026   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
1027   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
1028}
1029
1030
1031/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1032   two arguments. */
1033
1034static
1035void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
1036{
1037   switch (ty) {
1038      case Ity_I8:
1039         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
1040         break;
1041      case Ity_I16:
1042         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
1043         break;
1044      case Ity_I32:
1045         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
1046         break;
1047      default:
1048         vpanic("setFlags_MUL(x86)");
1049   }
1050   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
1051   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
1052   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1053      elimination of previous stores to this field work better. */
1054   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
1055}
1056
1057
1058/* -------------- Condition codes. -------------- */
1059
1060/* Condition codes, using the Intel encoding.  */
1061
1062static const HChar* name_X86Condcode ( X86Condcode cond )
1063{
1064   switch (cond) {
1065      case X86CondO:      return "o";
1066      case X86CondNO:     return "no";
1067      case X86CondB:      return "b";
1068      case X86CondNB:     return "nb";
1069      case X86CondZ:      return "z";
1070      case X86CondNZ:     return "nz";
1071      case X86CondBE:     return "be";
1072      case X86CondNBE:    return "nbe";
1073      case X86CondS:      return "s";
1074      case X86CondNS:     return "ns";
1075      case X86CondP:      return "p";
1076      case X86CondNP:     return "np";
1077      case X86CondL:      return "l";
1078      case X86CondNL:     return "nl";
1079      case X86CondLE:     return "le";
1080      case X86CondNLE:    return "nle";
1081      case X86CondAlways: return "ALWAYS";
1082      default: vpanic("name_X86Condcode");
1083   }
1084}
1085
1086static
1087X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
1088                                      Bool*        needInvert )
1089{
1090   vassert(cond >= X86CondO && cond <= X86CondNLE);
1091   if (cond & 1) {
1092      *needInvert = True;
1093      return cond-1;
1094   } else {
1095      *needInvert = False;
1096      return cond;
1097   }
1098}
1099
1100
1101/* -------------- Helpers for ADD/SUB with carry. -------------- */
1102
1103/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1104   appropriately.
1105
1106   Optionally, generate a store for the 'tres' value.  This can either
1107   be a normal store, or it can be a cas-with-possible-failure style
1108   store:
1109
1110   if taddr is IRTemp_INVALID, then no store is generated.
1111
1112   if taddr is not IRTemp_INVALID, then a store (using taddr as
1113   the address) is generated:
1114
1115     if texpVal is IRTemp_INVALID then a normal store is
1116     generated, and restart_point must be zero (it is irrelevant).
1117
1118     if texpVal is not IRTemp_INVALID then a cas-style store is
1119     generated.  texpVal is the expected value, restart_point
1120     is the restart point if the store fails, and texpVal must
1121     have the same type as tres.
1122*/
1123static void helper_ADC ( Int sz,
1124                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1125                         /* info about optional store: */
1126                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1127{
1128   UInt    thunkOp;
1129   IRType  ty    = szToITy(sz);
1130   IRTemp  oldc  = newTemp(Ity_I32);
1131   IRTemp  oldcn = newTemp(ty);
1132   IROp    plus  = mkSizedOp(ty, Iop_Add8);
1133   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1134
1135   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1136   vassert(sz == 1 || sz == 2 || sz == 4);
1137   thunkOp = sz==4 ? X86G_CC_OP_ADCL
1138                   : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
1139
1140   /* oldc = old carry flag, 0 or 1 */
1141   assign( oldc,  binop(Iop_And32,
1142                        mk_x86g_calculate_eflags_c(),
1143                        mkU32(1)) );
1144
1145   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1146
1147   assign( tres, binop(plus,
1148                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
1149                       mkexpr(oldcn)) );
1150
1151   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1152      start of this function. */
1153   if (taddr != IRTemp_INVALID) {
1154      if (texpVal == IRTemp_INVALID) {
1155         vassert(restart_point == 0);
1156         storeLE( mkexpr(taddr), mkexpr(tres) );
1157      } else {
1158         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1159         /* .. and hence 'texpVal' has the same type as 'tres'. */
1160         casLE( mkexpr(taddr),
1161                mkexpr(texpVal), mkexpr(tres), restart_point );
1162      }
1163   }
1164
1165   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1166   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
1167   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1168                                                         mkexpr(oldcn)) )) );
1169   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1170}
1171
1172
1173/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
1174   appropriately.  As with helper_ADC, possibly generate a store of
1175   the result -- see comments on helper_ADC for details.
1176*/
1177static void helper_SBB ( Int sz,
1178                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1179                         /* info about optional store: */
1180                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1181{
1182   UInt    thunkOp;
1183   IRType  ty    = szToITy(sz);
1184   IRTemp  oldc  = newTemp(Ity_I32);
1185   IRTemp  oldcn = newTemp(ty);
1186   IROp    minus = mkSizedOp(ty, Iop_Sub8);
1187   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1188
1189   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1190   vassert(sz == 1 || sz == 2 || sz == 4);
1191   thunkOp = sz==4 ? X86G_CC_OP_SBBL
1192                   : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
1193
1194   /* oldc = old carry flag, 0 or 1 */
1195   assign( oldc, binop(Iop_And32,
1196                       mk_x86g_calculate_eflags_c(),
1197                       mkU32(1)) );
1198
1199   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1200
1201   assign( tres, binop(minus,
1202                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
1203                       mkexpr(oldcn)) );
1204
1205   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1206      start of this function. */
1207   if (taddr != IRTemp_INVALID) {
1208      if (texpVal == IRTemp_INVALID) {
1209         vassert(restart_point == 0);
1210         storeLE( mkexpr(taddr), mkexpr(tres) );
1211      } else {
1212         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1213         /* .. and hence 'texpVal' has the same type as 'tres'. */
1214         casLE( mkexpr(taddr),
1215                mkexpr(texpVal), mkexpr(tres), restart_point );
1216      }
1217   }
1218
1219   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1220   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
1221   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1222                                                         mkexpr(oldcn)) )) );
1223   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1224}
1225
1226
1227/* -------------- Helpers for disassembly printing. -------------- */
1228
1229static const HChar* nameGrp1 ( Int opc_aux )
1230{
1231   static const HChar* grp1_names[8]
1232     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
1233   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
1234   return grp1_names[opc_aux];
1235}
1236
1237static const HChar* nameGrp2 ( Int opc_aux )
1238{
1239   static const HChar* grp2_names[8]
1240     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
1241   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
1242   return grp2_names[opc_aux];
1243}
1244
1245static const HChar* nameGrp4 ( Int opc_aux )
1246{
1247   static const HChar* grp4_names[8]
1248     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
1249   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
1250   return grp4_names[opc_aux];
1251}
1252
1253static const HChar* nameGrp5 ( Int opc_aux )
1254{
1255   static const HChar* grp5_names[8]
1256     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
1257   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
1258   return grp5_names[opc_aux];
1259}
1260
1261static const HChar* nameGrp8 ( Int opc_aux )
1262{
1263   static const HChar* grp8_names[8]
1264     = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
1265   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
1266   return grp8_names[opc_aux];
1267}
1268
1269static const HChar* nameIReg ( Int size, Int reg )
1270{
1271   static const HChar* ireg32_names[8]
1272     = { "%eax", "%ecx", "%edx", "%ebx",
1273         "%esp", "%ebp", "%esi", "%edi" };
1274   static const HChar* ireg16_names[8]
1275     = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
1276   static const HChar* ireg8_names[8]
1277     = { "%al", "%cl", "%dl", "%bl",
1278         "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
1279   if (reg < 0 || reg > 7) goto bad;
1280   switch (size) {
1281      case 4: return ireg32_names[reg];
1282      case 2: return ireg16_names[reg];
1283      case 1: return ireg8_names[reg];
1284   }
1285  bad:
1286   vpanic("nameIReg(X86)");
1287   return NULL; /*notreached*/
1288}
1289
1290static const HChar* nameSReg ( UInt sreg )
1291{
1292   switch (sreg) {
1293      case R_ES: return "%es";
1294      case R_CS: return "%cs";
1295      case R_SS: return "%ss";
1296      case R_DS: return "%ds";
1297      case R_FS: return "%fs";
1298      case R_GS: return "%gs";
1299      default: vpanic("nameSReg(x86)");
1300   }
1301}
1302
1303static const HChar* nameMMXReg ( Int mmxreg )
1304{
1305   static const HChar* mmx_names[8]
1306     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
1307   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
1308   return mmx_names[mmxreg];
1309}
1310
1311static const HChar* nameXMMReg ( Int xmmreg )
1312{
1313   static const HChar* xmm_names[8]
1314     = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
1315         "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
1316   if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
1317   return xmm_names[xmmreg];
1318}
1319
1320static const HChar* nameMMXGran ( Int gran )
1321{
1322   switch (gran) {
1323      case 0: return "b";
1324      case 1: return "w";
1325      case 2: return "d";
1326      case 3: return "q";
1327      default: vpanic("nameMMXGran(x86,guest)");
1328   }
1329}
1330
1331static HChar nameISize ( Int size )
1332{
1333   switch (size) {
1334      case 4: return 'l';
1335      case 2: return 'w';
1336      case 1: return 'b';
1337      default: vpanic("nameISize(x86)");
1338   }
1339}
1340
1341
1342/*------------------------------------------------------------*/
1343/*--- JMP helpers                                          ---*/
1344/*------------------------------------------------------------*/
1345
1346static void jmp_lit( /*MOD*/DisResult* dres,
1347                     IRJumpKind kind, Addr32 d32 )
1348{
1349   vassert(dres->whatNext    == Dis_Continue);
1350   vassert(dres->len         == 0);
1351   vassert(dres->continueAt  == 0);
1352   vassert(dres->jk_StopHere == Ijk_INVALID);
1353   dres->whatNext    = Dis_StopHere;
1354   dres->jk_StopHere = kind;
1355   stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
1356}
1357
1358static void jmp_treg( /*MOD*/DisResult* dres,
1359                      IRJumpKind kind, IRTemp t )
1360{
1361   vassert(dres->whatNext    == Dis_Continue);
1362   vassert(dres->len         == 0);
1363   vassert(dres->continueAt  == 0);
1364   vassert(dres->jk_StopHere == Ijk_INVALID);
1365   dres->whatNext    = Dis_StopHere;
1366   dres->jk_StopHere = kind;
1367   stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
1368}
1369
1370static
1371void jcc_01( /*MOD*/DisResult* dres,
1372             X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
1373{
1374   Bool        invert;
1375   X86Condcode condPos;
1376   vassert(dres->whatNext    == Dis_Continue);
1377   vassert(dres->len         == 0);
1378   vassert(dres->continueAt  == 0);
1379   vassert(dres->jk_StopHere == Ijk_INVALID);
1380   dres->whatNext    = Dis_StopHere;
1381   dres->jk_StopHere = Ijk_Boring;
1382   condPos = positiveIse_X86Condcode ( cond, &invert );
1383   if (invert) {
1384      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1385                         Ijk_Boring,
1386                         IRConst_U32(d32_false),
1387                         OFFB_EIP ) );
1388      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
1389   } else {
1390      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1391                         Ijk_Boring,
1392                         IRConst_U32(d32_true),
1393                         OFFB_EIP ) );
1394      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
1395   }
1396}
1397
1398
1399/*------------------------------------------------------------*/
1400/*--- Disassembling addressing modes                       ---*/
1401/*------------------------------------------------------------*/
1402
1403static
1404const HChar* sorbTxt ( UChar sorb )
1405{
1406   switch (sorb) {
1407      case 0:    return ""; /* no override */
1408      case 0x3E: return "%ds";
1409      case 0x26: return "%es:";
1410      case 0x64: return "%fs:";
1411      case 0x65: return "%gs:";
1412      default: vpanic("sorbTxt(x86,guest)");
1413   }
1414}
1415
1416
1417/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
1418   linear address by adding any required segment override as indicated
1419   by sorb. */
1420static
1421IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
1422{
1423   Int    sreg;
1424   IRType hWordTy;
1425   IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
1426
1427   if (sorb == 0)
1428      /* the common case - no override */
1429      return virtual;
1430
1431   switch (sorb) {
1432      case 0x3E: sreg = R_DS; break;
1433      case 0x26: sreg = R_ES; break;
1434      case 0x64: sreg = R_FS; break;
1435      case 0x65: sreg = R_GS; break;
1436      default: vpanic("handleSegOverride(x86,guest)");
1437   }
1438
1439   hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
1440
1441   seg_selector = newTemp(Ity_I32);
1442   ldt_ptr      = newTemp(hWordTy);
1443   gdt_ptr      = newTemp(hWordTy);
1444   r64          = newTemp(Ity_I64);
1445
1446   assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
1447   assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
1448   assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
1449
1450   /*
1451   Call this to do the translation and limit checks:
1452   ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
1453                                 UInt seg_selector, UInt virtual_addr )
1454   */
1455   assign(
1456      r64,
1457      mkIRExprCCall(
1458         Ity_I64,
1459         0/*regparms*/,
1460         "x86g_use_seg_selector",
1461         &x86g_use_seg_selector,
1462         mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
1463                        mkexpr(seg_selector), virtual)
1464      )
1465   );
1466
1467   /* If the high 32 of the result are non-zero, there was a
1468      failure in address translation.  In which case, make a
1469      quick exit.
1470   */
1471   stmt(
1472      IRStmt_Exit(
1473         binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
1474         Ijk_MapFail,
1475         IRConst_U32( guest_EIP_curr_instr ),
1476         OFFB_EIP
1477      )
1478   );
1479
1480   /* otherwise, here's the translated result. */
1481   return unop(Iop_64to32, mkexpr(r64));
1482}
1483
1484
1485/* Generate IR to calculate an address indicated by a ModRM and
1486   following SIB bytes.  The expression, and the number of bytes in
1487   the address mode, are returned.  Note that this fn should not be
1488   called if the R/M part of the address denotes a register instead of
1489   memory.  If print_codegen is true, text of the addressing mode is
1490   placed in buf.
1491
1492   The computed address is stored in a new tempreg, and the
1493   identity of the tempreg is returned.  */
1494
1495static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
1496{
1497   IRTemp tmp = newTemp(Ity_I32);
1498   assign( tmp, addr32 );
1499   return tmp;
1500}
1501
1502static
1503IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
1504{
1505   UChar mod_reg_rm = getIByte(delta);
1506   delta++;
1507
1508   buf[0] = (UChar)0;
1509
1510   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1511      jump table seems a bit excessive.
1512   */
1513   mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
1514   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1515                                            /* is now XX0XXYYY */
1516   mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
1517   switch (mod_reg_rm) {
1518
1519      /* (%eax) .. (%edi), not including (%esp) or (%ebp).
1520         --> GET %reg, t
1521      */
1522      case 0x00: case 0x01: case 0x02: case 0x03:
1523      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1524         { UChar rm = mod_reg_rm;
1525           DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
1526           *len = 1;
1527           return disAMode_copy2tmp(
1528                  handleSegOverride(sorb, getIReg(4,rm)));
1529         }
1530
1531      /* d8(%eax) ... d8(%edi), not including d8(%esp)
1532         --> GET %reg, t ; ADDL d8, t
1533      */
1534      case 0x08: case 0x09: case 0x0A: case 0x0B:
1535      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1536         { UChar rm = toUChar(mod_reg_rm & 7);
1537           UInt  d  = getSDisp8(delta);
1538           DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1539           *len = 2;
1540           return disAMode_copy2tmp(
1541                  handleSegOverride(sorb,
1542                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1543         }
1544
1545      /* d32(%eax) ... d32(%edi), not including d32(%esp)
1546         --> GET %reg, t ; ADDL d8, t
1547      */
1548      case 0x10: case 0x11: case 0x12: case 0x13:
1549      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1550         { UChar rm = toUChar(mod_reg_rm & 7);
1551           UInt  d  = getUDisp32(delta);
1552           DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), d, nameIReg(4,rm));
1553           *len = 5;
1554           return disAMode_copy2tmp(
1555                  handleSegOverride(sorb,
1556                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1557         }
1558
1559      /* a register, %eax .. %edi.  This shouldn't happen. */
1560      case 0x18: case 0x19: case 0x1A: case 0x1B:
1561      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1562         vpanic("disAMode(x86): not an addr!");
1563
1564      /* a 32-bit literal address
1565         --> MOV d32, tmp
1566      */
1567      case 0x05:
1568         { UInt d = getUDisp32(delta);
1569           *len = 5;
1570           DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
1571           return disAMode_copy2tmp(
1572                     handleSegOverride(sorb, mkU32(d)));
1573         }
1574
1575      case 0x04: {
1576         /* SIB, with no displacement.  Special cases:
1577            -- %esp cannot act as an index value.
1578               If index_r indicates %esp, zero is used for the index.
1579            -- when mod is zero and base indicates EBP, base is instead
1580               a 32-bit literal.
1581            It's all madness, I tell you.  Extract %index, %base and
1582            scale from the SIB byte.  The value denoted is then:
1583               | %index == %ESP && %base == %EBP
1584               = d32 following SIB byte
1585               | %index == %ESP && %base != %EBP
1586               = %base
1587               | %index != %ESP && %base == %EBP
1588               = d32 following SIB byte + (%index << scale)
1589               | %index != %ESP && %base != %ESP
1590               = %base + (%index << scale)
1591
1592            What happens to the souls of CPU architects who dream up such
1593            horrendous schemes, do you suppose?
1594         */
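         /* Illustrative example (not taken from any particular guest):
            a SIB byte of 0x91 decodes as scale=2, index=%edx, base=%ecx,
            so (index != %esp, base != %ebp) the amode denotes
            %ecx + (%edx << 2), printed as (%ecx,%edx,4). */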
1595         UChar sib     = getIByte(delta);
1596         UChar scale   = toUChar((sib >> 6) & 3);
1597         UChar index_r = toUChar((sib >> 3) & 7);
1598         UChar base_r  = toUChar(sib & 7);
1599         delta++;
1600
1601         if (index_r != R_ESP && base_r != R_EBP) {
1602            DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
1603                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1604            *len = 2;
1605            return
1606               disAMode_copy2tmp(
1607               handleSegOverride(sorb,
1608                  binop(Iop_Add32,
1609                        getIReg(4,base_r),
1610                        binop(Iop_Shl32, getIReg(4,index_r),
1611                              mkU8(scale)))));
1612         }
1613
1614         if (index_r != R_ESP && base_r == R_EBP) {
1615            UInt d = getUDisp32(delta);
1616            DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
1617                      nameIReg(4,index_r), 1<<scale);
1618            *len = 6;
1619            return
1620               disAMode_copy2tmp(
1621               handleSegOverride(sorb,
1622                  binop(Iop_Add32,
1623                        binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
1624                        mkU32(d))));
1625         }
1626
1627         if (index_r == R_ESP && base_r != R_EBP) {
1628            DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
1629            *len = 2;
1630            return disAMode_copy2tmp(
1631                   handleSegOverride(sorb, getIReg(4,base_r)));
1632         }
1633
1634         if (index_r == R_ESP && base_r == R_EBP) {
1635            UInt d = getUDisp32(delta);
1636            DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
1637            *len = 6;
1638            return disAMode_copy2tmp(
1639                   handleSegOverride(sorb, mkU32(d)));
1640         }
1641         /*NOTREACHED*/
1642         vassert(0);
1643      }
1644
1645      /* SIB, with 8-bit displacement.  Special cases:
1646         -- %esp cannot act as an index value.
1647            If index_r indicates %esp, zero is used for the index.
1648         Denoted value is:
1649            | %index == %ESP
1650            = d8 + %base
1651            | %index != %ESP
1652            = d8 + %base + (%index << scale)
1653      */
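      /* Illustrative example: modrm 0x4C with SIB byte 0x4E (scale=1,
         index=%ecx, base=%esi) and d8 = 16 denotes
         16 + %esi + (%ecx << 1), i.e. 16(%esi,%ecx,2). */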
1654      case 0x0C: {
1655         UChar sib     = getIByte(delta);
1656         UChar scale   = toUChar((sib >> 6) & 3);
1657         UChar index_r = toUChar((sib >> 3) & 7);
1658         UChar base_r  = toUChar(sib & 7);
1659         UInt  d       = getSDisp8(delta+1);
1660
1661         if (index_r == R_ESP) {
1662            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1663                                   (Int)d, nameIReg(4,base_r));
1664            *len = 3;
1665            return disAMode_copy2tmp(
1666                   handleSegOverride(sorb,
1667                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1668         } else {
1669            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1670                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1671            *len = 3;
1672            return
1673                disAMode_copy2tmp(
1674                handleSegOverride(sorb,
1675                  binop(Iop_Add32,
1676                        binop(Iop_Add32,
1677                              getIReg(4,base_r),
1678                              binop(Iop_Shl32,
1679                                    getIReg(4,index_r), mkU8(scale))),
1680                        mkU32(d))));
1681         }
1682         /*NOTREACHED*/
1683         vassert(0);
1684      }
1685
1686      /* SIB, with 32-bit displacement.  Special cases:
1687         -- %esp cannot act as an index value.
1688            If index_r indicates %esp, zero is used for the index.
1689         Denoted value is:
1690            | %index == %ESP
1691            = d32 + %base
1692            | %index != %ESP
1693            = d32 + %base + (%index << scale)
1694      */
1695      case 0x14: {
1696         UChar sib     = getIByte(delta);
1697         UChar scale   = toUChar((sib >> 6) & 3);
1698         UChar index_r = toUChar((sib >> 3) & 7);
1699         UChar base_r  = toUChar(sib & 7);
1700         UInt d        = getUDisp32(delta+1);
1701
1702         if (index_r == R_ESP) {
1703            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1704                                   (Int)d, nameIReg(4,base_r));
1705            *len = 6;
1706            return disAMode_copy2tmp(
1707                   handleSegOverride(sorb,
1708                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1709         } else {
1710            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1711                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1712            *len = 6;
1713            return
1714                disAMode_copy2tmp(
1715                handleSegOverride(sorb,
1716                  binop(Iop_Add32,
1717                        binop(Iop_Add32,
1718                              getIReg(4,base_r),
1719                              binop(Iop_Shl32,
1720                                    getIReg(4,index_r), mkU8(scale))),
1721                        mkU32(d))));
1722         }
1723         /*NOTREACHED*/
1724         vassert(0);
1725      }
1726
1727      default:
1728         vpanic("disAMode(x86)");
1729         return 0; /*notreached*/
1730   }
1731}
1732
1733
1734/* Figure out the number of (insn-stream) bytes constituting the amode
1735   beginning at delta.  Is useful for getting hold of literals beyond
1736   the end of the amode before it has been disassembled.  */
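/* E.g. (illustration): an amode of the form d32(%ebx) (mod=10, rm=011)
   occupies 5 bytes: the modRM byte plus a 4-byte displacement. */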
1737
1738static UInt lengthAMode ( Int delta )
1739{
1740   UChar mod_reg_rm = getIByte(delta); delta++;
1741
1742   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1743      jump table seems a bit excessive.
1744   */
1745   mod_reg_rm &= 0xC7;               /* is now XX000YYY */
1746   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1747                                     /* is now XX0XXYYY */
1748   mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
1749   switch (mod_reg_rm) {
1750
1751      /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
1752      case 0x00: case 0x01: case 0x02: case 0x03:
1753      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1754         return 1;
1755
1756      /* d8(%eax) ... d8(%edi), not including d8(%esp). */
1757      case 0x08: case 0x09: case 0x0A: case 0x0B:
1758      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1759         return 2;
1760
1761      /* d32(%eax) ... d32(%edi), not including d32(%esp). */
1762      case 0x10: case 0x11: case 0x12: case 0x13:
1763      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1764         return 5;
1765
1766      /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
1767      case 0x18: case 0x19: case 0x1A: case 0x1B:
1768      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1769         return 1;
1770
1771      /* a 32-bit literal address. */
1772      case 0x05: return 5;
1773
1774      /* SIB, no displacement.  */
1775      case 0x04: {
1776         UChar sib    = getIByte(delta);
1777         UChar base_r = toUChar(sib & 7);
1778         if (base_r == R_EBP) return 6; else return 2;
1779      }
1780      /* SIB, with 8-bit displacement.  */
1781      case 0x0C: return 3;
1782
1783      /* SIB, with 32-bit displacement.  */
1784      case 0x14: return 6;
1785
1786      default:
1787         vpanic("lengthAMode");
1788         return 0; /*notreached*/
1789   }
1790}
1791
1792/*------------------------------------------------------------*/
1793/*--- Disassembling common idioms                          ---*/
1794/*------------------------------------------------------------*/
1795
1796/* Handle binary integer instructions of the form
1797      op E, G  meaning
1798      op reg-or-mem, reg
1799   Is passed the offset of the modRM byte, the actual operation, and the
1800   data size.  Returns the delta advanced completely over this
1801   instruction.
1802
1803   E(src) is reg-or-mem
1804   G(dst) is reg.
1805
1806   If E is reg, -->    GET %G,  tmp
1807                       OP %E,   tmp
1808                       PUT tmp, %G
1809
1810   If E is mem and OP is not reversible,
1811                -->    (getAddr E) -> tmpa
1812                       LD (tmpa), tmpa
1813                       GET %G, tmp2
1814                       OP tmpa, tmp2
1815                       PUT tmp2, %G
1816
1817   If E is mem and OP is reversible
1818                -->    (getAddr E) -> tmpa
1819                       LD (tmpa), tmpa
1820                       OP %G, tmpa
1821                       PUT tmpa, %G
1822*/
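/* Illustrative sketch only: for "addl (%esi),%ecx" (E in memory) the
   generated IR is roughly
      tmpa = GET %esi ; src = LDle:I32(tmpa) ; dst0 = GET %ecx ;
      dst1 = Add32(dst0,src) ; set flags thunk from dst0/src ;
      PUT dst1, %ecx
*/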
1823static
1824UInt dis_op2_E_G ( UChar       sorb,
1825                   Bool        addSubCarry,
1826                   IROp        op8,
1827                   Bool        keep,
1828                   Int         size,
1829                   Int         delta0,
1830                   const HChar* t_x86opc )
1831{
1832   HChar   dis_buf[50];
1833   Int     len;
1834   IRType  ty   = szToITy(size);
1835   IRTemp  dst1 = newTemp(ty);
1836   IRTemp  src  = newTemp(ty);
1837   IRTemp  dst0 = newTemp(ty);
1838   UChar   rm   = getUChar(delta0);
1839   IRTemp  addr = IRTemp_INVALID;
1840
1841   /* addSubCarry == True indicates the intended operation is
1842      add-with-carry or subtract-with-borrow. */
1843   if (addSubCarry) {
1844      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1845      vassert(keep);
1846   }
1847
1848   if (epartIsReg(rm)) {
1849      /* Specially handle XOR reg,reg, because that doesn't really
1850         depend on reg, and doing the obvious thing potentially
1851         generates a spurious value check failure due to the bogus
1852         dependency.  Ditto SBB reg,reg. */
1853      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1854          && gregOfRM(rm) == eregOfRM(rm)) {
1855         putIReg(size, gregOfRM(rm), mkU(ty,0));
1856      }
1857      assign( dst0, getIReg(size,gregOfRM(rm)) );
1858      assign( src,  getIReg(size,eregOfRM(rm)) );
1859
1860      if (addSubCarry && op8 == Iop_Add8) {
1861         helper_ADC( size, dst1, dst0, src,
1862                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1863         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1864      } else
1865      if (addSubCarry && op8 == Iop_Sub8) {
1866         helper_SBB( size, dst1, dst0, src,
1867                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1868         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1869      } else {
1870         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1871         if (isAddSub(op8))
1872            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1873         else
1874            setFlags_DEP1(op8, dst1, ty);
1875         if (keep)
1876            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1877      }
1878
1879      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1880                          nameIReg(size,eregOfRM(rm)),
1881                          nameIReg(size,gregOfRM(rm)));
1882      return 1+delta0;
1883   } else {
1884      /* E refers to memory */
1885      addr = disAMode ( &len, sorb, delta0, dis_buf);
1886      assign( dst0, getIReg(size,gregOfRM(rm)) );
1887      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
1888
1889      if (addSubCarry && op8 == Iop_Add8) {
1890         helper_ADC( size, dst1, dst0, src,
1891                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1892         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1893      } else
1894      if (addSubCarry && op8 == Iop_Sub8) {
1895         helper_SBB( size, dst1, dst0, src,
1896                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1897         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1898      } else {
1899         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1900         if (isAddSub(op8))
1901            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1902         else
1903            setFlags_DEP1(op8, dst1, ty);
1904         if (keep)
1905            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1906      }
1907
1908      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1909                          dis_buf,nameIReg(size,gregOfRM(rm)));
1910      return len+delta0;
1911   }
1912}
1913
1914
1915
1916/* Handle binary integer instructions of the form
1917      op G, E  meaning
1918      op reg, reg-or-mem
1919   Is passed the offset of the modRM byte, the actual operation, and the
1920   data size.  Returns the delta advanced completely over this
1921   instruction.
1922
1923   G(src) is reg.
1924   E(dst) is reg-or-mem
1925
1926   If E is reg, -->    GET %E,  tmp
1927                       OP %G,   tmp
1928                       PUT tmp, %E
1929
1930   If E is mem, -->    (getAddr E) -> tmpa
1931                       LD (tmpa), tmpv
1932                       OP %G, tmpv
1933                       ST tmpv, (tmpa)
1934*/
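/* Illustrative sketch only: for "addl %ecx,(%esi)" (E in memory) the
   generated IR is roughly
      tmpa = GET %esi ; dst0 = LDle:I32(tmpa) ; src = GET %ecx ;
      dst1 = Add32(dst0,src) ; ST dst1, (tmpa) ;
      set flags thunk from dst0/src
*/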
1935static
1936UInt dis_op2_G_E ( UChar       sorb,
1937                   Bool        locked,
1938                   Bool        addSubCarry,
1939                   IROp        op8,
1940                   Bool        keep,
1941                   Int         size,
1942                   Int         delta0,
1943                   const HChar* t_x86opc )
1944{
1945   HChar   dis_buf[50];
1946   Int     len;
1947   IRType  ty   = szToITy(size);
1948   IRTemp  dst1 = newTemp(ty);
1949   IRTemp  src  = newTemp(ty);
1950   IRTemp  dst0 = newTemp(ty);
1951   UChar   rm   = getIByte(delta0);
1952   IRTemp  addr = IRTemp_INVALID;
1953
1954   /* addSubCarry == True indicates the intended operation is
1955      add-with-carry or subtract-with-borrow. */
1956   if (addSubCarry) {
1957      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1958      vassert(keep);
1959   }
1960
1961   if (epartIsReg(rm)) {
1962      /* Specially handle XOR reg,reg, because that doesn't really
1963         depend on reg, and doing the obvious thing potentially
1964         generates a spurious value check failure due to the bogus
1965         dependency.  Ditto SBB reg,reg.*/
1966      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1967          && gregOfRM(rm) == eregOfRM(rm)) {
1968         putIReg(size, eregOfRM(rm), mkU(ty,0));
1969      }
1970      assign(dst0, getIReg(size,eregOfRM(rm)));
1971      assign(src,  getIReg(size,gregOfRM(rm)));
1972
1973      if (addSubCarry && op8 == Iop_Add8) {
1974         helper_ADC( size, dst1, dst0, src,
1975                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1976         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1977      } else
1978      if (addSubCarry && op8 == Iop_Sub8) {
1979         helper_SBB( size, dst1, dst0, src,
1980                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1981         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1982      } else {
1983         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
1984         if (isAddSub(op8))
1985            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1986         else
1987            setFlags_DEP1(op8, dst1, ty);
1988         if (keep)
1989            putIReg(size, eregOfRM(rm), mkexpr(dst1));
1990      }
1991
1992      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1993                          nameIReg(size,gregOfRM(rm)),
1994                          nameIReg(size,eregOfRM(rm)));
1995      return 1+delta0;
1996   }
1997
1998   /* E refers to memory */
1999   {
2000      addr = disAMode ( &len, sorb, delta0, dis_buf);
2001      assign(dst0, loadLE(ty,mkexpr(addr)));
2002      assign(src,  getIReg(size,gregOfRM(rm)));
2003
2004      if (addSubCarry && op8 == Iop_Add8) {
2005         if (locked) {
2006            /* cas-style store */
2007            helper_ADC( size, dst1, dst0, src,
2008                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2009         } else {
2010            /* normal store */
2011            helper_ADC( size, dst1, dst0, src,
2012                        /*store*/addr, IRTemp_INVALID, 0 );
2013         }
2014      } else
2015      if (addSubCarry && op8 == Iop_Sub8) {
2016         if (locked) {
2017            /* cas-style store */
2018            helper_SBB( size, dst1, dst0, src,
2019                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2020         } else {
2021            /* normal store */
2022            helper_SBB( size, dst1, dst0, src,
2023                        /*store*/addr, IRTemp_INVALID, 0 );
2024         }
2025      } else {
2026         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2027         if (keep) {
2028            if (locked) {
2029               if (0) vex_printf("locked case\n" );
2030               casLE( mkexpr(addr),
2031                      mkexpr(dst0)/*expval*/,
2032                      mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
2033            } else {
2034               if (0) vex_printf("nonlocked case\n");
2035               storeLE(mkexpr(addr), mkexpr(dst1));
2036            }
2037         }
2038         if (isAddSub(op8))
2039            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2040         else
2041            setFlags_DEP1(op8, dst1, ty);
2042      }
2043
2044      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2045                          nameIReg(size,gregOfRM(rm)), dis_buf);
2046      return len+delta0;
2047   }
2048}
2049
2050
2051/* Handle move instructions of the form
2052      mov E, G  meaning
2053      mov reg-or-mem, reg
2054   Is passed the offset of the modRM byte, and the data size.  Returns
2055   the delta advanced completely over this instruction.
2056
2057   E(src) is reg-or-mem
2058   G(dst) is reg.
2059
2060   If E is reg, -->    GET %E,  tmpv
2061                       PUT tmpv, %G
2062
2063   If E is mem  -->    (getAddr E) -> tmpa
2064                       LD (tmpa), tmpb
2065                       PUT tmpb, %G
2066*/
2067static
2068UInt dis_mov_E_G ( UChar       sorb,
2069                   Int         size,
2070                   Int         delta0 )
2071{
2072   Int len;
2073   UChar rm = getIByte(delta0);
2074   HChar dis_buf[50];
2075
2076   if (epartIsReg(rm)) {
2077      putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
2078      DIP("mov%c %s,%s\n", nameISize(size),
2079                           nameIReg(size,eregOfRM(rm)),
2080                           nameIReg(size,gregOfRM(rm)));
2081      return 1+delta0;
2082   }
2083
2084   /* E refers to memory */
2085   {
2086      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
2087      putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
2088      DIP("mov%c %s,%s\n", nameISize(size),
2089                           dis_buf,nameIReg(size,gregOfRM(rm)));
2090      return delta0+len;
2091   }
2092}
2093
2094
2095/* Handle move instructions of the form
2096      mov G, E  meaning
2097      mov reg, reg-or-mem
2098   Is passed the offset of the modRM byte, and the data size.  Returns
2099   the delta advanced completely over this instruction.
2100
2101   G(src) is reg.
2102   E(dst) is reg-or-mem
2103
2104   If E is reg, -->    GET %G,  tmp
2105                       PUT tmp, %E
2106
2107   If E is mem, -->    (getAddr E) -> tmpa
2108                       GET %G, tmpv
2109                       ST tmpv, (tmpa)
2110*/
2111static
2112UInt dis_mov_G_E ( UChar       sorb,
2113                   Int         size,
2114                   Int         delta0 )
2115{
2116   Int len;
2117   UChar rm = getIByte(delta0);
2118   HChar dis_buf[50];
2119
2120   if (epartIsReg(rm)) {
2121      putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
2122      DIP("mov%c %s,%s\n", nameISize(size),
2123                           nameIReg(size,gregOfRM(rm)),
2124                           nameIReg(size,eregOfRM(rm)));
2125      return 1+delta0;
2126   }
2127
2128   /* E refers to memory */
2129   {
2130      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
2131      storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
2132      DIP("mov%c %s,%s\n", nameISize(size),
2133                           nameIReg(size,gregOfRM(rm)), dis_buf);
2134      return len+delta0;
2135   }
2136}
2137
2138
2139/* op $immediate, AL/AX/EAX. */
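/* E.g. (illustration only): "addl $0x10,%eax" (opcode 0x05, imm32)
   arrives here with size=4, op8=Iop_Add8 and lit=0x10; the result of
   Add32(GET %eax, 0x10) is written back to %eax and the flags thunk is
   set from the two operands. */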
2140static
2141UInt dis_op_imm_A ( Int    size,
2142                    Bool   carrying,
2143                    IROp   op8,
2144                    Bool   keep,
2145                    Int    delta,
2146                    const HChar* t_x86opc )
2147{
2148   IRType ty   = szToITy(size);
2149   IRTemp dst0 = newTemp(ty);
2150   IRTemp src  = newTemp(ty);
2151   IRTemp dst1 = newTemp(ty);
2152   UInt lit    = getUDisp(size,delta);
2153   assign(dst0, getIReg(size,R_EAX));
2154   assign(src,  mkU(ty,lit));
2155
2156   if (isAddSub(op8) && !carrying) {
2157      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2158      setFlags_DEP1_DEP2(op8, dst0, src, ty);
2159   }
2160   else
2161   if (isLogic(op8)) {
2162      vassert(!carrying);
2163      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2164      setFlags_DEP1(op8, dst1, ty);
2165   }
2166   else
2167   if (op8 == Iop_Add8 && carrying) {
2168      helper_ADC( size, dst1, dst0, src,
2169                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2170   }
2171   else
2172   if (op8 == Iop_Sub8 && carrying) {
2173      helper_SBB( size, dst1, dst0, src,
2174                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2175   }
2176   else
2177      vpanic("dis_op_imm_A(x86,guest)");
2178
2179   if (keep)
2180      putIReg(size, R_EAX, mkexpr(dst1));
2181
2182   DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
2183                           lit, nameIReg(size,R_EAX));
2184   return delta+size;
2185}
2186
2187
2188/* Sign- and Zero-extending moves. */
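/* E.g. (illustration only): "movzbl %al,%ecx" comes through with
   szs=1, szd=4, sign_extend=False and becomes a PUT to %ecx of
   8Uto32(GET %al); the sign-extending forms (movsbl etc.) use the
   signed widening instead. */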
2189static
2190UInt dis_movx_E_G ( UChar      sorb,
2191                    Int delta, Int szs, Int szd, Bool sign_extend )
2192{
2193   UChar rm = getIByte(delta);
2194   if (epartIsReg(rm)) {
2195      if (szd == szs) {
2196         // mutant case.  See #250799
2197         putIReg(szd, gregOfRM(rm),
2198                           getIReg(szs,eregOfRM(rm)));
2199      } else {
2200         // normal case
2201         putIReg(szd, gregOfRM(rm),
2202                      unop(mkWidenOp(szs,szd,sign_extend),
2203                           getIReg(szs,eregOfRM(rm))));
2204      }
2205      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2206                               nameISize(szs), nameISize(szd),
2207                               nameIReg(szs,eregOfRM(rm)),
2208                               nameIReg(szd,gregOfRM(rm)));
2209      return 1+delta;
2210   }
2211
2212   /* E refers to memory */
2213   {
2214      Int    len;
2215      HChar  dis_buf[50];
2216      IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
2217      if (szd == szs) {
2218         // mutant case.  See #250799
2219         putIReg(szd, gregOfRM(rm),
2220                           loadLE(szToITy(szs),mkexpr(addr)));
2221      } else {
2222         // normal case
2223         putIReg(szd, gregOfRM(rm),
2224                      unop(mkWidenOp(szs,szd,sign_extend),
2225                           loadLE(szToITy(szs),mkexpr(addr))));
2226      }
2227      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2228                               nameISize(szs), nameISize(szd),
2229                               dis_buf, nameIReg(szd,gregOfRM(rm)));
2230      return len+delta;
2231   }
2232}
2233
2234
2235/* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
2236   16 / 8 bit quantity in the given IRTemp.  */
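/* Worked example (illustrative numbers only): for sz==4 with
   EDX:EAX = 0x0000000000000007 and a divisor of 2, DivModU64to32
   yields a 64-bit result whose low half (the quotient, 3) is written
   to %eax and whose high half (the remainder, 1) is written to %edx. */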
2237static
2238void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
2239{
2240   IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
2241   IRTemp src64 = newTemp(Ity_I64);
2242   IRTemp dst64 = newTemp(Ity_I64);
2243   switch (sz) {
2244      case 4:
2245         assign( src64, binop(Iop_32HLto64,
2246                              getIReg(4,R_EDX), getIReg(4,R_EAX)) );
2247         assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
2248         putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
2249         putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
2250         break;
2251      case 2: {
2252         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2253         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2254         assign( src64, unop(widen3264,
2255                             binop(Iop_16HLto32,
2256                                   getIReg(2,R_EDX), getIReg(2,R_EAX))) );
2257         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
2258         putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
2259         putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
2260         break;
2261      }
2262      case 1: {
2263         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2264         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2265         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
2266         assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
2267         assign( dst64,
2268                 binop(op, mkexpr(src64),
2269                           unop(widen1632, unop(widen816, mkexpr(t)))) );
2270         putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
2271                           unop(Iop_64to32,mkexpr(dst64)))) );
2272         putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
2273                           unop(Iop_64HIto32,mkexpr(dst64)))) );
2274         break;
2275      }
2276      default: vpanic("codegen_div(x86)");
2277   }
2278}
2279
2280
2281static
2282UInt dis_Grp1 ( UChar sorb, Bool locked,
2283                Int delta, UChar modrm,
2284                Int am_sz, Int d_sz, Int sz, UInt d32 )
2285{
2286   Int     len;
2287   HChar   dis_buf[50];
2288   IRType  ty   = szToITy(sz);
2289   IRTemp  dst1 = newTemp(ty);
2290   IRTemp  src  = newTemp(ty);
2291   IRTemp  dst0 = newTemp(ty);
2292   IRTemp  addr = IRTemp_INVALID;
2293   IROp    op8  = Iop_INVALID;
2294   UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
2295
2296   switch (gregOfRM(modrm)) {
2297      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
2298      case 2: break;  // ADC
2299      case 3: break;  // SBB
2300      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
2301      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
2302      /*NOTREACHED*/
2303      default: vpanic("dis_Grp1: unhandled case");
2304   }
2305
2306   if (epartIsReg(modrm)) {
2307      vassert(am_sz == 1);
2308
2309      assign(dst0, getIReg(sz,eregOfRM(modrm)));
2310      assign(src,  mkU(ty,d32 & mask));
2311
2312      if (gregOfRM(modrm) == 2 /* ADC */) {
2313         helper_ADC( sz, dst1, dst0, src,
2314                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2315      } else
2316      if (gregOfRM(modrm) == 3 /* SBB */) {
2317         helper_SBB( sz, dst1, dst0, src,
2318                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2319      } else {
2320         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2321         if (isAddSub(op8))
2322            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2323         else
2324            setFlags_DEP1(op8, dst1, ty);
2325      }
2326
2327      if (gregOfRM(modrm) < 7)
2328         putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2329
2330      delta += (am_sz + d_sz);
2331      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
2332                              nameIReg(sz,eregOfRM(modrm)));
2333   } else {
2334      addr = disAMode ( &len, sorb, delta, dis_buf);
2335
2336      assign(dst0, loadLE(ty,mkexpr(addr)));
2337      assign(src, mkU(ty,d32 & mask));
2338
2339      if (gregOfRM(modrm) == 2 /* ADC */) {
2340         if (locked) {
2341            /* cas-style store */
2342            helper_ADC( sz, dst1, dst0, src,
2343                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2344         } else {
2345            /* normal store */
2346            helper_ADC( sz, dst1, dst0, src,
2347                        /*store*/addr, IRTemp_INVALID, 0 );
2348         }
2349      } else
2350      if (gregOfRM(modrm) == 3 /* SBB */) {
2351         if (locked) {
2352            /* cas-style store */
2353            helper_SBB( sz, dst1, dst0, src,
2354                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2355         } else {
2356            /* normal store */
2357            helper_SBB( sz, dst1, dst0, src,
2358                        /*store*/addr, IRTemp_INVALID, 0 );
2359         }
2360      } else {
2361         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2362         if (gregOfRM(modrm) < 7) {
2363            if (locked) {
2364               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
2365                                    mkexpr(dst1)/*newVal*/,
2366                                    guest_EIP_curr_instr );
2367            } else {
2368               storeLE(mkexpr(addr), mkexpr(dst1));
2369            }
2370         }
2371         if (isAddSub(op8))
2372            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2373         else
2374            setFlags_DEP1(op8, dst1, ty);
2375      }
2376
2377      delta += (len+d_sz);
2378      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
2379                              d32, dis_buf);
2380   }
2381   return delta;
2382}
2383
2384
2385/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
2386   expression. */
2387
2388static
2389UInt dis_Grp2 ( UChar sorb,
2390                Int delta, UChar modrm,
2391                Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
2392                const HChar* shift_expr_txt, Bool* decode_OK )
2393{
2394   /* delta on entry points at the modrm byte. */
2395   HChar  dis_buf[50];
2396   Int    len;
2397   Bool   isShift, isRotate, isRotateC;
2398   IRType ty    = szToITy(sz);
2399   IRTemp dst0  = newTemp(ty);
2400   IRTemp dst1  = newTemp(ty);
2401   IRTemp addr  = IRTemp_INVALID;
2402
2403   *decode_OK = True;
2404
2405   vassert(sz == 1 || sz == 2 || sz == 4);
2406
2407   /* Put value to shift/rotate in dst0. */
2408   if (epartIsReg(modrm)) {
2409      assign(dst0, getIReg(sz, eregOfRM(modrm)));
2410      delta += (am_sz + d_sz);
2411   } else {
2412      addr = disAMode ( &len, sorb, delta, dis_buf);
2413      assign(dst0, loadLE(ty,mkexpr(addr)));
2414      delta += len + d_sz;
2415   }
2416
2417   isShift = False;
2418   switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
2419
2420   isRotate = False;
2421   switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
2422
2423   isRotateC = False;
2424   switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
2425
2426   if (!isShift && !isRotate && !isRotateC) {
2427      /*NOTREACHED*/
2428      vpanic("dis_Grp2(Reg): unhandled case(x86)");
2429   }
2430
2431   if (isRotateC) {
2432      /* call a helper; these insns are so ridiculous they do not
2433         deserve better */
2434      Bool     left = toBool(gregOfRM(modrm) == 2);
2435      IRTemp   r64  = newTemp(Ity_I64);
2436      IRExpr** args
2437         = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
2438                          widenUto32(shift_expr),   /* rotate amount */
2439                          widenUto32(mk_x86g_calculate_eflags_all()),
2440                          mkU32(sz) );
2441      assign( r64, mkIRExprCCall(
2442                      Ity_I64,
2443                      0/*regparm*/,
2444                      left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
2445                      left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
2446                      args
2447                   )
2448            );
2449      /* new eflags in hi half r64; new value in lo half r64 */
2450      assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
2451      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2452      stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
2453      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2454      /* Set NDEP even though it isn't used.  This makes redundant-PUT
2455         elimination of previous stores to this field work better. */
2456      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2457   }
2458
2459   if (isShift) {
2460
2461      IRTemp pre32     = newTemp(Ity_I32);
2462      IRTemp res32     = newTemp(Ity_I32);
2463      IRTemp res32ss   = newTemp(Ity_I32);
2464      IRTemp shift_amt = newTemp(Ity_I8);
2465      IROp   op32;
2466
2467      switch (gregOfRM(modrm)) {
2468         case 4: op32 = Iop_Shl32; break;
2469         case 5: op32 = Iop_Shr32; break;
2470         case 6: op32 = Iop_Shl32; break;
2471         case 7: op32 = Iop_Sar32; break;
2472         /*NOTREACHED*/
2473         default: vpanic("dis_Grp2:shift"); break;
2474      }
2475
2476      /* Widen the value to be shifted to 32 bits, do the shift, and
2477         narrow back down.  This seems surprisingly long-winded, but
2478         unfortunately the Intel semantics requires that 8/16-bit
2479         shifts give defined results for shift values all the way up
2480         to 31, and this seems the simplest way to do it.  It has the
2481         advantage that the only IR level shifts generated are of 32
2482         bit values, and the shift amount is guaranteed to be in the
2483         range 0 .. 31, thereby observing the IR semantics requiring
2484         all shift values to be in the range 0 .. 2^word_size-1. */
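      /* For instance (illustration): "shrb $1, %al" arrives with sz==1;
         %al is zero-widened to 32 bits, shifted right by 1 & 31 = 1,
         the flags thunk is built from that 32-bit result and the
         "shifted one bit less" value, and only the low 8 bits are
         written back. */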
2485
2486      /* shift_amt = shift_expr & 31, regardless of operation size */
2487      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
2488
2489      /* suitably widen the value to be shifted to 32 bits. */
2490      assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
2491                                     : widenUto32(mkexpr(dst0)) );
2492
2493      /* res32 = pre32 `shift` shift_amt */
2494      assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
2495
2496      /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
2497      assign( res32ss,
2498              binop(op32,
2499                    mkexpr(pre32),
2500                    binop(Iop_And8,
2501                          binop(Iop_Sub8,
2502                                mkexpr(shift_amt), mkU8(1)),
2503                          mkU8(31))) );
2504
2505      /* Build the flags thunk. */
2506      setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
2507
2508      /* Narrow the result back down. */
2509      assign( dst1, narrowTo(ty, mkexpr(res32)) );
2510
2511   } /* if (isShift) */
2512
2513   else
2514   if (isRotate) {
2515      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
2516      Bool   left      = toBool(gregOfRM(modrm) == 0);
2517      IRTemp rot_amt   = newTemp(Ity_I8);
2518      IRTemp rot_amt32 = newTemp(Ity_I8);
2519      IRTemp oldFlags  = newTemp(Ity_I32);
2520
2521      /* rot_amt = shift_expr & mask */
2522      /* By masking the rotate amount thusly, the IR-level Shl/Shr
2523         expressions never shift beyond the word size and thus remain
2524         well defined. */
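      /* E.g. (illustration): a 16-bit rotate with a count of 17 ends up
         rotating by 17 & 15 = 1, while the flag thunk below is still
         updated, because the 5-bit-masked count (17 & 31) is nonzero. */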
2525      assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
2526
2527      if (ty == Ity_I32)
2528         assign(rot_amt, mkexpr(rot_amt32));
2529      else
2530         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
2531
2532      if (left) {
2533
2534         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
2535         assign(dst1,
2536            binop( mkSizedOp(ty,Iop_Or8),
2537                   binop( mkSizedOp(ty,Iop_Shl8),
2538                          mkexpr(dst0),
2539                          mkexpr(rot_amt)
2540                   ),
2541                   binop( mkSizedOp(ty,Iop_Shr8),
2542                          mkexpr(dst0),
2543                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2544                   )
2545            )
2546         );
2547         ccOp += X86G_CC_OP_ROLB;
2548
2549      } else { /* right */
2550
2551         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
2552         assign(dst1,
2553            binop( mkSizedOp(ty,Iop_Or8),
2554                   binop( mkSizedOp(ty,Iop_Shr8),
2555                          mkexpr(dst0),
2556                          mkexpr(rot_amt)
2557                   ),
2558                   binop( mkSizedOp(ty,Iop_Shl8),
2559                          mkexpr(dst0),
2560                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2561                   )
2562            )
2563         );
2564         ccOp += X86G_CC_OP_RORB;
2565
2566      }
2567
2568      /* dst1 now holds the rotated value.  Build flag thunk.  We
2569         need the resulting value for this, and the previous flags.
2570         Except don't set it if the rotate count is zero. */
2571
2572      assign(oldFlags, mk_x86g_calculate_eflags_all());
2573
2574      /* rot_amt32 :: Ity_I8.  We need to convert it to I1. */
2575      IRTemp rot_amt32b = newTemp(Ity_I1);
2576      assign(rot_amt32b, binop(Iop_CmpNE8, mkexpr(rot_amt32), mkU8(0)) );
2577
2578      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
2579      stmt( IRStmt_Put( OFFB_CC_OP,
2580                        IRExpr_ITE( mkexpr(rot_amt32b),
2581                                    mkU32(ccOp),
2582                                    IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
2583      stmt( IRStmt_Put( OFFB_CC_DEP1,
2584                        IRExpr_ITE( mkexpr(rot_amt32b),
2585                                    widenUto32(mkexpr(dst1)),
2586                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
2587      stmt( IRStmt_Put( OFFB_CC_DEP2,
2588                        IRExpr_ITE( mkexpr(rot_amt32b),
2589                                    mkU32(0),
2590                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
2591      stmt( IRStmt_Put( OFFB_CC_NDEP,
2592                        IRExpr_ITE( mkexpr(rot_amt32b),
2593                                    mkexpr(oldFlags),
2594                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
2595   } /* if (isRotate) */
2596
2597   /* Save result, and finish up. */
2598   if (epartIsReg(modrm)) {
2599      putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2600      if (vex_traceflags & VEX_TRACE_FE) {
2601         vex_printf("%s%c ",
2602                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2603         if (shift_expr_txt)
2604            vex_printf("%s", shift_expr_txt);
2605         else
2606            ppIRExpr(shift_expr);
2607         vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
2608      }
2609   } else {
2610      storeLE(mkexpr(addr), mkexpr(dst1));
2611      if (vex_traceflags & VEX_TRACE_FE) {
2612         vex_printf("%s%c ",
2613                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2614         if (shift_expr_txt)
2615            vex_printf("%s", shift_expr_txt);
2616         else
2617            ppIRExpr(shift_expr);
2618         vex_printf(", %s\n", dis_buf);
2619      }
2620   }
2621   return delta;
2622}
2623
2624
2625/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
2626static
2627UInt dis_Grp8_Imm ( UChar sorb,
2628                    Bool locked,
2629                    Int delta, UChar modrm,
2630                    Int am_sz, Int sz, UInt src_val,
2631                    Bool* decode_OK )
2632{
2633   /* src_val denotes a d8.
2634      And delta on entry points at the modrm byte. */
2635
2636   IRType ty     = szToITy(sz);
2637   IRTemp t2     = newTemp(Ity_I32);
2638   IRTemp t2m    = newTemp(Ity_I32);
2639   IRTemp t_addr = IRTemp_INVALID;
2640   HChar  dis_buf[50];
2641   UInt   mask;
2642
2643   /* we're optimists :-) */
2644   *decode_OK = True;
2645
2646   /* Limit src_val -- the bit offset -- to something within a word.
2647      The Intel docs say that literal offsets larger than a word are
2648      masked in this way. */
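   /* E.g. (illustration): "btl $35, %eax" tests bit 35 & 31 = 3. */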
2649   switch (sz) {
2650      case 2:  src_val &= 15; break;
2651      case 4:  src_val &= 31; break;
2652      default: *decode_OK = False; return delta;
2653   }
2654
2655   /* Invent a mask suitable for the operation. */
2656   switch (gregOfRM(modrm)) {
2657      case 4: /* BT */  mask = 0;               break;
2658      case 5: /* BTS */ mask = 1 << src_val;    break;
2659      case 6: /* BTR */ mask = ~(1 << src_val); break;
2660      case 7: /* BTC */ mask = 1 << src_val;    break;
2661         /* If this needs to be extended, it is probably simplest to
2662            make a new function to handle the other cases (0 .. 3).
2663            The Intel docs do not, however, indicate any use for
2664            0 .. 3, so we don't expect this to happen. */
2665      default: *decode_OK = False; return delta;
2666   }
2667
2668   /* Fetch the value to be tested and modified into t2, which is
2669      32-bits wide regardless of sz. */
2670   if (epartIsReg(modrm)) {
2671      vassert(am_sz == 1);
2672      assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
2673      delta += (am_sz + 1);
2674      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2675                              src_val, nameIReg(sz,eregOfRM(modrm)));
2676   } else {
2677      Int len;
2678      t_addr = disAMode ( &len, sorb, delta, dis_buf);
2679      delta  += (len+1);
2680      assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
2681      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2682                              src_val, dis_buf);
2683   }
2684
2685   /* Compute the new value into t2m, if non-BT. */
2686   switch (gregOfRM(modrm)) {
2687      case 4: /* BT */
2688         break;
2689      case 5: /* BTS */
2690         assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
2691         break;
2692      case 6: /* BTR */
2693         assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
2694         break;
2695      case 7: /* BTC */
2696         assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
2697         break;
2698      default:
2699         /*NOTREACHED*/ /*the previous switch guards this*/
2700         vassert(0);
2701   }
2702
2703   /* Write the result back, if non-BT.  If the CAS fails then we
2704      side-exit from the trace at this point, and so the flag state is
2705      not affected.  This is of course as required. */
2706   if (gregOfRM(modrm) != 4 /* BT */) {
2707      if (epartIsReg(modrm)) {
2708         putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
2709      } else {
2710         if (locked) {
2711            casLE( mkexpr(t_addr),
2712                   narrowTo(ty, mkexpr(t2))/*expd*/,
2713                   narrowTo(ty, mkexpr(t2m))/*new*/,
2714                   guest_EIP_curr_instr );
2715         } else {
2716            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
2717         }
2718      }
2719   }
2720
2721   /* Copy relevant bit from t2 into the carry flag. */
2722   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
2723   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2724   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2725   stmt( IRStmt_Put(
2726            OFFB_CC_DEP1,
2727            binop(Iop_And32,
2728                  binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
2729                  mkU32(1))
2730       ));
2731   /* Set NDEP even though it isn't used.  This makes redundant-PUT
2732      elimination of previous stores to this field work better. */
2733   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2734
2735   return delta;
2736}
2737
2738
2739/* Signed/unsigned widening multiply.  Generate IR to multiply the
2740   value in EAX/AX/AL by the given IRTemp, and park the result in
2741   EDX:EAX/DX:AX/AX.
2742*/
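/* Worked example (illustrative numbers only): "mull %ebx" with
   EAX = 0x10000 and EBX = 0x10000 leaves the 64-bit product
   0x100000000 split across EDX = 1 (high half) and EAX = 0 (low half). */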
2743static void codegen_mulL_A_D ( Int sz, Bool syned,
2744                               IRTemp tmp, const HChar* tmp_txt )
2745{
2746   IRType ty = szToITy(sz);
2747   IRTemp t1 = newTemp(ty);
2748
2749   assign( t1, getIReg(sz, R_EAX) );
2750
2751   switch (ty) {
2752      case Ity_I32: {
2753         IRTemp res64   = newTemp(Ity_I64);
2754         IRTemp resHi   = newTemp(Ity_I32);
2755         IRTemp resLo   = newTemp(Ity_I32);
2756         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
2757         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2758         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
2759         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2760         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
2761         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
2762         putIReg(4, R_EDX, mkexpr(resHi));
2763         putIReg(4, R_EAX, mkexpr(resLo));
2764         break;
2765      }
2766      case Ity_I16: {
2767         IRTemp res32   = newTemp(Ity_I32);
2768         IRTemp resHi   = newTemp(Ity_I16);
2769         IRTemp resLo   = newTemp(Ity_I16);
2770         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
2771         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2772         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
2773         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2774         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
2775         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
2776         putIReg(2, R_EDX, mkexpr(resHi));
2777         putIReg(2, R_EAX, mkexpr(resLo));
2778         break;
2779      }
2780      case Ity_I8: {
2781         IRTemp res16   = newTemp(Ity_I16);
2782         IRTemp resHi   = newTemp(Ity_I8);
2783         IRTemp resLo   = newTemp(Ity_I8);
2784         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
2785         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2786         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
2787         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2788         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
2789         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
2790         putIReg(2, R_EAX, mkexpr(res16));
2791         break;
2792      }
2793      default:
2794         vpanic("codegen_mulL_A_D(x86)");
2795   }
2796   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
2797}
2798
2799
2800/* Group 3 extended opcodes. */
2801static
2802UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
2803{
2804   UInt    d32;
2805   UChar   modrm;
2806   HChar   dis_buf[50];
2807   Int     len;
2808   IRTemp  addr;
2809   IRType  ty = szToITy(sz);
2810   IRTemp  t1 = newTemp(ty);
2811   IRTemp dst1, src, dst0;
2812
2813   *decode_OK = True; /* may change this later */
2814
2815   modrm = getIByte(delta);
2816
2817   if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
2818      /* LOCK prefix only allowed with not and neg subopcodes */
2819      *decode_OK = False;
2820      return delta;
2821   }
2822
2823   if (epartIsReg(modrm)) {
2824      switch (gregOfRM(modrm)) {
2825         case 0: { /* TEST */
2826            delta++; d32 = getUDisp(sz, delta); delta += sz;
2827            dst1 = newTemp(ty);
2828            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2829                               getIReg(sz,eregOfRM(modrm)),
2830                               mkU(ty,d32)));
2831            setFlags_DEP1( Iop_And8, dst1, ty );
2832            DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
2833                                      nameIReg(sz, eregOfRM(modrm)));
2834            break;
2835         }
2836         case 1: /* UNDEFINED */
2837           /* The Intel docs imply this insn is undefined and binutils
2838              agrees.  Unfortunately Core 2 will run it (with who
2839              knows what result?)  sandpile.org reckons it's an alias
2840              for case 0.  We play safe. */
2841           *decode_OK = False;
2842           break;
2843         case 2: /* NOT */
2844            delta++;
2845            putIReg(sz, eregOfRM(modrm),
2846                        unop(mkSizedOp(ty,Iop_Not8),
2847                             getIReg(sz, eregOfRM(modrm))));
2848            DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2849            break;
2850         case 3: /* NEG */
2851            delta++;
2852            dst0 = newTemp(ty);
2853            src  = newTemp(ty);
2854            dst1 = newTemp(ty);
2855            assign(dst0, mkU(ty,0));
2856            assign(src,  getIReg(sz,eregOfRM(modrm)));
2857            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
2858            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2859            putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2860            DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2861            break;
2862         case 4: /* MUL (unsigned widening) */
2863            delta++;
2864            src = newTemp(ty);
2865            assign(src, getIReg(sz,eregOfRM(modrm)));
2866            codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
2867            break;
2868         case 5: /* IMUL (signed widening) */
2869            delta++;
2870            src = newTemp(ty);
2871            assign(src, getIReg(sz,eregOfRM(modrm)));
2872            codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
2873            break;
2874         case 6: /* DIV */
2875            delta++;
2876            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2877            codegen_div ( sz, t1, False );
2878            DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2879            break;
2880         case 7: /* IDIV */
2881            delta++;
2882            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2883            codegen_div ( sz, t1, True );
2884            DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2885            break;
2886         default:
2887            /* This can't happen - gregOfRM should return 0 .. 7 only */
2888            vpanic("Grp3(x86)");
2889      }
2890   } else {
2891      addr = disAMode ( &len, sorb, delta, dis_buf );
2892      t1   = newTemp(ty);
2893      delta += len;
2894      assign(t1, loadLE(ty,mkexpr(addr)));
2895      switch (gregOfRM(modrm)) {
2896         case 0: { /* TEST */
2897            d32 = getUDisp(sz, delta); delta += sz;
2898            dst1 = newTemp(ty);
2899            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2900                               mkexpr(t1), mkU(ty,d32)));
2901            setFlags_DEP1( Iop_And8, dst1, ty );
2902            DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
2903            break;
2904         }
2905         case 1: /* UNDEFINED */
2906           /* See comment above on R case */
2907           *decode_OK = False;
2908           break;
2909         case 2: /* NOT */
2910            dst1 = newTemp(ty);
2911            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
2912            if (locked) {
2913               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2914                                    guest_EIP_curr_instr );
2915            } else {
2916               storeLE( mkexpr(addr), mkexpr(dst1) );
2917            }
2918            DIP("not%c %s\n", nameISize(sz), dis_buf);
2919            break;
2920         case 3: /* NEG */
2921            dst0 = newTemp(ty);
2922            src  = newTemp(ty);
2923            dst1 = newTemp(ty);
2924            assign(dst0, mkU(ty,0));
2925            assign(src,  mkexpr(t1));
2926            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
2927                               mkexpr(dst0), mkexpr(src)));
2928            if (locked) {
2929               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2930                                    guest_EIP_curr_instr );
2931            } else {
2932               storeLE( mkexpr(addr), mkexpr(dst1) );
2933            }
2934            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2935            DIP("neg%c %s\n", nameISize(sz), dis_buf);
2936            break;
2937         case 4: /* MUL */
2938            codegen_mulL_A_D ( sz, False, t1, dis_buf );
2939            break;
2940         case 5: /* IMUL */
2941            codegen_mulL_A_D ( sz, True, t1, dis_buf );
2942            break;
2943         case 6: /* DIV */
2944            codegen_div ( sz, t1, False );
2945            DIP("div%c %s\n", nameISize(sz), dis_buf);
2946            break;
2947         case 7: /* IDIV */
2948            codegen_div ( sz, t1, True );
2949            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
2950            break;
2951         default:
2952            /* This can't happen - gregOfRM should return 0 .. 7 only */
2953            vpanic("Grp3(x86)");
2954      }
2955   }
2956   return delta;
2957}
2958
2959
2960/* Group 4 extended opcodes. */
2961static
2962UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
2963{
2964   Int   alen;
2965   UChar modrm;
2966   HChar dis_buf[50];
2967   IRType ty = Ity_I8;
2968   IRTemp t1 = newTemp(ty);
2969   IRTemp t2 = newTemp(ty);
2970
2971   *decode_OK = True;
2972
2973   modrm = getIByte(delta);
2974
2975   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
2976      /* LOCK prefix only allowed with inc and dec subopcodes */
2977      *decode_OK = False;
2978      return delta;
2979   }
2980
2981   if (epartIsReg(modrm)) {
2982      assign(t1, getIReg(1, eregOfRM(modrm)));
2983      switch (gregOfRM(modrm)) {
2984         case 0: /* INC */
2985            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
2986            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2987            setFlags_INC_DEC( True, t2, ty );
2988            break;
2989         case 1: /* DEC */
2990            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
2991            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2992            setFlags_INC_DEC( False, t2, ty );
2993            break;
2994         default:
2995            *decode_OK = False;
2996            return delta;
2997      }
2998      delta++;
2999      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
3000                      nameIReg(1, eregOfRM(modrm)));
3001   } else {
3002      IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
3003      assign( t1, loadLE(ty, mkexpr(addr)) );
3004      switch (gregOfRM(modrm)) {
3005         case 0: /* INC */
3006            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
3007            if (locked) {
3008               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3009                      guest_EIP_curr_instr );
3010            } else {
3011               storeLE( mkexpr(addr), mkexpr(t2) );
3012            }
3013            setFlags_INC_DEC( True, t2, ty );
3014            break;
3015         case 1: /* DEC */
3016            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
3017            if (locked) {
3018               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3019                      guest_EIP_curr_instr );
3020            } else {
3021               storeLE( mkexpr(addr), mkexpr(t2) );
3022            }
3023            setFlags_INC_DEC( False, t2, ty );
3024            break;
3025         default:
3026            *decode_OK = False;
3027            return delta;
3028      }
3029      delta += alen;
3030      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
3031   }
3032   return delta;
3033}
3034
3035
3036/* Group 5 extended opcodes. */
3037static
3038UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
3039                /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
3040{
3041   Int     len;
3042   UChar   modrm;
3043   HChar   dis_buf[50];
3044   IRTemp  addr = IRTemp_INVALID;
3045   IRType  ty = szToITy(sz);
3046   IRTemp  t1 = newTemp(ty);
3047   IRTemp  t2 = IRTemp_INVALID;
3048
3049   *decode_OK = True;
3050
3051   modrm = getIByte(delta);
3052
3053   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
3054      /* LOCK prefix only allowed with inc and dec subopcodes */
3055      *decode_OK = False;
3056      return delta;
3057   }
3058
3059   if (epartIsReg(modrm)) {
3060      assign(t1, getIReg(sz,eregOfRM(modrm)));
3061      switch (gregOfRM(modrm)) {
3062         case 0: /* INC */
3063            vassert(sz == 2 || sz == 4);
3064            t2 = newTemp(ty);
3065            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3066                             mkexpr(t1), mkU(ty,1)));
3067            setFlags_INC_DEC( True, t2, ty );
3068            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3069            break;
3070         case 1: /* DEC */
3071            vassert(sz == 2 || sz == 4);
3072            t2 = newTemp(ty);
3073            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3074                             mkexpr(t1), mkU(ty,1)));
3075            setFlags_INC_DEC( False, t2, ty );
3076            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3077            break;
3078         case 2: /* call Ev */
3079            vassert(sz == 4);
3080            t2 = newTemp(Ity_I32);
3081            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3082            putIReg(4, R_ESP, mkexpr(t2));
3083            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
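                /* The value stored is the return address, i.e. the next insn:
                   guest_EIP_bbstart+delta addresses the modrm byte, and the
                   register form of call Ev occupies only that one byte beyond
                   the opcode, hence the +1 (the memory form below uses +len). */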
3084            jmp_treg(dres, Ijk_Call, t1);
3085            vassert(dres->whatNext == Dis_StopHere);
3086            break;
3087         case 4: /* jmp Ev */
3088            vassert(sz == 4);
3089            jmp_treg(dres, Ijk_Boring, t1);
3090            vassert(dres->whatNext == Dis_StopHere);
3091            break;
3092         case 6: /* PUSH Ev */
3093            vassert(sz == 4 || sz == 2);
3094            t2 = newTemp(Ity_I32);
3095            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3096            putIReg(4, R_ESP, mkexpr(t2) );
3097            storeLE( mkexpr(t2), mkexpr(t1) );
3098            break;
3099         default:
3100            *decode_OK = False;
3101            return delta;
3102      }
3103      delta++;
3104      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3105                       nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3106   } else {
3107      addr = disAMode ( &len, sorb, delta, dis_buf );
3108      assign(t1, loadLE(ty,mkexpr(addr)));
3109      switch (gregOfRM(modrm)) {
3110         case 0: /* INC */
3111            t2 = newTemp(ty);
3112            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3113                             mkexpr(t1), mkU(ty,1)));
3114            if (locked) {
3115               casLE( mkexpr(addr),
3116                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3117            } else {
3118               storeLE(mkexpr(addr),mkexpr(t2));
3119            }
3120            setFlags_INC_DEC( True, t2, ty );
3121            break;
3122         case 1: /* DEC */
3123            t2 = newTemp(ty);
3124            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3125                             mkexpr(t1), mkU(ty,1)));
3126            if (locked) {
3127               casLE( mkexpr(addr),
3128                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3129            } else {
3130               storeLE(mkexpr(addr),mkexpr(t2));
3131            }
3132            setFlags_INC_DEC( False, t2, ty );
3133            break;
3134         case 2: /* call Ev */
3135            vassert(sz == 4);
3136            t2 = newTemp(Ity_I32);
3137            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3138            putIReg(4, R_ESP, mkexpr(t2));
3139            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
3140            jmp_treg(dres, Ijk_Call, t1);
3141            vassert(dres->whatNext == Dis_StopHere);
3142            break;
3143         case 4: /* JMP Ev */
3144            vassert(sz == 4);
3145            jmp_treg(dres, Ijk_Boring, t1);
3146            vassert(dres->whatNext == Dis_StopHere);
3147            break;
3148         case 6: /* PUSH Ev */
3149            vassert(sz == 4 || sz == 2);
3150            t2 = newTemp(Ity_I32);
3151            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3152            putIReg(4, R_ESP, mkexpr(t2) );
3153            storeLE( mkexpr(t2), mkexpr(t1) );
3154            break;
3155         default:
3156            *decode_OK = False;
3157            return delta;
3158      }
3159      delta += len;
3160      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3161                       nameISize(sz), dis_buf);
3162   }
3163   return delta;
3164}
3165
3166
3167/*------------------------------------------------------------*/
3168/*--- Disassembling string ops (including REP prefixes)    ---*/
3169/*------------------------------------------------------------*/
3170
3171/* Code shared by all the string ops */
3172static
3173   void dis_string_op_increment(Int sz, IRTemp t_inc)
3174{
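       /* guest_DFLAG is assumed to hold +1 or -1 (as set by CLD/STD).
          Shifting it left by sz/2 scales it to +/-sz: e.g. for sz==4 and
          DFLAG==-1, 0xFFFFFFFF << 2 == 0xFFFFFFFC == -4.  The sz==1 case
          needs no shift. */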
3175   if (sz == 4 || sz == 2) {
3176      assign( t_inc,
3177              binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
3178                               mkU8(sz/2) ) );
3179   } else {
3180      assign( t_inc,
3181              IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
3182   }
3183}
3184
3185static
3186void dis_string_op( void (*dis_OP)( Int, IRTemp ),
3187                    Int sz, const HChar* name, UChar sorb )
3188{
3189   IRTemp t_inc = newTemp(Ity_I32);
3190   vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
3191   dis_string_op_increment(sz, t_inc);
3192   dis_OP( sz, t_inc );
3193   DIP("%s%c\n", name, nameISize(sz));
3194}
3195
3196static
3197void dis_MOVS ( Int sz, IRTemp t_inc )
3198{
3199   IRType ty = szToITy(sz);
3200   IRTemp td = newTemp(Ity_I32);   /* EDI */
3201   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3202
3203   assign( td, getIReg(4, R_EDI) );
3204   assign( ts, getIReg(4, R_ESI) );
3205
3206   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
3207
3208   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3209   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3210}
3211
3212static
3213void dis_LODS ( Int sz, IRTemp t_inc )
3214{
3215   IRType ty = szToITy(sz);
3216   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3217
3218   assign( ts, getIReg(4, R_ESI) );
3219
3220   putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
3221
3222   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3223}
3224
3225static
3226void dis_STOS ( Int sz, IRTemp t_inc )
3227{
3228   IRType ty = szToITy(sz);
3229   IRTemp ta = newTemp(ty);        /* EAX */
3230   IRTemp td = newTemp(Ity_I32);   /* EDI */
3231
3232   assign( ta, getIReg(sz, R_EAX) );
3233   assign( td, getIReg(4, R_EDI) );
3234
3235   storeLE( mkexpr(td), mkexpr(ta) );
3236
3237   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3238}
3239
3240static
3241void dis_CMPS ( Int sz, IRTemp t_inc )
3242{
3243   IRType ty  = szToITy(sz);
3244   IRTemp tdv = newTemp(ty);      /* (EDI) */
3245   IRTemp tsv = newTemp(ty);      /* (ESI) */
3246   IRTemp td  = newTemp(Ity_I32); /*  EDI  */
3247   IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
3248
3249   assign( td, getIReg(4, R_EDI) );
3250   assign( ts, getIReg(4, R_ESI) );
3251
3252   assign( tdv, loadLE(ty,mkexpr(td)) );
3253   assign( tsv, loadLE(ty,mkexpr(ts)) );
3254
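       /* CMPS compares DS:[ESI] against ES:[EDI] by computing [ESI] - [EDI],
          so tsv is DEP1 and tdv is DEP2 here (assuming setFlags_DEP1_DEP2
          builds a DEP1 - DEP2 thunk for the Sub ops). */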
3255   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
3256
3257   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3258   putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3259}
3260
3261static
3262void dis_SCAS ( Int sz, IRTemp t_inc )
3263{
3264   IRType ty  = szToITy(sz);
3265   IRTemp ta  = newTemp(ty);       /*  EAX  */
3266   IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
3267   IRTemp tdv = newTemp(ty);       /* (EDI) */
3268
3269   assign( ta, getIReg(sz, R_EAX) );
3270   assign( td, getIReg(4, R_EDI) );
3271
3272   assign( tdv, loadLE(ty,mkexpr(td)) );
3273   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
3274
3275   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3276}
3277
3278
3279/* Wrap the appropriate string op inside a REP/REPE/REPNE.
3280   We assume the insn is the last one in the basic block, and so emit a jump
3281   to the next insn, rather than just falling through. */
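    /* In outline, the IR emitted for one visit of a REPped insn is:
          if (ECX == 0) goto eip_next;
          ECX := ECX - 1;
          <one iteration of the string op>;
          plain REP:   goto eip                 (re-decode this insn)
          REPE/REPNE:  if (cond) goto eip; else goto eip_next;
       so the loop is unrolled one iteration per visit rather than being
       expressed as an IR-level loop. */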
3282static
3283void dis_REP_op ( /*MOD*/DisResult* dres,
3284                  X86Condcode cond,
3285                  void (*dis_OP)(Int, IRTemp),
3286                  Int sz, Addr32 eip, Addr32 eip_next, const HChar* name )
3287{
3288   IRTemp t_inc = newTemp(Ity_I32);
3289   IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
3290
3291   assign( tc, getIReg(4,R_ECX) );
3292
3293   stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
3294                      Ijk_Boring,
3295                      IRConst_U32(eip_next), OFFB_EIP ) );
3296
3297   putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
3298
3299   dis_string_op_increment(sz, t_inc);
3300   dis_OP (sz, t_inc);
3301
3302   if (cond == X86CondAlways) {
3303      jmp_lit(dres, Ijk_Boring, eip);
3304      vassert(dres->whatNext == Dis_StopHere);
3305   } else {
3306      stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
3307                         Ijk_Boring,
3308                         IRConst_U32(eip), OFFB_EIP ) );
3309      jmp_lit(dres, Ijk_Boring, eip_next);
3310      vassert(dres->whatNext == Dis_StopHere);
3311   }
3312   DIP("%s%c\n", name, nameISize(sz));
3313}
3314
3315
3316/*------------------------------------------------------------*/
3317/*--- Arithmetic, etc.                                     ---*/
3318/*------------------------------------------------------------*/
3319
3320/* IMUL E, G.  Supplied eip points to the modR/M byte. */
3321static
3322UInt dis_mul_E_G ( UChar       sorb,
3323                   Int         size,
3324                   Int         delta0 )
3325{
3326   Int    alen;
3327   HChar  dis_buf[50];
3328   UChar  rm = getIByte(delta0);
3329   IRType ty = szToITy(size);
3330   IRTemp te = newTemp(ty);
3331   IRTemp tg = newTemp(ty);
3332   IRTemp resLo = newTemp(ty);
3333
3334   assign( tg, getIReg(size, gregOfRM(rm)) );
3335   if (epartIsReg(rm)) {
3336      assign( te, getIReg(size, eregOfRM(rm)) );
3337   } else {
3338      IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
3339      assign( te, loadLE(ty,mkexpr(addr)) );
3340   }
3341
3342   setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
3343
3344   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
3345
3346   putIReg(size, gregOfRM(rm), mkexpr(resLo) );
3347
3348   if (epartIsReg(rm)) {
3349      DIP("imul%c %s, %s\n", nameISize(size),
3350                             nameIReg(size,eregOfRM(rm)),
3351                             nameIReg(size,gregOfRM(rm)));
3352      return 1+delta0;
3353   } else {
3354      DIP("imul%c %s, %s\n", nameISize(size),
3355                             dis_buf, nameIReg(size,gregOfRM(rm)));
3356      return alen+delta0;
3357   }
3358}
3359
3360
3361/* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
3362static
3363UInt dis_imul_I_E_G ( UChar       sorb,
3364                      Int         size,
3365                      Int         delta,
3366                      Int         litsize )
3367{
3368   Int    d32, alen;
3369   HChar  dis_buf[50];
3370   UChar  rm = getIByte(delta);
3371   IRType ty = szToITy(size);
3372   IRTemp te = newTemp(ty);
3373   IRTemp tl = newTemp(ty);
3374   IRTemp resLo = newTemp(ty);
3375
3376   vassert(size == 1 || size == 2 || size == 4);
3377
3378   if (epartIsReg(rm)) {
3379      assign(te, getIReg(size, eregOfRM(rm)));
3380      delta++;
3381   } else {
3382      IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
3383      assign(te, loadLE(ty, mkexpr(addr)));
3384      delta += alen;
3385   }
3386   d32 = getSDisp(litsize,delta);
3387   delta += litsize;
3388
3389   if (size == 1) d32 &= 0xFF;
3390   if (size == 2) d32 &= 0xFFFF;
3391
3392   assign(tl, mkU(ty,d32));
3393
3394   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
3395
3396   setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
3397
3398   putIReg(size, gregOfRM(rm), mkexpr(resLo));
3399
3400   DIP("imul %d, %s, %s\n", d32,
3401       ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
3402       nameIReg(size,gregOfRM(rm)) );
3403   return delta;
3404}
3405
3406
3407/* Generate an IR sequence to do a count-leading-zeroes operation on
3408   the supplied IRTemp, and return a new IRTemp holding the result.
3409   'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
3410   argument is zero, return the number of bits in the word (the
3411   natural semantics). */
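    /* Worked example, assuming Iop_Clz32 counts the leading zeroes of a
       nonzero input: for ty==Ity_I16 and src==0x0001, src32x==0x00010000 and
       Clz32 of that is 15 -- the expected lzcnt of a 16-bit operand; a zero
       src takes the ITE branch instead and yields 16. */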
3412static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
3413{
3414   vassert(ty == Ity_I32 || ty == Ity_I16);
3415
3416   IRTemp src32 = newTemp(Ity_I32);
3417   assign(src32, widenUto32( mkexpr(src) ));
3418
3419   IRTemp src32x = newTemp(Ity_I32);
3420   assign(src32x,
3421          binop(Iop_Shl32, mkexpr(src32),
3422                           mkU8(32 - 8 * sizeofIRType(ty))));
3423
3424   // Clz32 has undefined semantics when its input is zero, so
3425   // special-case around that.
3426   IRTemp res32 = newTemp(Ity_I32);
3427   assign(res32,
3428          IRExpr_ITE(
3429             binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0)),
3430             mkU32(8 * sizeofIRType(ty)),
3431             unop(Iop_Clz32, mkexpr(src32x))
3432   ));
3433
3434   IRTemp res = newTemp(ty);
3435   assign(res, narrowTo(ty, mkexpr(res32)));
3436   return res;
3437}
3438
3439
3440/*------------------------------------------------------------*/
3441/*---                                                      ---*/
3442/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
3443/*---                                                      ---*/
3444/*------------------------------------------------------------*/
3445
3446/* --- Helper functions for dealing with the register stack. --- */
3447
3448/* --- Set the emulation-warning pseudo-register. --- */
3449
3450static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
3451{
3452   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3453   stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
3454}
3455
3456/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
3457
3458static IRExpr* mkQNaN64 ( void )
3459{
3460  /* QNaN is 0 2047 1 0(51times)
3461     == 0b 0 11111111111 1 0(51times)
3462     == 0x7FF8 0000 0000 0000
3463   */
3464   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
3465}
3466
3467/* --------- Get/put the top-of-stack pointer. --------- */
3468
3469static IRExpr* get_ftop ( void )
3470{
3471   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
3472}
3473
3474static void put_ftop ( IRExpr* e )
3475{
3476   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3477   stmt( IRStmt_Put( OFFB_FTOP, e ) );
3478}
3479
3480/* --------- Get/put the C3210 bits. --------- */
3481
3482static IRExpr* get_C3210 ( void )
3483{
3484   return IRExpr_Get( OFFB_FC3210, Ity_I32 );
3485}
3486
3487static void put_C3210 ( IRExpr* e )
3488{
3489   stmt( IRStmt_Put( OFFB_FC3210, e ) );
3490}
3491
3492/* --------- Get/put the FPU rounding mode. --------- */
3493static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
3494{
3495   return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
3496}
3497
3498static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
3499{
3500   stmt( IRStmt_Put( OFFB_FPROUND, e ) );
3501}
3502
3503
3504/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
3505/* Produces a value in 0 .. 3, which is encoded as per the type
3506   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
3507   per IRRoundingMode, we merely need to get it and mask it for
3508   safety.
3509*/
3510static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
3511{
3512   return binop( Iop_And32, get_fpround(), mkU32(3) );
3513}
3514
3515static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
3516{
3517   return mkU32(Irrm_NEAREST);
3518}
3519
3520
3521/* --------- Get/set FP register tag bytes. --------- */
3522
3523/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
3524
3525static void put_ST_TAG ( Int i, IRExpr* value )
3526{
3527   IRRegArray* descr;
3528   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
3529   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3530   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3531}
3532
3533/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
3534   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
3535
3536static IRExpr* get_ST_TAG ( Int i )
3537{
3538   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3539   return IRExpr_GetI( descr, get_ftop(), i );
3540}
3541
3542
3543/* --------- Get/set FP registers. --------- */
3544
3545/* Given i, and some expression e, emit 'ST(i) = e' and set the
3546   register's tag to indicate the register is full.  The previous
3547   state of the register is not checked. */
3548
3549static void put_ST_UNCHECKED ( Int i, IRExpr* value )
3550{
3551   IRRegArray* descr;
3552   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
3553   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3554   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3555   /* Mark the register as in-use. */
3556   put_ST_TAG(i, mkU8(1));
3557}
3558
3559/* Given i, and some expression e, emit
3560      ST(i) = is_full(i) ? NaN : e
3561   and set the tag accordingly.
3562*/
3563
3564static void put_ST ( Int i, IRExpr* value )
3565{
3566   put_ST_UNCHECKED(
3567      i,
3568      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
3569                  /* non-0 means full */
3570                  mkQNaN64(),
3571                  /* 0 means empty */
3572                  value
3573      )
3574   );
3575}
3576
3577
3578/* Given i, generate an expression yielding 'ST(i)'. */
3579
3580static IRExpr* get_ST_UNCHECKED ( Int i )
3581{
3582   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3583   return IRExpr_GetI( descr, get_ftop(), i );
3584}
3585
3586
3587/* Given i, generate an expression yielding
3588  is_full(i) ? ST(i) : NaN
3589*/
3590
3591static IRExpr* get_ST ( Int i )
3592{
3593   return
3594      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
3595                  /* non-0 means full */
3596                  get_ST_UNCHECKED(i),
3597                  /* 0 means empty */
3598                  mkQNaN64());
3599}
3600
3601
3602/* Given i, and some expression e, and a condition cond, generate IR
3603   which has the same effect as put_ST(i,e) when cond is true and has
3604   no effect when cond is false.  Given the lack of proper
3605   if-then-else in the IR, this is pretty tricky.
3606*/
3607
3608static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
3609{
3610   // new_tag = if cond then FULL else old_tag
3611   // new_val = if cond then (if old_tag==FULL then NaN else val)
3612   //                   else old_val
3613
3614   IRTemp old_tag = newTemp(Ity_I8);
3615   assign(old_tag, get_ST_TAG(i));
3616   IRTemp new_tag = newTemp(Ity_I8);
3617   assign(new_tag,
3618          IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
3619
3620   IRTemp old_val = newTemp(Ity_F64);
3621   assign(old_val, get_ST_UNCHECKED(i));
3622   IRTemp new_val = newTemp(Ity_F64);
3623   assign(new_val,
3624          IRExpr_ITE(mkexpr(cond),
3625                     IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
3626                                /* non-0 means full */
3627                                mkQNaN64(),
3628                                /* 0 means empty */
3629                                value),
3630                     mkexpr(old_val)));
3631
3632   put_ST_UNCHECKED(i, mkexpr(new_val));
3633   // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
3634   // now set it to new_tag instead.
3635   put_ST_TAG(i, mkexpr(new_tag));
3636}
3637
3638/* Adjust FTOP downwards by one register. */
3639
3640static void fp_push ( void )
3641{
3642   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
3643}
3644
3645/* Adjust FTOP downwards by one register when COND is 1:I1.  Else
3646   don't change it. */
3647
3648static void maybe_fp_push ( IRTemp cond )
3649{
3650   put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
3651}
3652
3653/* Adjust FTOP upwards by one register, and mark the vacated register
3654   as empty.  */
3655
3656static void fp_pop ( void )
3657{
3658   put_ST_TAG(0, mkU8(0));
3659   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
3660}
3661
3662/* Set the C2 bit of the FPU status register to e[0].  Assumes that
3663   e[31:1] == 0.
3664*/
3665static void set_C2 ( IRExpr* e )
3666{
3667   IRExpr* cleared = binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2));
3668   put_C3210( binop(Iop_Or32,
3669                    cleared,
3670                    binop(Iop_Shl32, e, mkU8(X86G_FC_SHIFT_C2))) );
3671}
3672
3673/* Generate code to check that abs(d64) < 2^63 and is finite.  This is
3674   used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
3675   test is simple, but the derivation of it is not so simple.
3676
3677   The exponent field for an IEEE754 double is 11 bits.  That means it
3678   can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
3679   the number is either a NaN or an Infinity and so is not finite.
3680   Furthermore, a finite value of exactly 2^63 is the smallest value
3681   that has exponent value 0x43E.  Hence, what we need to do is
3682   extract the exponent, ignoring the sign bit and mantissa, and check
3683   it is < 0x43E, or <= 0x43D.
3684
3685   To make this easily applicable to 32- and 64-bit targets, a
3686   roundabout approach is used.  First the number is converted to I64,
3687   then the top 32 bits are taken.  Shifting them right by 20 bits
3688   places the sign bit and exponent in the bottom 12 bits.  Anding
3689   with 0x7FF gets rid of the sign bit, leaving just the exponent
3690   available for comparison.
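
       For example, 1.0 == 0x3FF0000000000000 has exponent 0x3FF <= 0x43D and
       so is accepted, whereas 2^63 == 0x43E0000000000000 has exponent 0x43E
       and is rejected.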
3691*/
3692static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
3693{
3694   IRTemp i64 = newTemp(Ity_I64);
3695   assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
3696   IRTemp exponent = newTemp(Ity_I32);
3697   assign(exponent,
3698          binop(Iop_And32,
3699                binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
3700                mkU32(0x7FF)));
3701   IRTemp in_range_and_finite = newTemp(Ity_I1);
3702   assign(in_range_and_finite,
3703          binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
3704   return in_range_and_finite;
3705}
3706
3707/* Invent a plausible-looking FPU status word value:
3708      ((ftop & 7) << 11) | (c3210 & 0x4700)
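          e.g. ftop == 5 with only C3 (0x4000) set in c3210 gives
          (5 << 11) | 0x4000 == 0x2800 | 0x4000 == 0x6800.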
3709 */
3710static IRExpr* get_FPU_sw ( void )
3711{
3712   return
3713      unop(Iop_32to16,
3714           binop(Iop_Or32,
3715                 binop(Iop_Shl32,
3716                       binop(Iop_And32, get_ftop(), mkU32(7)),
3717                       mkU8(11)),
3718                 binop(Iop_And32, get_C3210(), mkU32(0x4700))
3719      ));
3720}
3721
3722
3723/* ------------------------------------------------------- */
3724/* Given all that stack-mangling junk, we can now go ahead
3725   and describe FP instructions.
3726*/
3727
3728/* ST(0) = ST(0) `op` mem64/32(addr)
3729   Need to check ST(0)'s tag on read, but not on write.
3730*/
3731static
3732void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
3733                         IROp op, Bool dbl )
3734{
3735   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3736   if (dbl) {
3737      put_ST_UNCHECKED(0,
3738         triop( op,
3739                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3740                get_ST(0),
3741                loadLE(Ity_F64,mkexpr(addr))
3742         ));
3743   } else {
3744      put_ST_UNCHECKED(0,
3745         triop( op,
3746                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3747                get_ST(0),
3748                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
3749         ));
3750   }
3751}
3752
3753
3754/* ST(0) = mem64/32(addr) `op` ST(0)
3755   Need to check ST(0)'s tag on read, but not on write.
3756*/
3757static
3758void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
3759                            IROp op, Bool dbl )
3760{
3761   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3762   if (dbl) {
3763      put_ST_UNCHECKED(0,
3764         triop( op,
3765                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3766                loadLE(Ity_F64,mkexpr(addr)),
3767                get_ST(0)
3768         ));
3769   } else {
3770      put_ST_UNCHECKED(0,
3771         triop( op,
3772                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3773                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
3774                get_ST(0)
3775         ));
3776   }
3777}
3778
3779
3780/* ST(dst) = ST(dst) `op` ST(src).
3781   Check dst and src tags when reading but not on write.
3782*/
3783static
3784void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3785                      Bool pop_after )
3786{
3787   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"",
3788                                 st_src, st_dst);
3789   put_ST_UNCHECKED(
3790      st_dst,
3791      triop( op,
3792             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3793             get_ST(st_dst),
3794             get_ST(st_src) )
3795   );
3796   if (pop_after)
3797      fp_pop();
3798}
3799
3800/* ST(dst) = ST(src) `op` ST(dst).
3801   Check dst and src tags when reading but not on write.
3802*/
3803static
3804void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src,
3805                         UInt st_dst, Bool pop_after )
3806{
3807   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"",
3808                                 st_src, st_dst);
3809   put_ST_UNCHECKED(
3810      st_dst,
3811      triop( op,
3812             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3813             get_ST(st_src),
3814             get_ST(st_dst) )
3815   );
3816   if (pop_after)
3817      fp_pop();
3818}
3819
3820/* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
3821static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
3822{
3823   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
3824   /* This is a bit of a hack (and isn't really right).  It sets
3825      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
3826      documentation implies A and S are unchanged.
3827   */
3828   /* It's also fishy in that it is used both for COMIP and
3829      UCOMIP, and they aren't the same (although similar). */
3830   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
3831   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
3832   stmt( IRStmt_Put( OFFB_CC_DEP1,
3833                     binop( Iop_And32,
3834                            binop(Iop_CmpF64, get_ST(0), get_ST(i)),
3835                            mkU32(0x45)
3836       )));
3837   /* Set NDEP even though it isn't used.  This makes redundant-PUT
3838      elimination of previous stores to this field work better. */
3839   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
3840   if (pop_after)
3841      fp_pop();
3842}
3843
3844
3845static
3846UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
3847{
3848   Int    len;
3849   UInt   r_src, r_dst;
3850   HChar  dis_buf[50];
3851   IRTemp t1, t2;
3852
3853   /* On entry, delta points at the second byte of the insn (the modrm
3854      byte). */
3855   UChar first_opcode = getIByte(delta-1);
3856   UChar modrm        = getIByte(delta+0);
3857
3858   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
3859
3860   if (first_opcode == 0xD8) {
3861      if (modrm < 0xC0) {
3862
3863         /* bits 5,4,3 are an opcode extension, and the modRM also
3864            specifies an address. */
3865         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3866         delta += len;
3867
3868         switch (gregOfRM(modrm)) {
3869
3870            case 0: /* FADD single-real */
3871               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
3872               break;
3873
3874            case 1: /* FMUL single-real */
3875               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
3876               break;
3877
3878            case 2: /* FCOM single-real */
3879               DIP("fcoms %s\n", dis_buf);
3880               /* This forces C1 to zero, which isn't right. */
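                   /* Assuming Iop_CmpF64 returns the IRCmpF64Result encoding
                      (0x45 unordered, 0x40 EQ, 0x01 LT, 0x00 GT), the shift
                      left by 8 and mask with 0x4500 land the result in
                      C3/C2/C0 just as FCOM requires: unordered -> C3|C2|C0,
                      equal -> C3, less -> C0, greater -> all clear. */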
3881               put_C3210(
3882                   binop( Iop_And32,
3883                          binop(Iop_Shl32,
3884                                binop(Iop_CmpF64,
3885                                      get_ST(0),
3886                                      unop(Iop_F32toF64,
3887                                           loadLE(Ity_F32,mkexpr(addr)))),
3888                                mkU8(8)),
3889                          mkU32(0x4500)
3890                   ));
3891               break;
3892
3893            case 3: /* FCOMP single-real */
3894               DIP("fcomps %s\n", dis_buf);
3895               /* This forces C1 to zero, which isn't right. */
3896               put_C3210(
3897                   binop( Iop_And32,
3898                          binop(Iop_Shl32,
3899                                binop(Iop_CmpF64,
3900                                      get_ST(0),
3901                                      unop(Iop_F32toF64,
3902                                           loadLE(Ity_F32,mkexpr(addr)))),
3903                                mkU8(8)),
3904                          mkU32(0x4500)
3905                   ));
3906               fp_pop();
3907               break;
3908
3909            case 4: /* FSUB single-real */
3910               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
3911               break;
3912
3913            case 5: /* FSUBR single-real */
3914               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
3915               break;
3916
3917            case 6: /* FDIV single-real */
3918               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
3919               break;
3920
3921            case 7: /* FDIVR single-real */
3922               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
3923               break;
3924
3925            default:
3926               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
3927               vex_printf("first_opcode == 0xD8\n");
3928               goto decode_fail;
3929         }
3930      } else {
3931         delta++;
3932         switch (modrm) {
3933
3934            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
3935               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
3936               break;
3937
3938            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
3939               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
3940               break;
3941
3942            /* Dunno if this is right */
3943            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
3944               r_dst = (UInt)modrm - 0xD0;
3945               DIP("fcom %%st(0),%%st(%u)\n", r_dst);
3946               /* This forces C1 to zero, which isn't right. */
3947               put_C3210(
3948                   binop( Iop_And32,
3949                          binop(Iop_Shl32,
3950                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3951                                mkU8(8)),
3952                          mkU32(0x4500)
3953                   ));
3954               break;
3955
3956            /* Dunno if this is right */
3957            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
3958               r_dst = (UInt)modrm - 0xD8;
3959               DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
3960               /* This forces C1 to zero, which isn't right. */
3961               put_C3210(
3962                   binop( Iop_And32,
3963                          binop(Iop_Shl32,
3964                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3965                                mkU8(8)),
3966                          mkU32(0x4500)
3967                   ));
3968               fp_pop();
3969               break;
3970
3971            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
3972               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
3973               break;
3974
3975            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
3976               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
3977               break;
3978
3979            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
3980               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
3981               break;
3982
3983            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
3984               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
3985               break;
3986
3987            default:
3988               goto decode_fail;
3989         }
3990      }
3991   }
3992
3993   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
3994   else
3995   if (first_opcode == 0xD9) {
3996      if (modrm < 0xC0) {
3997
3998         /* bits 5,4,3 are an opcode extension, and the modRM also
3999            specifies an address. */
4000         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4001         delta += len;
4002
4003         switch (gregOfRM(modrm)) {
4004
4005            case 0: /* FLD single-real */
4006               DIP("flds %s\n", dis_buf);
4007               fp_push();
4008               put_ST(0, unop(Iop_F32toF64,
4009                              loadLE(Ity_F32, mkexpr(addr))));
4010               break;
4011
4012            case 2: /* FST single-real */
4013               DIP("fsts %s\n", dis_buf);
4014               storeLE(mkexpr(addr),
4015                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
4016               break;
4017
4018            case 3: /* FSTP single-real */
4019               DIP("fstps %s\n", dis_buf);
4020               storeLE(mkexpr(addr),
4021                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
4022               fp_pop();
4023               break;
4024
4025            case 4: { /* FLDENV m28 */
4026               /* Uses dirty helper:
4027                     VexEmNote x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
4028               IRTemp   ew = newTemp(Ity_I32);
4029               IRDirty* d  = unsafeIRDirty_0_N (
4030                                0/*regparms*/,
4031                                "x86g_dirtyhelper_FLDENV",
4032                                &x86g_dirtyhelper_FLDENV,
4033                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
4034                             );
4035               d->tmp   = ew;
4036               /* declare we're reading memory */
4037               d->mFx   = Ifx_Read;
4038               d->mAddr = mkexpr(addr);
4039               d->mSize = 28;
4040
4041               /* declare we're writing guest state */
4042               d->nFxState = 4;
4043               vex_bzero(&d->fxState, sizeof(d->fxState));
4044
4045               d->fxState[0].fx     = Ifx_Write;
4046               d->fxState[0].offset = OFFB_FTOP;
4047               d->fxState[0].size   = sizeof(UInt);
4048
4049               d->fxState[1].fx     = Ifx_Write;
4050               d->fxState[1].offset = OFFB_FPTAGS;
4051               d->fxState[1].size   = 8 * sizeof(UChar);
4052
4053               d->fxState[2].fx     = Ifx_Write;
4054               d->fxState[2].offset = OFFB_FPROUND;
4055               d->fxState[2].size   = sizeof(UInt);
4056
4057               d->fxState[3].fx     = Ifx_Write;
4058               d->fxState[3].offset = OFFB_FC3210;
4059               d->fxState[3].size   = sizeof(UInt);
4060
4061               stmt( IRStmt_Dirty(d) );
4062
4063               /* ew contains any emulation warning we may need to
4064                  issue.  If needed, side-exit to the next insn,
4065                  reporting the warning, so that Valgrind's dispatcher
4066                  sees the warning. */
4067               put_emwarn( mkexpr(ew) );
4068               stmt(
4069                  IRStmt_Exit(
4070                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4071                     Ijk_EmWarn,
4072                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4073                     OFFB_EIP
4074                  )
4075               );
4076
4077               DIP("fldenv %s\n", dis_buf);
4078               break;
4079            }
4080
4081            case 5: { /* FLDCW */
4082               /* The only thing we observe in the control word is the
4083                  rounding mode.  Therefore, pass the 16-bit value
4084                  (x87 native-format control word) to a clean helper,
4085                  getting back a 64-bit value, the lower half of which
4086                  is the FPROUND value to store, and the upper half of
4087                  which is the emulation-warning token which may be
4088                  generated.
4089               */
4090               /* ULong x86g_check_fldcw ( UInt ); */
4091               IRTemp t64 = newTemp(Ity_I64);
4092               IRTemp ew = newTemp(Ity_I32);
4093               DIP("fldcw %s\n", dis_buf);
4094               assign( t64, mkIRExprCCall(
4095                               Ity_I64, 0/*regparms*/,
4096                               "x86g_check_fldcw",
4097                               &x86g_check_fldcw,
4098                               mkIRExprVec_1(
4099                                  unop( Iop_16Uto32,
4100                                        loadLE(Ity_I16, mkexpr(addr)))
4101                               )
4102                            )
4103                     );
4104
4105               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
4106               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
4107               put_emwarn( mkexpr(ew) );
4108               /* Finally, if an emulation warning was reported,
4109                  side-exit to the next insn, reporting the warning,
4110                  so that Valgrind's dispatcher sees the warning. */
4111               stmt(
4112                  IRStmt_Exit(
4113                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4114                     Ijk_EmWarn,
4115                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4116                     OFFB_EIP
4117                  )
4118               );
4119               break;
4120            }
4121
4122            case 6: { /* FNSTENV m28 */
4123               /* Uses dirty helper:
4124                     void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
4125               IRDirty* d = unsafeIRDirty_0_N (
4126                               0/*regparms*/,
4127                               "x86g_dirtyhelper_FSTENV",
4128                               &x86g_dirtyhelper_FSTENV,
4129                               mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
4130                            );
4131               /* declare we're writing memory */
4132               d->mFx   = Ifx_Write;
4133               d->mAddr = mkexpr(addr);
4134               d->mSize = 28;
4135
4136               /* declare we're reading guest state */
4137               d->nFxState = 4;
4138               vex_bzero(&d->fxState, sizeof(d->fxState));
4139
4140               d->fxState[0].fx     = Ifx_Read;
4141               d->fxState[0].offset = OFFB_FTOP;
4142               d->fxState[0].size   = sizeof(UInt);
4143
4144               d->fxState[1].fx     = Ifx_Read;
4145               d->fxState[1].offset = OFFB_FPTAGS;
4146               d->fxState[1].size   = 8 * sizeof(UChar);
4147
4148               d->fxState[2].fx     = Ifx_Read;
4149               d->fxState[2].offset = OFFB_FPROUND;
4150               d->fxState[2].size   = sizeof(UInt);
4151
4152               d->fxState[3].fx     = Ifx_Read;
4153               d->fxState[3].offset = OFFB_FC3210;
4154               d->fxState[3].size   = sizeof(UInt);
4155
4156               stmt( IRStmt_Dirty(d) );
4157
4158               DIP("fnstenv %s\n", dis_buf);
4159               break;
4160            }
4161
4162            case 7: /* FNSTCW */
4163              /* Fake up a native x87 FPU control word.  The only
4164                 thing it depends on is FPROUND[1:0], so call a clean
4165                 helper to cook it up. */
4166               /* UInt x86g_create_fpucw ( UInt fpround ) */
4167               DIP("fnstcw %s\n", dis_buf);
4168               storeLE(
4169                  mkexpr(addr),
4170                  unop( Iop_32to16,
4171                        mkIRExprCCall(
4172                           Ity_I32, 0/*regp*/,
4173                           "x86g_create_fpucw", &x86g_create_fpucw,
4174                           mkIRExprVec_1( get_fpround() )
4175                        )
4176                  )
4177               );
4178               break;
4179
4180            default:
4181               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
4182               vex_printf("first_opcode == 0xD9\n");
4183               goto decode_fail;
4184         }
4185
4186      } else {
4187         delta++;
4188         switch (modrm) {
4189
4190            case 0xC0 ... 0xC7: /* FLD %st(?) */
4191               r_src = (UInt)modrm - 0xC0;
4192               DIP("fld %%st(%u)\n", r_src);
4193               t1 = newTemp(Ity_F64);
4194               assign(t1, get_ST(r_src));
4195               fp_push();
4196               put_ST(0, mkexpr(t1));
4197               break;
4198
4199            case 0xC8 ... 0xCF: /* FXCH %st(?) */
4200               r_src = (UInt)modrm - 0xC8;
4201               DIP("fxch %%st(%u)\n", r_src);
4202               t1 = newTemp(Ity_F64);
4203               t2 = newTemp(Ity_F64);
4204               assign(t1, get_ST(0));
4205               assign(t2, get_ST(r_src));
4206               put_ST_UNCHECKED(0, mkexpr(t2));
4207               put_ST_UNCHECKED(r_src, mkexpr(t1));
4208               break;
4209
4210            case 0xE0: /* FCHS */
4211               DIP("fchs\n");
4212               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
4213               break;
4214
4215            case 0xE1: /* FABS */
4216               DIP("fabs\n");
4217               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
4218               break;
4219
4220            case 0xE4: /* FTST */
4221               DIP("ftst\n");
4222               /* This forces C1 to zero, which isn't right. */
4223               /* Well, in fact the Intel docs say (bizarrely): "C1 is
4224                  set to 0 if stack underflow occurred; otherwise, set
4225                  to 0" which is pretty nonsensical.  I guess it's a
4226                   typo. */
4227               put_C3210(
4228                   binop( Iop_And32,
4229                          binop(Iop_Shl32,
4230                                binop(Iop_CmpF64,
4231                                      get_ST(0),
4232                                      IRExpr_Const(IRConst_F64i(0x0ULL))),
4233                                mkU8(8)),
4234                          mkU32(0x4500)
4235                   ));
4236               break;
4237
4238            case 0xE5: { /* FXAM */
4239               /* This is an interesting one.  It examines %st(0),
4240                  regardless of whether the tag says it's empty or not.
4241                  Here, just pass both the tag (in our format) and the
4242                  value (as a double, actually a ULong) to a helper
4243                  function. */
4244               IRExpr** args
4245                  = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
4246                                   unop(Iop_ReinterpF64asI64,
4247                                        get_ST_UNCHECKED(0)) );
4248               put_C3210(mkIRExprCCall(
4249                            Ity_I32,
4250                            0/*regparm*/,
4251                            "x86g_calculate_FXAM", &x86g_calculate_FXAM,
4252                            args
4253                        ));
4254               DIP("fxam\n");
4255               break;
4256            }
4257
4258            case 0xE8: /* FLD1 */
4259               DIP("fld1\n");
4260               fp_push();
4261               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
4262               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
4263               break;
4264
4265            case 0xE9: /* FLDL2T */
4266               DIP("fldl2t\n");
4267               fp_push();
4268               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
4269               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
4270               break;
4271
4272            case 0xEA: /* FLDL2E */
4273               DIP("fldl2e\n");
4274               fp_push();
4275               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
4276               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
4277               break;
4278
4279            case 0xEB: /* FLDPI */
4280               DIP("fldpi\n");
4281               fp_push();
4282               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
4283               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
4284               break;
4285
4286            case 0xEC: /* FLDLG2 */
4287               DIP("fldlg2\n");
4288               fp_push();
4289               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
4290               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
4291               break;
4292
4293            case 0xED: /* FLDLN2 */
4294               DIP("fldln2\n");
4295               fp_push();
4296               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
4297               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
4298               break;
4299
4300            case 0xEE: /* FLDZ */
4301               DIP("fldz\n");
4302               fp_push();
4303               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
4304               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
4305               break;
4306
4307            case 0xF0: /* F2XM1 */
4308               DIP("f2xm1\n");
4309               put_ST_UNCHECKED(0,
4310                  binop(Iop_2xm1F64,
4311                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4312                        get_ST(0)));
4313               break;
4314
4315            case 0xF1: /* FYL2X */
4316               DIP("fyl2x\n");
4317               put_ST_UNCHECKED(1,
4318                  triop(Iop_Yl2xF64,
4319                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4320                        get_ST(1),
4321                        get_ST(0)));
4322               fp_pop();
4323               break;
4324
4325            case 0xF2: { /* FPTAN */
4326               DIP("fptan\n");
4327               IRTemp argD = newTemp(Ity_F64);
4328               assign(argD, get_ST(0));
4329               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
4330               IRTemp resD = newTemp(Ity_F64);
4331               assign(resD,
4332                  IRExpr_ITE(
4333                     mkexpr(argOK),
4334                     binop(Iop_TanF64,
4335                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4336                           mkexpr(argD)),
4337                     mkexpr(argD))
4338               );
4339               put_ST_UNCHECKED(0, mkexpr(resD));
4340               /* Conditionally push 1.0 on the stack, if the arg is
4341                  in range */
4342               maybe_fp_push(argOK);
4343               maybe_put_ST(argOK, 0,
4344                            IRExpr_Const(IRConst_F64(1.0)));
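                   /* Per the x87 rules C2==1 reports that the operand was out
                      of range and nothing was computed or pushed, so C2 is set
                      below to the negation of argOK. */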
4345               set_C2( binop(Iop_Xor32,
4346                             unop(Iop_1Uto32, mkexpr(argOK)),
4347                             mkU32(1)) );
4348               break;
4349            }
4350
4351            case 0xF3: /* FPATAN */
4352               DIP("fpatan\n");
4353               put_ST_UNCHECKED(1,
4354                  triop(Iop_AtanF64,
4355                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4356                        get_ST(1),
4357                        get_ST(0)));
4358               fp_pop();
4359               break;
4360
4361            case 0xF4: { /* FXTRACT */
4362               IRTemp argF = newTemp(Ity_F64);
4363               IRTemp sigF = newTemp(Ity_F64);
4364               IRTemp expF = newTemp(Ity_F64);
4365               IRTemp argI = newTemp(Ity_I64);
4366               IRTemp sigI = newTemp(Ity_I64);
4367               IRTemp expI = newTemp(Ity_I64);
4368               DIP("fxtract\n");
4369               assign( argF, get_ST(0) );
4370               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
4371               assign( sigI,
4372                       mkIRExprCCall(
4373                          Ity_I64, 0/*regparms*/,
4374                          "x86amd64g_calculate_FXTRACT",
4375                          &x86amd64g_calculate_FXTRACT,
4376                          mkIRExprVec_2( mkexpr(argI),
4377                                         mkIRExpr_HWord(0)/*sig*/ ))
4378               );
4379               assign( expI,
4380                       mkIRExprCCall(
4381                          Ity_I64, 0/*regparms*/,
4382                          "x86amd64g_calculate_FXTRACT",
4383                          &x86amd64g_calculate_FXTRACT,
4384                          mkIRExprVec_2( mkexpr(argI),
4385                                         mkIRExpr_HWord(1)/*exp*/ ))
4386               );
4387               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
4388               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
4389               /* exponent */
4390               put_ST_UNCHECKED(0, mkexpr(expF) );
4391               fp_push();
4392               /* significand */
4393               put_ST(0, mkexpr(sigF) );
4394               break;
4395            }
4396
4397            case 0xF5: { /* FPREM1 -- IEEE compliant */
4398               IRTemp a1 = newTemp(Ity_F64);
4399               IRTemp a2 = newTemp(Ity_F64);
4400               DIP("fprem1\n");
4401               /* Do FPREM1 twice, once to get the remainder, and once
4402                  to get the C3210 flag values. */
4403               assign( a1, get_ST(0) );
4404               assign( a2, get_ST(1) );
4405               put_ST_UNCHECKED(0,
4406                  triop(Iop_PRem1F64,
4407                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4408                        mkexpr(a1),
4409                        mkexpr(a2)));
4410               put_C3210(
4411                  triop(Iop_PRem1C3210F64,
4412                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4413                        mkexpr(a1),
4414                        mkexpr(a2)) );
4415               break;
4416            }
4417
4418            case 0xF7: /* FINCSTP */
4419               DIP("fincstp\n");
4420               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
4421               break;
4422
4423            case 0xF8: { /* FPREM -- not IEEE compliant */
4424               IRTemp a1 = newTemp(Ity_F64);
4425               IRTemp a2 = newTemp(Ity_F64);
4426               DIP("fprem\n");
4427               /* Do FPREM twice, once to get the remainder, and once
4428                  to get the C3210 flag values. */
4429               assign( a1, get_ST(0) );
4430               assign( a2, get_ST(1) );
4431               put_ST_UNCHECKED(0,
4432                  triop(Iop_PRemF64,
4433                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4434                        mkexpr(a1),
4435                        mkexpr(a2)));
4436               put_C3210(
4437                  triop(Iop_PRemC3210F64,
4438                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4439                        mkexpr(a1),
4440                        mkexpr(a2)) );
4441               break;
4442            }
4443
4444            case 0xF9: /* FYL2XP1 */
4445               DIP("fyl2xp1\n");
4446               put_ST_UNCHECKED(1,
4447                  triop(Iop_Yl2xp1F64,
4448                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4449                        get_ST(1),
4450                        get_ST(0)));
4451               fp_pop();
4452               break;
4453
4454            case 0xFA: /* FSQRT */
4455               DIP("fsqrt\n");
4456               put_ST_UNCHECKED(0,
4457                  binop(Iop_SqrtF64,
4458                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4459                        get_ST(0)));
4460               break;
4461
4462            case 0xFB: { /* FSINCOS */
4463               DIP("fsincos\n");
4464               IRTemp argD = newTemp(Ity_F64);
4465               assign(argD, get_ST(0));
4466               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
4467               IRTemp resD = newTemp(Ity_F64);
4468               assign(resD,
4469                  IRExpr_ITE(
4470                     mkexpr(argOK),
4471                     binop(Iop_SinF64,
4472                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4473                           mkexpr(argD)),
4474                     mkexpr(argD))
4475               );
4476               put_ST_UNCHECKED(0, mkexpr(resD));
4477               /* Conditionally push the cos value on the stack, if
4478                  the arg is in range */
4479               maybe_fp_push(argOK);
4480               maybe_put_ST(argOK, 0,
4481                  binop(Iop_CosF64,
4482                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4483                        mkexpr(argD)));
4484               set_C2( binop(Iop_Xor32,
4485                             unop(Iop_1Uto32, mkexpr(argOK)),
4486                             mkU32(1)) );
4487               break;
4488            }
4489
4490            case 0xFC: /* FRNDINT */
4491               DIP("frndint\n");
4492               put_ST_UNCHECKED(0,
4493                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
4494               break;
4495
4496            case 0xFD: /* FSCALE */
4497               DIP("fscale\n");
4498               put_ST_UNCHECKED(0,
4499                  triop(Iop_ScaleF64,
4500                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4501                        get_ST(0),
4502                        get_ST(1)));
4503               break;
4504
4505            case 0xFE:   /* FSIN */
4506            case 0xFF: { /* FCOS */
4507               Bool isSIN = modrm == 0xFE;
4508               DIP("%s\n", isSIN ? "fsin" : "fcos");
4509               IRTemp argD = newTemp(Ity_F64);
4510               assign(argD, get_ST(0));
4511               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
4512               IRTemp resD = newTemp(Ity_F64);
4513               assign(resD,
4514                  IRExpr_ITE(
4515                     mkexpr(argOK),
4516                     binop(isSIN ? Iop_SinF64 : Iop_CosF64,
4517                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4518                           mkexpr(argD)),
4519                     mkexpr(argD))
4520               );
4521               put_ST_UNCHECKED(0, mkexpr(resD));
4522               set_C2( binop(Iop_Xor32,
4523                             unop(Iop_1Uto32, mkexpr(argOK)),
4524                             mkU32(1)) );
4525               break;
4526            }
4527
4528            default:
4529               goto decode_fail;
4530         }
4531      }
4532   }
4533
4534   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
4535   else
4536   if (first_opcode == 0xDA) {
4537
4538      if (modrm < 0xC0) {
4539
4540         /* bits 5,4,3 are an opcode extension, and the modRM also
4541            specifies an address. */
4542         IROp   fop;
4543         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4544         delta += len;
4545         switch (gregOfRM(modrm)) {
4546
4547            case 0: /* FIADD m32int */ /* ST(0) += m32int */
4548               DIP("fiaddl %s\n", dis_buf);
4549               fop = Iop_AddF64;
4550               goto do_fop_m32;
4551
4552            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
4553               DIP("fimull %s\n", dis_buf);
4554               fop = Iop_MulF64;
4555               goto do_fop_m32;
4556
4557            case 2: /* FICOM m32int */
4558               DIP("ficoml %s\n", dis_buf);
4559               /* This forces C1 to zero, which isn't right. */
4560               put_C3210(
4561                   binop( Iop_And32,
4562                          binop(Iop_Shl32,
4563                                binop(Iop_CmpF64,
4564                                      get_ST(0),
4565                                      unop(Iop_I32StoF64,
4566                                           loadLE(Ity_I32,mkexpr(addr)))),
4567                                mkU8(8)),
4568                          mkU32(0x4500)
4569                   ));
4570               break;
4571
4572            case 3: /* FICOMP m32int */
4573               DIP("ficompl %s\n", dis_buf);
4574               /* This forces C1 to zero, which isn't right. */
4575               put_C3210(
4576                   binop( Iop_And32,
4577                          binop(Iop_Shl32,
4578                                binop(Iop_CmpF64,
4579                                      get_ST(0),
4580                                      unop(Iop_I32StoF64,
4581                                           loadLE(Ity_I32,mkexpr(addr)))),
4582                                mkU8(8)),
4583                          mkU32(0x4500)
4584                   ));
4585               fp_pop();
4586               break;
4587
4588            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
4589               DIP("fisubl %s\n", dis_buf);
4590               fop = Iop_SubF64;
4591               goto do_fop_m32;
4592
4593            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
4594               DIP("fisubrl %s\n", dis_buf);
4595               fop = Iop_SubF64;
4596               goto do_foprev_m32;
4597
4598            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
4599               DIP("fidivl %s\n", dis_buf);
4600               fop = Iop_DivF64;
4601               goto do_fop_m32;
4602
4603            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
4604               DIP("fidivrl %s\n", dis_buf);
4605               fop = Iop_DivF64;
4606               goto do_foprev_m32;
4607
4608            do_fop_m32:
4609               put_ST_UNCHECKED(0,
4610                  triop(fop,
4611                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4612                        get_ST(0),
4613                        unop(Iop_I32StoF64,
4614                             loadLE(Ity_I32, mkexpr(addr)))));
4615               break;
4616
4617            do_foprev_m32:
4618               put_ST_UNCHECKED(0,
4619                  triop(fop,
4620                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4621                        unop(Iop_I32StoF64,
4622                             loadLE(Ity_I32, mkexpr(addr))),
4623                        get_ST(0)));
4624               break;
4625
4626            default:
4627               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
4628               vex_printf("first_opcode == 0xDA\n");
4629               goto decode_fail;
4630         }
4631
4632      } else {
4633
4634         delta++;
4635         switch (modrm) {
4636
4637            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
4638               r_src = (UInt)modrm - 0xC0;
4639               DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
4640               put_ST_UNCHECKED(0,
4641                                IRExpr_ITE(
4642                                    mk_x86g_calculate_condition(X86CondB),
4643                                    get_ST(r_src), get_ST(0)) );
4644               break;
4645
4646            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
4647               r_src = (UInt)modrm - 0xC8;
4648               DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
4649               put_ST_UNCHECKED(0,
4650                                IRExpr_ITE(
4651                                    mk_x86g_calculate_condition(X86CondZ),
4652                                    get_ST(r_src), get_ST(0)) );
4653               break;
4654
4655            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
4656               r_src = (UInt)modrm - 0xD0;
4657               DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
4658               put_ST_UNCHECKED(0,
4659                                IRExpr_ITE(
4660                                    mk_x86g_calculate_condition(X86CondBE),
4661                                    get_ST(r_src), get_ST(0)) );
4662               break;
4663
4664            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
4665               r_src = (UInt)modrm - 0xD8;
4666               DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
4667               put_ST_UNCHECKED(0,
4668                                IRExpr_ITE(
4669                                    mk_x86g_calculate_condition(X86CondP),
4670                                    get_ST(r_src), get_ST(0)) );
4671               break;
4672
4673            case 0xE9: /* FUCOMPP %st(0),%st(1) */
4674               DIP("fucompp %%st(0),%%st(1)\n");
4675               /* This forces C1 to zero, which isn't right. */
4676               put_C3210(
4677                   binop( Iop_And32,
4678                          binop(Iop_Shl32,
4679                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
4680                                mkU8(8)),
4681                          mkU32(0x4500)
4682                   ));
4683               fp_pop();
4684               fp_pop();
4685               break;
4686
4687            default:
4688               goto decode_fail;
4689         }
4690
4691      }
4692   }
4693
4694   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
4695   else
4696   if (first_opcode == 0xDB) {
4697      if (modrm < 0xC0) {
4698
4699         /* bits 5,4,3 are an opcode extension, and the modRM also
4700            specifies an address. */
4701         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4702         delta += len;
4703
4704         switch (gregOfRM(modrm)) {
4705
4706            case 0: /* FILD m32int */
4707               DIP("fildl %s\n", dis_buf);
4708               fp_push();
4709               put_ST(0, unop(Iop_I32StoF64,
4710                              loadLE(Ity_I32, mkexpr(addr))));
4711               break;
4712
4713            case 1: /* FISTTPL m32 (SSE3) */
4714               DIP("fisttpl %s\n", dis_buf);
4715               storeLE( mkexpr(addr),
4716                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
4717               fp_pop();
4718               break;
4719
4720            case 2: /* FIST m32 */
4721               DIP("fistl %s\n", dis_buf);
4722               storeLE( mkexpr(addr),
4723                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4724               break;
4725
4726            case 3: /* FISTP m32 */
4727               DIP("fistpl %s\n", dis_buf);
4728               storeLE( mkexpr(addr),
4729                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4730               fp_pop();
4731               break;
4732
4733            case 5: { /* FLD extended-real */
4734               /* Uses dirty helper:
4735                     ULong x86g_dirtyhelper_loadF80le ( UInt )
4736                  addr holds the address.  First, do a dirty call to
4737                  get hold of the data. */
4738               IRTemp   val  = newTemp(Ity_I64);
4739               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
4740
4741               IRDirty* d = unsafeIRDirty_1_N (
4742                               val,
4743                               0/*regparms*/,
4744                               "x86g_dirtyhelper_loadF80le",
4745                               &x86g_dirtyhelper_loadF80le,
4746                               args
4747                            );
4748               /* declare that we're reading memory */
4749               d->mFx   = Ifx_Read;
4750               d->mAddr = mkexpr(addr);
4751               d->mSize = 10;
4752
4753               /* execute the dirty call, dumping the result in val. */
4754               stmt( IRStmt_Dirty(d) );
4755               fp_push();
4756               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
4757
4758               DIP("fldt %s\n", dis_buf);
4759               break;
4760            }
4761
4762            case 7: { /* FSTP extended-real */
4763               /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
4764               IRExpr** args
4765                  = mkIRExprVec_2( mkexpr(addr),
4766                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
4767
4768               IRDirty* d = unsafeIRDirty_0_N (
4769                               0/*regparms*/,
4770                               "x86g_dirtyhelper_storeF80le",
4771                               &x86g_dirtyhelper_storeF80le,
4772                               args
4773                            );
4774               /* declare we're writing memory */
4775               d->mFx   = Ifx_Write;
4776               d->mAddr = mkexpr(addr);
4777               d->mSize = 10;
4778
4779               /* execute the dirty call. */
4780               stmt( IRStmt_Dirty(d) );
4781               fp_pop();
4782
4783               DIP("fstpt %s\n", dis_buf);
4784               break;
4785            }
4786
4787            default:
4788               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
4789               vex_printf("first_opcode == 0xDB\n");
4790               goto decode_fail;
4791         }
4792
4793      } else {
4794
4795         delta++;
4796         switch (modrm) {
4797
4798            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
4799               r_src = (UInt)modrm - 0xC0;
4800               DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
4801               put_ST_UNCHECKED(0,
4802                                IRExpr_ITE(
4803                                    mk_x86g_calculate_condition(X86CondNB),
4804                                    get_ST(r_src), get_ST(0)) );
4805               break;
4806
4807            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
4808               r_src = (UInt)modrm - 0xC8;
4809               DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
4810               put_ST_UNCHECKED(0,
4811                                IRExpr_ITE(
4812                                    mk_x86g_calculate_condition(X86CondNZ),
4813                                    get_ST(r_src), get_ST(0)) );
4814               break;
4815
4816            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
4817               r_src = (UInt)modrm - 0xD0;
4818               DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
4819               put_ST_UNCHECKED(0,
4820                                IRExpr_ITE(
4821                                    mk_x86g_calculate_condition(X86CondNBE),
4822                                    get_ST(r_src), get_ST(0)) );
4823               break;
4824
4825            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
4826               r_src = (UInt)modrm - 0xD8;
4827               DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
4828               put_ST_UNCHECKED(0,
4829                                IRExpr_ITE(
4830                                    mk_x86g_calculate_condition(X86CondNP),
4831                                    get_ST(r_src), get_ST(0)) );
4832               break;
4833
4834            case 0xE2:
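                   /* FNCLEX -- treated as a no-op here, since FP
                      exceptions (other than stack over/underflow) are
                      not modelled anyway. */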
4835               DIP("fnclex\n");
4836               break;
4837
4838            case 0xE3: {
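                   /* FNINIT */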
4839               /* Uses dirty helper:
4840                     void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
4841               IRDirty* d  = unsafeIRDirty_0_N (
4842                                0/*regparms*/,
4843                                "x86g_dirtyhelper_FINIT",
4844                                &x86g_dirtyhelper_FINIT,
4845                                mkIRExprVec_1(IRExpr_BBPTR())
4846                             );
4847
4848               /* declare we're writing guest state */
4849               d->nFxState = 5;
4850               vex_bzero(&d->fxState, sizeof(d->fxState));
4851
4852               d->fxState[0].fx     = Ifx_Write;
4853               d->fxState[0].offset = OFFB_FTOP;
4854               d->fxState[0].size   = sizeof(UInt);
4855
4856               d->fxState[1].fx     = Ifx_Write;
4857               d->fxState[1].offset = OFFB_FPREGS;
4858               d->fxState[1].size   = 8 * sizeof(ULong);
4859
4860               d->fxState[2].fx     = Ifx_Write;
4861               d->fxState[2].offset = OFFB_FPTAGS;
4862               d->fxState[2].size   = 8 * sizeof(UChar);
4863
4864               d->fxState[3].fx     = Ifx_Write;
4865               d->fxState[3].offset = OFFB_FPROUND;
4866               d->fxState[3].size   = sizeof(UInt);
4867
4868               d->fxState[4].fx     = Ifx_Write;
4869               d->fxState[4].offset = OFFB_FC3210;
4870               d->fxState[4].size   = sizeof(UInt);
4871
4872               stmt( IRStmt_Dirty(d) );
4873
4874               DIP("fninit\n");
4875               break;
4876            }
4877
4878            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
4879               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
4880               break;
4881
4882            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
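                   /* not really right since COMI != UCOMI */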
4883               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
4884               break;
4885
4886            default:
4887               goto decode_fail;
4888         }
4889      }
4890   }
4891
4892   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
4893   else
4894   if (first_opcode == 0xDC) {
4895      if (modrm < 0xC0) {
4896
4897         /* bits 5,4,3 are an opcode extension, and the modRM also
4898            specifies an address. */
4899         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4900         delta += len;
4901
4902         switch (gregOfRM(modrm)) {
4903
4904            case 0: /* FADD double-real */
4905               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
4906               break;
4907
4908            case 1: /* FMUL double-real */
4909               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
4910               break;
4911
4912            case 2: /* FCOM double-real */
4913               DIP("fcoml %s\n", dis_buf);
4914               /* This forces C1 to zero, which isn't right. */
4915               put_C3210(
4916                   binop( Iop_And32,
4917                          binop(Iop_Shl32,
4918                                binop(Iop_CmpF64,
4919                                      get_ST(0),
4920                                      loadLE(Ity_F64,mkexpr(addr))),
4921                                mkU8(8)),
4922                          mkU32(0x4500)
4923                   ));
4924               break;
4925
4926            case 3: /* FCOMP double-real */
4927               DIP("fcompl %s\n", dis_buf);
4928               /* This forces C1 to zero, which isn't right. */
4929               put_C3210(
4930                   binop( Iop_And32,
4931                          binop(Iop_Shl32,
4932                                binop(Iop_CmpF64,
4933                                      get_ST(0),
4934                                      loadLE(Ity_F64,mkexpr(addr))),
4935                                mkU8(8)),
4936                          mkU32(0x4500)
4937                   ));
4938               fp_pop();
4939               break;
4940
4941            case 4: /* FSUB double-real */
4942               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
4943               break;
4944
4945            case 5: /* FSUBR double-real */
4946               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
4947               break;
4948
4949            case 6: /* FDIV double-real */
4950               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
4951               break;
4952
4953            case 7: /* FDIVR double-real */
4954               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
4955               break;
4956
4957            default:
4958               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
4959               vex_printf("first_opcode == 0xDC\n");
4960               goto decode_fail;
4961         }
4962
4963      } else {
4964
4965         delta++;
4966         switch (modrm) {
4967
4968            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
4969               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
4970               break;
4971
4972            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
4973               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
4974               break;
4975
4976            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
4977               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
4978               break;
4979
4980            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
4981               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
4982               break;
4983
4984            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
4985               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
4986               break;
4987
4988            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
4989               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
4990               break;
4991
4992            default:
4993               goto decode_fail;
4994         }
4995
4996      }
4997   }
4998
4999   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
5000   else
5001   if (first_opcode == 0xDD) {
5002
5003      if (modrm < 0xC0) {
5004
5005         /* bits 5,4,3 are an opcode extension, and the modRM also
5006            specifies an address. */
5007         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5008         delta += len;
5009
5010         switch (gregOfRM(modrm)) {
5011
5012            case 0: /* FLD double-real */
5013               DIP("fldl %s\n", dis_buf);
5014               fp_push();
5015               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
5016               break;
5017
5018            case 1: /* FISTTPQ m64 (SSE3) */
5019               DIP("fisttpll %s\n", dis_buf);
5020               storeLE( mkexpr(addr),
5021                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
5022               fp_pop();
5023               break;
5024
5025            case 2: /* FST double-real */
5026               DIP("fstl %s\n", dis_buf);
5027               storeLE(mkexpr(addr), get_ST(0));
5028               break;
5029
5030            case 3: /* FSTP double-real */
5031               DIP("fstpl %s\n", dis_buf);
5032               storeLE(mkexpr(addr), get_ST(0));
5033               fp_pop();
5034               break;
5035
5036            case 4: { /* FRSTOR m108 */
5037               /* Uses dirty helper:
5038                     VexEmNote x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
5039               IRTemp   ew = newTemp(Ity_I32);
5040               IRDirty* d  = unsafeIRDirty_0_N (
5041                                0/*regparms*/,
5042                                "x86g_dirtyhelper_FRSTOR",
5043                                &x86g_dirtyhelper_FRSTOR,
5044                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
5045                             );
5046               d->tmp   = ew;
5047               /* declare we're reading memory */
5048               d->mFx   = Ifx_Read;
5049               d->mAddr = mkexpr(addr);
5050               d->mSize = 108;
5051
5052               /* declare we're writing guest state */
5053               d->nFxState = 5;
5054               vex_bzero(&d->fxState, sizeof(d->fxState));
5055
5056               d->fxState[0].fx     = Ifx_Write;
5057               d->fxState[0].offset = OFFB_FTOP;
5058               d->fxState[0].size   = sizeof(UInt);
5059
5060               d->fxState[1].fx     = Ifx_Write;
5061               d->fxState[1].offset = OFFB_FPREGS;
5062               d->fxState[1].size   = 8 * sizeof(ULong);
5063
5064               d->fxState[2].fx     = Ifx_Write;
5065               d->fxState[2].offset = OFFB_FPTAGS;
5066               d->fxState[2].size   = 8 * sizeof(UChar);
5067
5068               d->fxState[3].fx     = Ifx_Write;
5069               d->fxState[3].offset = OFFB_FPROUND;
5070               d->fxState[3].size   = sizeof(UInt);
5071
5072               d->fxState[4].fx     = Ifx_Write;
5073               d->fxState[4].offset = OFFB_FC3210;
5074               d->fxState[4].size   = sizeof(UInt);
5075
5076               stmt( IRStmt_Dirty(d) );
5077
5078               /* ew contains any emulation warning we may need to
5079                  issue.  If needed, side-exit to the next insn,
5080                  reporting the warning, so that Valgrind's dispatcher
5081                  sees the warning. */
5082               put_emwarn( mkexpr(ew) );
5083               stmt(
5084                  IRStmt_Exit(
5085                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5086                     Ijk_EmWarn,
5087                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
5088                     OFFB_EIP
5089                  )
5090               );
5091
5092               DIP("frstor %s\n", dis_buf);
5093               break;
5094            }
5095
5096            case 6: { /* FNSAVE m108 */
5097               /* Uses dirty helper:
5098                     void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
5099               IRDirty* d = unsafeIRDirty_0_N (
5100                               0/*regparms*/,
5101                               "x86g_dirtyhelper_FSAVE",
5102                               &x86g_dirtyhelper_FSAVE,
5103                               mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
5104                            );
5105               /* declare we're writing memory */
5106               d->mFx   = Ifx_Write;
5107               d->mAddr = mkexpr(addr);
5108               d->mSize = 108;
5109
5110               /* declare we're reading guest state */
5111               d->nFxState = 5;
5112               vex_bzero(&d->fxState, sizeof(d->fxState));
5113
5114               d->fxState[0].fx     = Ifx_Read;
5115               d->fxState[0].offset = OFFB_FTOP;
5116               d->fxState[0].size   = sizeof(UInt);
5117
5118               d->fxState[1].fx     = Ifx_Read;
5119               d->fxState[1].offset = OFFB_FPREGS;
5120               d->fxState[1].size   = 8 * sizeof(ULong);
5121
5122               d->fxState[2].fx     = Ifx_Read;
5123               d->fxState[2].offset = OFFB_FPTAGS;
5124               d->fxState[2].size   = 8 * sizeof(UChar);
5125
5126               d->fxState[3].fx     = Ifx_Read;
5127               d->fxState[3].offset = OFFB_FPROUND;
5128               d->fxState[3].size   = sizeof(UInt);
5129
5130               d->fxState[4].fx     = Ifx_Read;
5131               d->fxState[4].offset = OFFB_FC3210;
5132               d->fxState[4].size   = sizeof(UInt);
5133
5134               stmt( IRStmt_Dirty(d) );
5135
5136               DIP("fnsave %s\n", dis_buf);
5137               break;
5138            }
5139
5140            case 7: { /* FNSTSW m16 */
5141               IRExpr* sw = get_FPU_sw();
5142               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
5143               storeLE( mkexpr(addr), sw );
5144               DIP("fnstsw %s\n", dis_buf);
5145               break;
5146            }
5147
5148            default:
5149               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5150               vex_printf("first_opcode == 0xDD\n");
5151               goto decode_fail;
5152         }
5153      } else {
5154         delta++;
5155         switch (modrm) {
5156
5157            case 0xC0 ... 0xC7: /* FFREE %st(?) */
5158               r_dst = (UInt)modrm - 0xC0;
5159               DIP("ffree %%st(%u)\n", r_dst);
5160               put_ST_TAG ( r_dst, mkU8(0) );
5161               break;
5162
5163            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
5164               r_dst = (UInt)modrm - 0xD0;
5165               DIP("fst %%st(0),%%st(%u)\n", r_dst);
5166               /* P4 manual says: "If the destination operand is a
5167                  non-empty register, the invalid-operation exception
5168                  is not generated."  Hence put_ST_UNCHECKED. */
5169               put_ST_UNCHECKED(r_dst, get_ST(0));
5170               break;
5171
5172            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
5173               r_dst = (UInt)modrm - 0xD8;
5174               DIP("fstp %%st(0),%%st(%u)\n", r_dst);
5175               /* P4 manual says: "If the destination operand is a
5176                  non-empty register, the invalid-operation exception
5177                  is not generated."  Hence put_ST_UNCHECKED. */
5178               put_ST_UNCHECKED(r_dst, get_ST(0));
5179               fp_pop();
5180               break;
5181
5182            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
5183               r_dst = (UInt)modrm - 0xE0;
5184               DIP("fucom %%st(0),%%st(%u)\n", r_dst);
5185               /* This forces C1 to zero, which isn't right. */
5186               put_C3210(
5187                   binop( Iop_And32,
5188                          binop(Iop_Shl32,
5189                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5190                                mkU8(8)),
5191                          mkU32(0x4500)
5192                   ));
5193               break;
5194
5195            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
5196               r_dst = (UInt)modrm - 0xE8;
5197               DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
5198               /* This forces C1 to zero, which isn't right. */
5199               put_C3210(
5200                   binop( Iop_And32,
5201                          binop(Iop_Shl32,
5202                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5203                                mkU8(8)),
5204                          mkU32(0x4500)
5205                   ));
5206               fp_pop();
5207               break;
5208
5209            default:
5210               goto decode_fail;
5211         }
5212      }
5213   }
5214
5215   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
5216   else
5217   if (first_opcode == 0xDE) {
5218
5219      if (modrm < 0xC0) {
5220
5221         /* bits 5,4,3 are an opcode extension, and the modRM also
5222            specifies an address. */
5223         IROp   fop;
5224         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5225         delta += len;
5226
5227         switch (gregOfRM(modrm)) {
5228
5229            case 0: /* FIADD m16int */ /* ST(0) += m16int */
5230               DIP("fiaddw %s\n", dis_buf);
5231               fop = Iop_AddF64;
5232               goto do_fop_m16;
5233
5234            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
5235               DIP("fimulw %s\n", dis_buf);
5236               fop = Iop_MulF64;
5237               goto do_fop_m16;
5238
5239            case 2: /* FICOM m16int */
5240               DIP("ficomw %s\n", dis_buf);
5241               /* This forces C1 to zero, which isn't right. */
5242               put_C3210(
5243                   binop( Iop_And32,
5244                          binop(Iop_Shl32,
5245                                binop(Iop_CmpF64,
5246                                      get_ST(0),
5247                                      unop(Iop_I32StoF64,
5248                                         unop(Iop_16Sto32,
5249                                           loadLE(Ity_I16,mkexpr(addr))))),
5250                                mkU8(8)),
5251                          mkU32(0x4500)
5252                   ));
5253               break;
5254
5255            case 3: /* FICOMP m16int */
5256               DIP("ficompw %s\n", dis_buf);
5257               /* This forces C1 to zero, which isn't right. */
5258               put_C3210(
5259                   binop( Iop_And32,
5260                          binop(Iop_Shl32,
5261                                binop(Iop_CmpF64,
5262                                      get_ST(0),
5263                                      unop(Iop_I32StoF64,
5264                                         unop(Iop_16Sto32,
5265                                              loadLE(Ity_I16,mkexpr(addr))))),
5266                                mkU8(8)),
5267                          mkU32(0x4500)
5268                   ));
5269               fp_pop();
5270               break;
5271
5272            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
5273               DIP("fisubw %s\n", dis_buf);
5274               fop = Iop_SubF64;
5275               goto do_fop_m16;
5276
5277            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
5278               DIP("fisubrw %s\n", dis_buf);
5279               fop = Iop_SubF64;
5280               goto do_foprev_m16;
5281
5282            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
5283               DIP("fidivw %s\n", dis_buf);
5284               fop = Iop_DivF64;
5285               goto do_fop_m16;
5286
5287            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
5288               DIP("fidivrw %s\n", dis_buf);
5289               fop = Iop_DivF64;
5290               goto do_foprev_m16;
5291
5292            do_fop_m16:
5293               put_ST_UNCHECKED(0,
5294                  triop(fop,
5295                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5296                        get_ST(0),
5297                        unop(Iop_I32StoF64,
5298                             unop(Iop_16Sto32,
5299                                  loadLE(Ity_I16, mkexpr(addr))))));
5300               break;
5301
5302            do_foprev_m16:
5303               put_ST_UNCHECKED(0,
5304                  triop(fop,
5305                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5306                        unop(Iop_I32StoF64,
5307                             unop(Iop_16Sto32,
5308                                  loadLE(Ity_I16, mkexpr(addr)))),
5309                        get_ST(0)));
5310               break;
5311
5312            default:
5313               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5314               vex_printf("first_opcode == 0xDE\n");
5315               goto decode_fail;
5316         }
5317
5318      } else {
5319
5320         delta++;
5321         switch (modrm) {
5322
5323            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
5324               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
5325               break;
5326
5327            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
5328               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
5329               break;
5330
5331            case 0xD9: /* FCOMPP %st(0),%st(1) */
5332               DIP("fcompp %%st(0),%%st(1)\n");
5333               /* This forces C1 to zero, which isn't right. */
5334               put_C3210(
5335                   binop( Iop_And32,
5336                          binop(Iop_Shl32,
5337                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
5338                                mkU8(8)),
5339                          mkU32(0x4500)
5340                   ));
5341               fp_pop();
5342               fp_pop();
5343               break;
5344
5345            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
5346               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
5347               break;
5348
5349            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
5350               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
5351               break;
5352
5353            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
5354               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
5355               break;
5356
5357            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
5358               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
5359               break;
5360
5361            default:
5362               goto decode_fail;
5363         }
5364
5365      }
5366   }
5367
5368   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
5369   else
5370   if (first_opcode == 0xDF) {
5371
5372      if (modrm < 0xC0) {
5373
5374         /* bits 5,4,3 are an opcode extension, and the modRM also
5375            specifies an address. */
5376         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5377         delta += len;
5378
5379         switch (gregOfRM(modrm)) {
5380
5381            case 0: /* FILD m16int */
5382               DIP("fildw %s\n", dis_buf);
5383               fp_push();
5384               put_ST(0, unop(Iop_I32StoF64,
5385                              unop(Iop_16Sto32,
5386                                   loadLE(Ity_I16, mkexpr(addr)))));
5387               break;
5388
5389            case 1: /* FISTTPS m16 (SSE3) */
5390               DIP("fisttps %s\n", dis_buf);
5391               storeLE( mkexpr(addr),
5392                        binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
5393               fp_pop();
5394               break;
5395
5396            case 2: /* FIST m16 */
5397               DIP("fists %s\n", dis_buf);
5398               storeLE( mkexpr(addr),
5399                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5400               break;
5401
5402            case 3: /* FISTP m16 */
5403               DIP("fistps %s\n", dis_buf);
5404               storeLE( mkexpr(addr),
5405                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5406               fp_pop();
5407               break;
5408
5409            case 5: /* FILD m64 */
5410               DIP("fildll %s\n", dis_buf);
5411               fp_push();
5412               put_ST(0, binop(Iop_I64StoF64,
5413                               get_roundingmode(),
5414                               loadLE(Ity_I64, mkexpr(addr))));
5415               break;
5416
5417            case 7: /* FISTP m64 */
5418               DIP("fistpll %s\n", dis_buf);
5419               storeLE( mkexpr(addr),
5420                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
5421               fp_pop();
5422               break;
5423
5424            default:
5425               vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
5426               vex_printf("first_opcode == 0xDF\n");
5427               goto decode_fail;
5428         }
5429
5430      } else {
5431
5432         delta++;
5433         switch (modrm) {
5434
5435            case 0xC0: /* FFREEP %st(0) */
5436               DIP("ffreep %%st(%d)\n", 0);
5437               put_ST_TAG ( 0, mkU8(0) );
5438               fp_pop();
5439               break;
5440
5441            case 0xE0: /* FNSTSW %ax */
5442               DIP("fnstsw %%ax\n");
5443               /* Get the FPU status word value and dump it in %AX. */
5444               if (0) {
5445                  /* The obvious thing to do is simply dump the 16-bit
5446                     status word value in %AX.  However, due to a
5447                     limitation in Memcheck's origin tracking
5448                     machinery, this causes Memcheck not to track the
5449                     origin of any undefinedness into %AH (only into
5450                     %AL/%AX/%EAX), which means origins are lost in
5451                     the sequence "fnstsw %ax; test $M,%ah; jcond .." */
5452                  putIReg(2, R_EAX, get_FPU_sw());
5453               } else {
5454                  /* So a somewhat lame kludge is to make it very
5455                     clear to Memcheck that the value is written to
5456                     both %AH and %AL.  This generates marginally
5457                     worse code, but I don't think it matters much. */
5458                  IRTemp t16 = newTemp(Ity_I16);
5459                  assign(t16, get_FPU_sw());
5460                  putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
5461                  putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
5462               }
5463               break;
5464
5465            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
5466               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
5467               break;
5468
5469            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
5470               /* not really right since COMIP != UCOMIP */
5471               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
5472               break;
5473
5474            default:
5475               goto decode_fail;
5476         }
5477      }
5478
5479   }
5480
5481   else
5482   vpanic("dis_FPU(x86): invalid primary opcode");
5483
5484   *decode_ok = True;
5485   return delta;
5486
5487  decode_fail:
5488   *decode_ok = False;
5489   return delta;
5490}
5491
5492
5493/*------------------------------------------------------------*/
5494/*---                                                      ---*/
5495/*--- MMX INSTRUCTIONS                                     ---*/
5496/*---                                                      ---*/
5497/*------------------------------------------------------------*/
5498
5499/* Effect of MMX insns on x87 FPU state (table 11-2 of
5500   IA32 arch manual, volume 3):
5501
5502   Read from, or write to MMX register (viz, any insn except EMMS):
5503   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
5504   * FP stack pointer set to zero
5505
5506   EMMS:
5507   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
5508   * FP stack pointer set to zero
5509*/
5510
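    /* Implements the first case above: mark every FP/MMX register tag
       as in-use and reset the FP stack pointer. */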
5511static void do_MMX_preamble ( void )
5512{
5513   Int         i;
5514   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5515   IRExpr*     zero  = mkU32(0);
5516   IRExpr*     tag1  = mkU8(1);
5517   put_ftop(zero);
5518   for (i = 0; i < 8; i++)
5519      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
5520}
5521
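    /* Implements the EMMS case above: mark every FP/MMX register tag
       as empty and reset the FP stack pointer. */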
5522static void do_EMMS_preamble ( void )
5523{
5524   Int         i;
5525   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5526   IRExpr*     zero  = mkU32(0);
5527   IRExpr*     tag0  = mkU8(0);
5528   put_ftop(zero);
5529   for (i = 0; i < 8; i++)
5530      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
5531}
5532
5533
5534static IRExpr* getMMXReg ( UInt archreg )
5535{
5536   vassert(archreg < 8);
5537   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
5538}
5539
5540
5541static void putMMXReg ( UInt archreg, IRExpr* e )
5542{
5543   vassert(archreg < 8);
5544   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
5545   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
5546}
5547
5548
5549/* Helper for non-shift MMX insns.  Note this is incomplete in the
5550   sense that it does not first call do_MMX_preamble() -- that is the
5551   responsibility of its caller. */
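    /* For the cases below that set 'eLeft', the E (reg-or-mem) operand
       goes on the left of the IR binop; this matters for the pack and
       interleave operations, which are not commutative. */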
5552
5553static
5554UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
5555                               Int    delta,
5556                               UChar  opc,
5557                               const HChar* name,
5558                               Bool   show_granularity )
5559{
5560   HChar   dis_buf[50];
5561   UChar   modrm = getIByte(delta);
5562   Bool    isReg = epartIsReg(modrm);
5563   IRExpr* argL  = NULL;
5564   IRExpr* argR  = NULL;
5565   IRExpr* argG  = NULL;
5566   IRExpr* argE  = NULL;
5567   IRTemp  res   = newTemp(Ity_I64);
5568
5569   Bool    invG  = False;
5570   IROp    op    = Iop_INVALID;
5571   void*   hAddr = NULL;
5572   Bool    eLeft = False;
5573   const HChar*  hName = NULL;
5574
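       /* Bind the address and name of a clean helper; used only for the
          couple of ops (PMADDWD, PSADBW) that have no single IROp. */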
5575#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
5576
5577   switch (opc) {
5578      /* Original MMX ones */
5579      case 0xFC: op = Iop_Add8x8; break;
5580      case 0xFD: op = Iop_Add16x4; break;
5581      case 0xFE: op = Iop_Add32x2; break;
5582
5583      case 0xEC: op = Iop_QAdd8Sx8; break;
5584      case 0xED: op = Iop_QAdd16Sx4; break;
5585
5586      case 0xDC: op = Iop_QAdd8Ux8; break;
5587      case 0xDD: op = Iop_QAdd16Ux4; break;
5588
5589      case 0xF8: op = Iop_Sub8x8;  break;
5590      case 0xF9: op = Iop_Sub16x4; break;
5591      case 0xFA: op = Iop_Sub32x2; break;
5592
5593      case 0xE8: op = Iop_QSub8Sx8; break;
5594      case 0xE9: op = Iop_QSub16Sx4; break;
5595
5596      case 0xD8: op = Iop_QSub8Ux8; break;
5597      case 0xD9: op = Iop_QSub16Ux4; break;
5598
5599      case 0xE5: op = Iop_MulHi16Sx4; break;
5600      case 0xD5: op = Iop_Mul16x4; break;
5601      case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
5602
5603      case 0x74: op = Iop_CmpEQ8x8; break;
5604      case 0x75: op = Iop_CmpEQ16x4; break;
5605      case 0x76: op = Iop_CmpEQ32x2; break;
5606
5607      case 0x64: op = Iop_CmpGT8Sx8; break;
5608      case 0x65: op = Iop_CmpGT16Sx4; break;
5609      case 0x66: op = Iop_CmpGT32Sx2; break;
5610
5611      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
5612      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
5613      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
5614
5615      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
5616      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
5617      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
5618
5619      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
5620      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
5621      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
5622
5623      case 0xDB: op = Iop_And64; break;
5624      case 0xDF: op = Iop_And64; invG = True; break;
5625      case 0xEB: op = Iop_Or64; break;
5626      case 0xEF: /* Possibly do better here if argL and argR are the
5627                    same reg */
5628                 op = Iop_Xor64; break;
5629
5630      /* Introduced in SSE1 */
5631      case 0xE0: op = Iop_Avg8Ux8;    break;
5632      case 0xE3: op = Iop_Avg16Ux4;   break;
5633      case 0xEE: op = Iop_Max16Sx4;   break;
5634      case 0xDE: op = Iop_Max8Ux8;    break;
5635      case 0xEA: op = Iop_Min16Sx4;   break;
5636      case 0xDA: op = Iop_Min8Ux8;    break;
5637      case 0xE4: op = Iop_MulHi16Ux4; break;
5638      case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
5639
5640      /* Introduced in SSE2 */
5641      case 0xD4: op = Iop_Add64; break;
5642      case 0xFB: op = Iop_Sub64; break;
5643
5644      default:
5645         vex_printf("\n0x%x\n", opc);
5646         vpanic("dis_MMXop_regmem_to_reg");
5647   }
5648
5649#  undef XXX
5650
5651   argG = getMMXReg(gregOfRM(modrm));
5652   if (invG)
5653      argG = unop(Iop_Not64, argG);
5654
5655   if (isReg) {
5656      delta++;
5657      argE = getMMXReg(eregOfRM(modrm));
5658   } else {
5659      Int    len;
5660      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5661      delta += len;
5662      argE = loadLE(Ity_I64, mkexpr(addr));
5663   }
5664
5665   if (eLeft) {
5666      argL = argE;
5667      argR = argG;
5668   } else {
5669      argL = argG;
5670      argR = argE;
5671   }
5672
5673   if (op != Iop_INVALID) {
5674      vassert(hName == NULL);
5675      vassert(hAddr == NULL);
5676      assign(res, binop(op, argL, argR));
5677   } else {
5678      vassert(hName != NULL);
5679      vassert(hAddr != NULL);
5680      assign( res,
5681              mkIRExprCCall(
5682                 Ity_I64,
5683                 0/*regparms*/, hName, hAddr,
5684                 mkIRExprVec_2( argL, argR )
5685              )
5686            );
5687   }
5688
5689   putMMXReg( gregOfRM(modrm), mkexpr(res) );
5690
5691   DIP("%s%s %s, %s\n",
5692       name, show_granularity ? nameMMXGran(opc & 3) : "",
5693       ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
5694       nameMMXReg(gregOfRM(modrm)) );
5695
5696   return delta;
5697}
5698
5699
5700/* Vector by scalar shift of G by the amount specified at the bottom
5701   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
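    /* Shift counts >= the element size give zero for the logical
       shifts and replicate the sign bit for the arithmetic shifts;
       hence the ITE tests against 'size' below. */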
5702
5703static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
5704                                 const HChar* opname, IROp op )
5705{
5706   HChar   dis_buf[50];
5707   Int     alen, size;
5708   IRTemp  addr;
5709   Bool    shl, shr, sar;
5710   UChar   rm   = getIByte(delta);
5711   IRTemp  g0   = newTemp(Ity_I64);
5712   IRTemp  g1   = newTemp(Ity_I64);
5713   IRTemp  amt  = newTemp(Ity_I32);
5714   IRTemp  amt8 = newTemp(Ity_I8);
5715
5716   if (epartIsReg(rm)) {
5717      assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
5718      DIP("%s %s,%s\n", opname,
5719                        nameMMXReg(eregOfRM(rm)),
5720                        nameMMXReg(gregOfRM(rm)) );
5721      delta++;
5722   } else {
5723      addr = disAMode ( &alen, sorb, delta, dis_buf );
5724      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
5725      DIP("%s %s,%s\n", opname,
5726                        dis_buf,
5727                        nameMMXReg(gregOfRM(rm)) );
5728      delta += alen;
5729   }
5730   assign( g0,   getMMXReg(gregOfRM(rm)) );
5731   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
5732
5733   shl = shr = sar = False;
5734   size = 0;
5735   switch (op) {
5736      case Iop_ShlN16x4: shl = True; size = 16; break;
5737      case Iop_ShlN32x2: shl = True; size = 32; break;
5738      case Iop_Shl64:    shl = True; size = 64; break;
5739      case Iop_ShrN16x4: shr = True; size = 16; break;
5740      case Iop_ShrN32x2: shr = True; size = 32; break;
5741      case Iop_Shr64:    shr = True; size = 64; break;
5742      case Iop_SarN16x4: sar = True; size = 16; break;
5743      case Iop_SarN32x2: sar = True; size = 32; break;
5744      default: vassert(0);
5745   }
5746
5747   if (shl || shr) {
5748     assign(
5749        g1,
5750        IRExpr_ITE(
5751           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
5752           binop(op, mkexpr(g0), mkexpr(amt8)),
5753           mkU64(0)
5754        )
5755     );
5756   } else
5757   if (sar) {
5758     assign(
5759        g1,
5760        IRExpr_ITE(
5761           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
5762           binop(op, mkexpr(g0), mkexpr(amt8)),
5763           binop(op, mkexpr(g0), mkU8(size-1))
5764        )
5765     );
5766   } else {
5767      /*NOTREACHED*/
5768      vassert(0);
5769   }
5770
5771   putMMXReg( gregOfRM(rm), mkexpr(g1) );
5772   return delta;
5773}
5774
5775
5776/* Vector by scalar shift of E by an immediate byte.  This is a
5777   straight copy of dis_SSE_shiftE_imm. */
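    /* The same out-of-range-shift rule applies as in
       dis_MMX_shiftG_byE, but since the amount is an immediate it is
       resolved at translation time rather than with an ITE. */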
5778
5779static
5780UInt dis_MMX_shiftE_imm ( Int delta, const HChar* opname, IROp op )
5781{
5782   Bool    shl, shr, sar;
5783   UChar   rm   = getIByte(delta);
5784   IRTemp  e0   = newTemp(Ity_I64);
5785   IRTemp  e1   = newTemp(Ity_I64);
5786   UChar   amt, size;
5787   vassert(epartIsReg(rm));
5788   vassert(gregOfRM(rm) == 2
5789           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
5790   amt = getIByte(delta+1);
5791   delta += 2;
5792   DIP("%s $%d,%s\n", opname,
5793                      (Int)amt,
5794                      nameMMXReg(eregOfRM(rm)) );
5795
5796   assign( e0, getMMXReg(eregOfRM(rm)) );
5797
5798   shl = shr = sar = False;
5799   size = 0;
5800   switch (op) {
5801      case Iop_ShlN16x4: shl = True; size = 16; break;
5802      case Iop_ShlN32x2: shl = True; size = 32; break;
5803      case Iop_Shl64:    shl = True; size = 64; break;
5804      case Iop_SarN16x4: sar = True; size = 16; break;
5805      case Iop_SarN32x2: sar = True; size = 32; break;
5806      case Iop_ShrN16x4: shr = True; size = 16; break;
5807      case Iop_ShrN32x2: shr = True; size = 32; break;
5808      case Iop_Shr64:    shr = True; size = 64; break;
5809      default: vassert(0);
5810   }
5811
5812   if (shl || shr) {
5813      assign( e1, amt >= size
5814                     ? mkU64(0)
5815                     : binop(op, mkexpr(e0), mkU8(amt))
5816      );
5817   } else
5818   if (sar) {
5819      assign( e1, amt >= size
5820                     ? binop(op, mkexpr(e0), mkU8(size-1))
5821                     : binop(op, mkexpr(e0), mkU8(amt))
5822      );
5823   } else {
5824      /*NOTREACHED*/
5825      vassert(0);
5826   }
5827
5828   putMMXReg( eregOfRM(rm), mkexpr(e1) );
5829   return delta;
5830}
5831
5832
5833/* Completely handle all MMX instructions except emms. */
5834
5835static
5836UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
5837{
5838   Int   len;
5839   UChar modrm;
5840   HChar dis_buf[50];
5841   UChar opc = getIByte(delta);
5842   delta++;
5843
5844   /* dis_MMX handles all insns except emms. */
5845   do_MMX_preamble();
5846
5847   switch (opc) {
5848
5849      case 0x6E:
5850         /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
5851         if (sz != 4)
5852            goto mmx_decode_failure;
5853         modrm = getIByte(delta);
5854         if (epartIsReg(modrm)) {
5855            delta++;
5856            putMMXReg(
5857               gregOfRM(modrm),
5858               binop( Iop_32HLto64,
5859                      mkU32(0),
5860                      getIReg(4, eregOfRM(modrm)) ) );
5861            DIP("movd %s, %s\n",
5862                nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5863         } else {
5864            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5865            delta += len;
5866            putMMXReg(
5867               gregOfRM(modrm),
5868               binop( Iop_32HLto64,
5869                      mkU32(0),
5870                      loadLE(Ity_I32, mkexpr(addr)) ) );
5871            DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
5872         }
5873         break;
5874
5875      case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
5876         if (sz != 4)
5877            goto mmx_decode_failure;
5878         modrm = getIByte(delta);
5879         if (epartIsReg(modrm)) {
5880            delta++;
5881            putIReg( 4, eregOfRM(modrm),
5882                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5883            DIP("movd %s, %s\n",
5884                nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
5885         } else {
5886            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5887            delta += len;
5888            storeLE( mkexpr(addr),
5889                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5890            DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
5891         }
5892         break;
5893
5894      case 0x6F:
5895         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
5896         if (sz != 4)
5897            goto mmx_decode_failure;
5898         modrm = getIByte(delta);
5899         if (epartIsReg(modrm)) {
5900            delta++;
5901            putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
5902            DIP("movq %s, %s\n",
5903                nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5904         } else {
5905            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5906            delta += len;
5907            putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
5908            DIP("movq %s, %s\n",
5909                dis_buf, nameMMXReg(gregOfRM(modrm)));
5910         }
5911         break;
5912
5913      case 0x7F:
5914         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
5915         if (sz != 4)
5916            goto mmx_decode_failure;
5917         modrm = getIByte(delta);
5918         if (epartIsReg(modrm)) {
5919            delta++;
5920            putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
5921            DIP("movq %s, %s\n",
5922                nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
5923         } else {
5924            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5925            delta += len;
5926            storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
5927            DIP("mov(nt)q %s, %s\n",
5928                nameMMXReg(gregOfRM(modrm)), dis_buf);
5929         }
5930         break;
5931
5932      case 0xFC:
5933      case 0xFD:
5934      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
5935         if (sz != 4)
5936            goto mmx_decode_failure;
5937         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
5938         break;
5939
5940      case 0xEC:
5941      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
5942         if (sz != 4)
5943            goto mmx_decode_failure;
5944         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
5945         break;
5946
5947      case 0xDC:
5948      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5949         if (sz != 4)
5950            goto mmx_decode_failure;
5951         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
5952         break;
5953
5954      case 0xF8:
5955      case 0xF9:
5956      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
5957         if (sz != 4)
5958            goto mmx_decode_failure;
5959         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
5960         break;
5961
5962      case 0xE8:
5963      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
5964         if (sz != 4)
5965            goto mmx_decode_failure;
5966         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
5967         break;
5968
5969      case 0xD8:
5970      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5971         if (sz != 4)
5972            goto mmx_decode_failure;
5973         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
5974         break;
5975
5976      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
5977         if (sz != 4)
5978            goto mmx_decode_failure;
5979         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
5980         break;
5981
5982      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
5983         if (sz != 4)
5984            goto mmx_decode_failure;
5985         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
5986         break;
5987
5988      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
5989         vassert(sz == 4);
5990         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
5991         break;
5992
5993      case 0x74:
5994      case 0x75:
5995      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
5996         if (sz != 4)
5997            goto mmx_decode_failure;
5998         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
5999         break;
6000
6001      case 0x64:
6002      case 0x65:
6003      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
6004         if (sz != 4)
6005            goto mmx_decode_failure;
6006         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
6007         break;
6008
6009      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
6010         if (sz != 4)
6011            goto mmx_decode_failure;
6012         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
6013         break;
6014
6015      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
6016         if (sz != 4)
6017            goto mmx_decode_failure;
6018         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
6019         break;
6020
6021      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
6022         if (sz != 4)
6023            goto mmx_decode_failure;
6024         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
6025         break;
6026
6027      case 0x68:
6028      case 0x69:
6029      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
6030         if (sz != 4)
6031            goto mmx_decode_failure;
6032         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
6033         break;
6034
6035      case 0x60:
6036      case 0x61:
6037      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
6038         if (sz != 4)
6039            goto mmx_decode_failure;
6040         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
6041         break;
6042
6043      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
6044         if (sz != 4)
6045            goto mmx_decode_failure;
6046         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
6047         break;
6048
6049      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
6050         if (sz != 4)
6051            goto mmx_decode_failure;
6052         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
6053         break;
6054
6055      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
6056         if (sz != 4)
6057            goto mmx_decode_failure;
6058         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
6059         break;
6060
6061      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
6062         if (sz != 4)
6063            goto mmx_decode_failure;
6064         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
6065         break;
6066
6067#     define SHIFT_BY_REG(_name,_op)                                 \
6068                delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
6069                break;
6070
6071      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
6072      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
6073      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
6074      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
6075
6076      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
6077      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
6078      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
6079      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
6080
6081      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
6082      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
6083      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
6084
6085#     undef SHIFT_BY_REG
6086
6087      case 0x71:
6088      case 0x72:
6089      case 0x73: {
6090         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
6091         UChar byte2, subopc;
6092         if (sz != 4)
6093            goto mmx_decode_failure;
6094         byte2  = getIByte(delta);           /* amode / sub-opcode */
6095         subopc = toUChar( (byte2 >> 3) & 7 );
6096
6097#        define SHIFT_BY_IMM(_name,_op)                         \
6098             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
6099             } while (0)
6100
6101              if (subopc == 2 /*SRL*/ && opc == 0x71)
6102                 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
6103         else if (subopc == 2 /*SRL*/ && opc == 0x72)
6104                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
6105         else if (subopc == 2 /*SRL*/ && opc == 0x73)
6106                 SHIFT_BY_IMM("psrlq", Iop_Shr64);
6107
6108         else if (subopc == 4 /*SAR*/ && opc == 0x71)
6109                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
6110         else if (subopc == 4 /*SAR*/ && opc == 0x72)
6111                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);
6112
6113         else if (subopc == 6 /*SHL*/ && opc == 0x71)
6114                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
6115         else if (subopc == 6 /*SHL*/ && opc == 0x72)
6116                 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
6117         else if (subopc == 6 /*SHL*/ && opc == 0x73)
6118                 SHIFT_BY_IMM("psllq", Iop_Shl64);
6119
6120         else goto mmx_decode_failure;
6121
6122#        undef SHIFT_BY_IMM
6123         break;
6124      }
6125
6126      case 0xF7: {
6127         IRTemp addr    = newTemp(Ity_I32);
6128         IRTemp regD    = newTemp(Ity_I64);
6129         IRTemp regM    = newTemp(Ity_I64);
6130         IRTemp mask    = newTemp(Ity_I64);
6131         IRTemp olddata = newTemp(Ity_I64);
6132         IRTemp newdata = newTemp(Ity_I64);
6133
6134         modrm = getIByte(delta);
6135         if (sz != 4 || (!epartIsReg(modrm)))
6136            goto mmx_decode_failure;
6137         delta++;
6138
6139         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
6140         assign( regM, getMMXReg( eregOfRM(modrm) ));
6141         assign( regD, getMMXReg( gregOfRM(modrm) ));
6142         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
6143         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
6144         assign( newdata,
6145                 binop(Iop_Or64,
6146                       binop(Iop_And64,
6147                             mkexpr(regD),
6148                             mkexpr(mask) ),
6149                       binop(Iop_And64,
6150                             mkexpr(olddata),
6151                             unop(Iop_Not64, mkexpr(mask)))) );
6152         storeLE( mkexpr(addr), mkexpr(newdata) );
6153         DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
6154                                 nameMMXReg( gregOfRM(modrm) ) );
6155         break;
6156      }
6157
6158      /* --- MMX decode failure --- */
6159      default:
6160      mmx_decode_failure:
6161         *decode_ok = False;
6162         return delta; /* ignored */
6163
6164   }
6165
6166   *decode_ok = True;
6167   return delta;
6168}
6169
6170
6171/*------------------------------------------------------------*/
6172/*--- More misc arithmetic and other obscure insns.        ---*/
6173/*------------------------------------------------------------*/
6174
6175/* Double length left and right shifts.  Apparently only required in
6176   v-size (no b- variant). */
6177static
6178UInt dis_SHLRD_Gv_Ev ( UChar sorb,
6179                       Int delta, UChar modrm,
6180                       Int sz,
6181                       IRExpr* shift_amt,
6182                       Bool amt_is_literal,
6183                       const HChar* shift_amt_txt,
6184                       Bool left_shift )
6185{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.  And delta on entry points at the modrm byte. */
6188   Int len;
6189   HChar dis_buf[50];
6190
6191   IRType ty       = szToITy(sz);
6192   IRTemp gsrc     = newTemp(ty);
6193   IRTemp esrc     = newTemp(ty);
6194   IRTemp addr     = IRTemp_INVALID;
6195   IRTemp tmpSH    = newTemp(Ity_I8);
6196   IRTemp tmpL     = IRTemp_INVALID;
6197   IRTemp tmpRes   = IRTemp_INVALID;
6198   IRTemp tmpSubSh = IRTemp_INVALID;
6199   IROp   mkpair;
6200   IROp   getres;
6201   IROp   shift;
6202   IRExpr* mask = NULL;
6203
6204   vassert(sz == 2 || sz == 4);
6205
6206   /* The E-part is the destination; this is shifted.  The G-part
6207      supplies bits to be shifted into the E-part, but is not
6208      changed.
6209
6210      If shifting left, form a double-length word with E at the top
6211      and G at the bottom, and shift this left.  The result is then in
6212      the high part.
6213
6214      If shifting right, form a double-length word with G at the top
6215      and E at the bottom, and shift this right.  The result is then
6216      at the bottom.  */
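
   /* Illustration: for "shldl $8, %ebx, %eax" with %eax (E) =
      0x11223344 and %ebx (G) = 0xAABBCCDD, the double-length word E:G
      is 0x11223344AABBCCDD; shifting it left by 8 gives
      0x223344AABBCCDD00, whose high 32 bits, 0x223344AA, become the
      new value of %eax. */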
6217
6218   /* Fetch the operands. */
6219
6220   assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
6221
6222   if (epartIsReg(modrm)) {
6223      delta++;
6224      assign( esrc, getIReg(sz, eregOfRM(modrm)) );
6225      DIP("sh%cd%c %s, %s, %s\n",
6226          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6227          shift_amt_txt,
6228          nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
6229   } else {
6230      addr = disAMode ( &len, sorb, delta, dis_buf );
6231      delta += len;
6232      assign( esrc, loadLE(ty, mkexpr(addr)) );
6233      DIP("sh%cd%c %s, %s, %s\n",
6234          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6235          shift_amt_txt,
6236          nameIReg(sz, gregOfRM(modrm)), dis_buf);
6237   }
6238
6239   /* Round up the relevant primops. */
6240
6241   if (sz == 4) {
6242      tmpL     = newTemp(Ity_I64);
6243      tmpRes   = newTemp(Ity_I32);
6244      tmpSubSh = newTemp(Ity_I32);
6245      mkpair   = Iop_32HLto64;
6246      getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
6247      shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
6248      mask     = mkU8(31);
6249   } else {
6250      /* sz == 2 */
6251      tmpL     = newTemp(Ity_I32);
6252      tmpRes   = newTemp(Ity_I16);
6253      tmpSubSh = newTemp(Ity_I16);
6254      mkpair   = Iop_16HLto32;
6255      getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
6256      shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
6257      mask     = mkU8(15);
6258   }
6259
6260   /* Do the shift, calculate the subshift value, and set
6261      the flag thunk. */
6262
6263   assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
6264
6265   if (left_shift)
6266      assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
6267   else
6268      assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
6269
6270   assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
6271   assign( tmpSubSh,
6272           unop(getres,
6273                binop(shift,
6274                      mkexpr(tmpL),
6275                      binop(Iop_And8,
6276                            binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
6277                            mask))) );
6278
6279   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
6280                              tmpRes, tmpSubSh, ty, tmpSH );
6281
6282   /* Put result back. */
6283
6284   if (epartIsReg(modrm)) {
6285      putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
6286   } else {
6287      storeLE( mkexpr(addr), mkexpr(tmpRes) );
6288   }
6289
6290   if (amt_is_literal) delta++;
6291   return delta;
6292}
6293
6294
6295/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
6296   required. */
6297
6298typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
6299
6300static const HChar* nameBtOp ( BtOp op )
6301{
6302   switch (op) {
6303      case BtOpNone:  return "";
6304      case BtOpSet:   return "s";
6305      case BtOpReset: return "r";
6306      case BtOpComp:  return "c";
6307      default: vpanic("nameBtOp(x86)");
6308   }
6309}
6310
6311
6312static
6313UInt dis_bt_G_E ( const VexAbiInfo* vbi,
6314                  UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
6315{
6316   HChar  dis_buf[50];
6317   UChar  modrm;
6318   Int    len;
6319   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
6320          t_addr1, t_esp, t_mask, t_new;
6321
6322   vassert(sz == 2 || sz == 4);
6323
6324   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
6325             = t_addr0 = t_addr1 = t_esp
6326             = t_mask = t_new = IRTemp_INVALID;
6327
6328   t_fetched = newTemp(Ity_I8);
6329   t_new     = newTemp(Ity_I8);
6330   t_bitno0  = newTemp(Ity_I32);
6331   t_bitno1  = newTemp(Ity_I32);
6332   t_bitno2  = newTemp(Ity_I8);
6333   t_addr1   = newTemp(Ity_I32);
6334   modrm     = getIByte(delta);
6335
6336   assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
6337
6338   if (epartIsReg(modrm)) {
6339      delta++;
6340      /* Get it onto the client's stack. */
6341      t_esp = newTemp(Ity_I32);
6342      t_addr0 = newTemp(Ity_I32);
6343
6344      /* For the choice of the value 128, see comment in dis_bt_G_E in
6345         guest_amd64_toIR.c.  We point out here only that 128 is
6346         fast-cased in Memcheck and is > 0, so seems like a good
6347         choice. */
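      /* In effect, the register form is handled by temporarily moving
         %esp down by 128, spilling the register operand to that
         location, doing the bit operation in memory, and then (at the
         end of this function) reloading the register and restoring
         %esp. */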
6348      vassert(vbi->guest_stack_redzone_size == 0);
6349      assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
6350      putIReg(4, R_ESP, mkexpr(t_esp));
6351
6352      storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
6353
6354      /* Make t_addr0 point at it. */
6355      assign( t_addr0, mkexpr(t_esp) );
6356
6357      /* Mask out upper bits of the shift amount, since we're doing a
6358         reg. */
6359      assign( t_bitno1, binop(Iop_And32,
6360                              mkexpr(t_bitno0),
6361                              mkU32(sz == 4 ? 31 : 15)) );
6362
6363   } else {
6364      t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
6365      delta += len;
6366      assign( t_bitno1, mkexpr(t_bitno0) );
6367   }
6368
6369   /* At this point: t_addr0 is the address being operated on.  If it
6370      was a reg, we will have pushed it onto the client's stack.
6371      t_bitno1 is the bit number, suitably masked in the case of a
6372      reg.  */
6373
6374   /* Now the main sequence. */
6375   assign( t_addr1,
6376           binop(Iop_Add32,
6377                 mkexpr(t_addr0),
6378                 binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
6379
6380   /* t_addr1 now holds effective address */
6381
6382   assign( t_bitno2,
6383           unop(Iop_32to8,
6384                binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
6385
6386   /* t_bitno2 contains offset of bit within byte */
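   /* For example, a memory-form "btl" with bit offset 35 addresses the
      byte at t_addr0 + (35 >> 3) = t_addr0 + 4, with in-byte bit
      number 35 & 7 = 3.  Using an arithmetic (Sar32) shift above also
      gives the right byte address for negative bit offsets. */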
6387
6388   if (op != BtOpNone) {
6389      t_mask = newTemp(Ity_I8);
6390      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
6391   }
6392
6393   /* t_mask is now a suitable byte mask */
6394
6395   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
6396
6397   if (op != BtOpNone) {
6398      switch (op) {
6399         case BtOpSet:
6400            assign( t_new,
6401                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
6402            break;
6403         case BtOpComp:
6404            assign( t_new,
6405                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
6406            break;
6407         case BtOpReset:
6408            assign( t_new,
6409                    binop(Iop_And8, mkexpr(t_fetched),
6410                                    unop(Iop_Not8, mkexpr(t_mask))) );
6411            break;
6412         default:
6413            vpanic("dis_bt_G_E(x86)");
6414      }
6415      if (locked && !epartIsReg(modrm)) {
6416         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
6417                                 mkexpr(t_new)/*new*/,
6418                                 guest_EIP_curr_instr );
6419      } else {
6420         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
6421      }
6422   }
6423
6424   /* Side effect done; now get selected bit into Carry flag */
6425   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
6426   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6427   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6428   stmt( IRStmt_Put(
6429            OFFB_CC_DEP1,
6430            binop(Iop_And32,
6431                  binop(Iop_Shr32,
6432                        unop(Iop_8Uto32, mkexpr(t_fetched)),
6433                        mkexpr(t_bitno2)),
6434                  mkU32(1)))
6435       );
6436   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6437      elimination of previous stores to this field work better. */
6438   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6439
6440   /* Move reg operand from stack back to reg */
6441   if (epartIsReg(modrm)) {
6442      /* t_esp still points at it. */
6443      putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
6444      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
6445   }
6446
6447   DIP("bt%s%c %s, %s\n",
6448       nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
6449       ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
6450
6451   return delta;
6452}
6453
6454
6455
6456/* Handle BSF/BSR.  Only v-size seems necessary. */
6457static
6458UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
6459{
6460   Bool   isReg;
6461   UChar  modrm;
6462   HChar  dis_buf[50];
6463
6464   IRType ty  = szToITy(sz);
6465   IRTemp src = newTemp(ty);
6466   IRTemp dst = newTemp(ty);
6467
6468   IRTemp src32 = newTemp(Ity_I32);
6469   IRTemp dst32 = newTemp(Ity_I32);
6470   IRTemp srcB  = newTemp(Ity_I1);
6471
6472   vassert(sz == 4 || sz == 2);
6473
6474   modrm = getIByte(delta);
6475
6476   isReg = epartIsReg(modrm);
6477   if (isReg) {
6478      delta++;
6479      assign( src, getIReg(sz, eregOfRM(modrm)) );
6480   } else {
6481      Int    len;
6482      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6483      delta += len;
6484      assign( src, loadLE(ty, mkexpr(addr)) );
6485   }
6486
6487   DIP("bs%c%c %s, %s\n",
6488       fwds ? 'f' : 'r', nameISize(sz),
6489       ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
6490       nameIReg(sz, gregOfRM(modrm)));
6491
6492   /* Generate a bool expression which is zero iff the original is
6493      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
6494      instrumented by Memcheck, is instrumented expensively, since
6495      this may be used on the output of a preceding movmskb insn,
6496      which has been known to be partially defined, and in need of
6497      careful handling. */
6498   assign( srcB, binop(mkSizedOp(ty,Iop_ExpCmpNE8),
6499                       mkexpr(src), mkU(ty,0)) );
6500
6501   /* Flags: Z is 1 iff source value is zero.  All others
6502      are undefined -- we force them to zero. */
6503   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6504   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6505   stmt( IRStmt_Put(
6506            OFFB_CC_DEP1,
6507            IRExpr_ITE( mkexpr(srcB),
6508                        /* src!=0 */
6509                        mkU32(0),
6510                        /* src==0 */
6511                        mkU32(X86G_CC_MASK_Z)
6512                        )
6513       ));
6514   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6515      elimination of previous stores to this field work better. */
6516   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6517
   /* Result: if the source value is zero, we can't use
      Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
      But anyway, Intel x86 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */
6522
6523   /* Bleh.  What we compute:
6524
6525          bsf32:  if src == 0 then 0 else  Ctz32(src)
6526          bsr32:  if src == 0 then 0 else  31 - Clz32(src)
6527
6528          bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
6529          bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
6530
6531      First, widen src to 32 bits if it is not already.
6532
6533      Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
6534      dst register unchanged when src == 0.  Hence change accordingly.
6535   */
6536   if (sz == 2)
6537      assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
6538   else
6539      assign( src32, mkexpr(src) );
6540
6541   /* The main computation, guarding against zero. */
6542   assign( dst32,
6543           IRExpr_ITE(
6544              mkexpr(srcB),
6545              /* src != 0 */
6546              fwds ? unop(Iop_Ctz32, mkexpr(src32))
6547                   : binop(Iop_Sub32,
6548                           mkU32(31),
6549                           unop(Iop_Clz32, mkexpr(src32))),
6550              /* src == 0 -- leave dst unchanged */
6551              widenUto32( getIReg( sz, gregOfRM(modrm) ) )
6552           )
6553         );
6554
6555   if (sz == 2)
6556      assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
6557   else
6558      assign( dst, mkexpr(dst32) );
6559
6560   /* dump result back */
6561   putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
6562
6563   return delta;
6564}
6565
6566
6567static
6568void codegen_xchg_eAX_Reg ( Int sz, Int reg )
6569{
6570   IRType ty = szToITy(sz);
6571   IRTemp t1 = newTemp(ty);
6572   IRTemp t2 = newTemp(ty);
6573   vassert(sz == 2 || sz == 4);
6574   assign( t1, getIReg(sz, R_EAX) );
6575   assign( t2, getIReg(sz, reg) );
6576   putIReg( sz, R_EAX, mkexpr(t2) );
6577   putIReg( sz, reg, mkexpr(t1) );
6578   DIP("xchg%c %s, %s\n",
6579       nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
6580}
6581
6582
6583static
6584void codegen_SAHF ( void )
6585{
   /* Set the flags to:
      (x86g_calculate_eflags_all() & X86G_CC_MASK_O)  -- retain the old O flag
      | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
                |X86G_CC_MASK_P|X86G_CC_MASK_C))
   */
6591   UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6592                       |X86G_CC_MASK_C|X86G_CC_MASK_P;
6593   IRTemp oldflags   = newTemp(Ity_I32);
6594   assign( oldflags, mk_x86g_calculate_eflags_all() );
6595   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6596   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6597   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6598   stmt( IRStmt_Put( OFFB_CC_DEP1,
6599         binop(Iop_Or32,
6600               binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
6601               binop(Iop_And32,
6602                     binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
6603                     mkU32(mask_SZACP))
6604              )
6605   ));
6606   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6607      elimination of previous stores to this field work better. */
6608   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6609}
6610
6611
6612static
6613void codegen_LAHF ( void  )
6614{
6615   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
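   /* In that image, CF is bit 0, bit 1 is always 1, PF is bit 2, AF
      bit 4, ZF bit 6 and SF bit 7 -- hence the constant (1<<1) OR'd
      into new_byte below. */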
6616   IRExpr* eax_with_hole;
6617   IRExpr* new_byte;
6618   IRExpr* new_eax;
6619   UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6620                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
6621
6622   IRTemp  flags = newTemp(Ity_I32);
6623   assign( flags, mk_x86g_calculate_eflags_all() );
6624
6625   eax_with_hole
6626      = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
6627   new_byte
6628      = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
6629                        mkU32(1<<1));
6630   new_eax
6631      = binop(Iop_Or32, eax_with_hole,
6632                        binop(Iop_Shl32, new_byte, mkU8(8)));
6633   putIReg(4, R_EAX, new_eax);
6634}
6635
6636
6637static
6638UInt dis_cmpxchg_G_E ( UChar       sorb,
6639                       Bool        locked,
6640                       Int         size,
6641                       Int         delta0 )
6642{
6643   HChar dis_buf[50];
6644   Int   len;
6645
6646   IRType ty    = szToITy(size);
6647   IRTemp acc   = newTemp(ty);
6648   IRTemp src   = newTemp(ty);
6649   IRTemp dest  = newTemp(ty);
6650   IRTemp dest2 = newTemp(ty);
6651   IRTemp acc2  = newTemp(ty);
6652   IRTemp cond  = newTemp(Ity_I1);
6653   IRTemp addr  = IRTemp_INVALID;
6654   UChar  rm    = getUChar(delta0);
6655
6656   /* There are 3 cases to consider:
6657
6658      reg-reg: ignore any lock prefix, generate sequence based
6659               on ITE
6660
6661      reg-mem, not locked: ignore any lock prefix, generate sequence
6662                           based on ITE
6663
6664      reg-mem, locked: use IRCAS
6665   */
6666   if (epartIsReg(rm)) {
6667      /* case 1 */
6668      assign( dest, getIReg(size, eregOfRM(rm)) );
6669      delta0++;
6670      assign( src, getIReg(size, gregOfRM(rm)) );
6671      assign( acc, getIReg(size, R_EAX) );
6672      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6673      assign( cond, mk_x86g_calculate_condition(X86CondZ) );
6674      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
6675      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
6676      putIReg(size, R_EAX, mkexpr(acc2));
6677      putIReg(size, eregOfRM(rm), mkexpr(dest2));
6678      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6679                               nameIReg(size,gregOfRM(rm)),
6680                               nameIReg(size,eregOfRM(rm)) );
6681   }
6682   else if (!epartIsReg(rm) && !locked) {
6683      /* case 2 */
6684      addr = disAMode ( &len, sorb, delta0, dis_buf );
6685      assign( dest, loadLE(ty, mkexpr(addr)) );
6686      delta0 += len;
6687      assign( src, getIReg(size, gregOfRM(rm)) );
6688      assign( acc, getIReg(size, R_EAX) );
6689      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6690      assign( cond, mk_x86g_calculate_condition(X86CondZ) );
6691      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
6692      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
6693      putIReg(size, R_EAX, mkexpr(acc2));
6694      storeLE( mkexpr(addr), mkexpr(dest2) );
6695      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6696                               nameIReg(size,gregOfRM(rm)), dis_buf);
6697   }
6698   else if (!epartIsReg(rm) && locked) {
6699      /* case 3 */
6700      /* src is new value.  acc is expected value.  dest is old value.
6701         Compute success from the output of the IRCAS, and steer the
6702         new value for EAX accordingly: in case of success, EAX is
6703         unchanged. */
6704      addr = disAMode ( &len, sorb, delta0, dis_buf );
6705      delta0 += len;
6706      assign( src, getIReg(size, gregOfRM(rm)) );
6707      assign( acc, getIReg(size, R_EAX) );
6708      stmt( IRStmt_CAS(
6709         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
6710                  NULL, mkexpr(acc), NULL, mkexpr(src) )
6711      ));
6712      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6713      assign( cond, mk_x86g_calculate_condition(X86CondZ) );
6714      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
6715      putIReg(size, R_EAX, mkexpr(acc2));
6716      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6717                               nameIReg(size,gregOfRM(rm)), dis_buf);
6718   }
6719   else vassert(0);
6720
6721   return delta0;
6722}
6723
6724
6725/* Handle conditional move instructions of the form
6726      cmovcc E(reg-or-mem), G(reg)
6727
6728   E(src) is reg-or-mem
6729   G(dst) is reg.
6730
6731   If E is reg, -->    GET %E, tmps
6732                       GET %G, tmpd
6733                       CMOVcc tmps, tmpd
6734                       PUT tmpd, %G
6735
6736   If E is mem  -->    (getAddr E) -> tmpa
6737                       LD (tmpa), tmps
6738                       GET %G, tmpd
6739                       CMOVcc tmps, tmpd
6740                       PUT tmpd, %G
6741*/
6742static
6743UInt dis_cmov_E_G ( UChar       sorb,
6744                    Int         sz,
6745                    X86Condcode cond,
6746                    Int         delta0 )
6747{
6748   UChar rm  = getIByte(delta0);
6749   HChar dis_buf[50];
6750   Int   len;
6751
6752   IRType ty   = szToITy(sz);
6753   IRTemp tmps = newTemp(ty);
6754   IRTemp tmpd = newTemp(ty);
6755
6756   if (epartIsReg(rm)) {
6757      assign( tmps, getIReg(sz, eregOfRM(rm)) );
6758      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6759
6760      putIReg(sz, gregOfRM(rm),
6761                  IRExpr_ITE( mk_x86g_calculate_condition(cond),
6762                              mkexpr(tmps),
6763                              mkexpr(tmpd) )
6764             );
6765      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6766                              name_X86Condcode(cond),
6767                              nameIReg(sz,eregOfRM(rm)),
6768                              nameIReg(sz,gregOfRM(rm)));
6769      return 1+delta0;
6770   }
6771
6772   /* E refers to memory */
6773   {
6774      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6775      assign( tmps, loadLE(ty, mkexpr(addr)) );
6776      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6777
6778      putIReg(sz, gregOfRM(rm),
6779                  IRExpr_ITE( mk_x86g_calculate_condition(cond),
6780                              mkexpr(tmps),
6781                              mkexpr(tmpd) )
6782             );
6783
6784      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6785                              name_X86Condcode(cond),
6786                              dis_buf,
6787                              nameIReg(sz,gregOfRM(rm)));
6788      return len+delta0;
6789   }
6790}
6791
6792
6793static
6794UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
6795                    Bool* decodeOK )
6796{
6797   Int   len;
6798   UChar rm = getIByte(delta0);
6799   HChar dis_buf[50];
6800
6801   IRType ty    = szToITy(sz);
6802   IRTemp tmpd  = newTemp(ty);
6803   IRTemp tmpt0 = newTemp(ty);
6804   IRTemp tmpt1 = newTemp(ty);
6805
6806   /* There are 3 cases to consider:
6807
6808      reg-reg: ignore any lock prefix,
6809               generate 'naive' (non-atomic) sequence
6810
6811      reg-mem, not locked: ignore any lock prefix, generate 'naive'
6812                           (non-atomic) sequence
6813
6814      reg-mem, locked: use IRCAS
6815   */
6816
6817   if (epartIsReg(rm)) {
6818      /* case 1 */
6819      assign( tmpd,  getIReg(sz, eregOfRM(rm)));
6820      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6821      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6822                           mkexpr(tmpd), mkexpr(tmpt0)) );
6823      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6824      putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
6825      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6826      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIReg(sz,gregOfRM(rm)),
          nameIReg(sz,eregOfRM(rm)));
6829      *decodeOK = True;
6830      return 1+delta0;
6831   }
6832   else if (!epartIsReg(rm) && !locked) {
6833      /* case 2 */
6834      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6835      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6836      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6837      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6838                           mkexpr(tmpd), mkexpr(tmpt0)) );
6839      storeLE( mkexpr(addr), mkexpr(tmpt1) );
6840      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6841      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6842      DIP("xadd%c %s, %s\n",
6843          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6844      *decodeOK = True;
6845      return len+delta0;
6846   }
6847   else if (!epartIsReg(rm) && locked) {
6848      /* case 3 */
6849      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6850      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6851      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6852      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6853                           mkexpr(tmpd), mkexpr(tmpt0)) );
6854      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
6855                           mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
6856      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6857      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6858      DIP("xadd%c %s, %s\n",
6859          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6860      *decodeOK = True;
6861      return len+delta0;
6862   }
6863   /*UNREACHED*/
6864   vassert(0);
6865}
6866
6867/* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
6868
6869static
6870UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
6871{
6872   Int    len;
6873   IRTemp addr;
6874   UChar  rm  = getIByte(delta0);
6875   HChar  dis_buf[50];
6876
6877   if (epartIsReg(rm)) {
6878      putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
6879      DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
6880      return 1+delta0;
6881   } else {
6882      addr = disAMode ( &len, sorb, delta0, dis_buf );
6883      putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
6884      DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
6885      return len+delta0;
6886   }
6887}
6888
6889/* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
6890   dst is ireg and sz==4, zero out top half of it.  */
6891
6892static
6893UInt dis_mov_Sw_Ew ( UChar sorb,
6894                     Int   sz,
6895                     Int   delta0 )
6896{
6897   Int    len;
6898   IRTemp addr;
6899   UChar  rm  = getIByte(delta0);
6900   HChar  dis_buf[50];
6901
6902   vassert(sz == 2 || sz == 4);
6903
6904   if (epartIsReg(rm)) {
6905      if (sz == 4)
6906         putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
6907      else
6908         putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
6909
6910      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
6911      return 1+delta0;
6912   } else {
6913      addr = disAMode ( &len, sorb, delta0, dis_buf );
6914      storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
6915      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
6916      return len+delta0;
6917   }
6918}
6919
6920
6921static
6922void dis_push_segreg ( UInt sreg, Int sz )
6923{
6924    IRTemp t1 = newTemp(Ity_I16);
6925    IRTemp ta = newTemp(Ity_I32);
6926    vassert(sz == 2 || sz == 4);
6927
6928    assign( t1, getSReg(sreg) );
6929    assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
6930    putIReg(4, R_ESP, mkexpr(ta));
6931    storeLE( mkexpr(ta), mkexpr(t1) );
6932
6933    DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6934}
6935
6936static
6937void dis_pop_segreg ( UInt sreg, Int sz )
6938{
6939    IRTemp t1 = newTemp(Ity_I16);
6940    IRTemp ta = newTemp(Ity_I32);
6941    vassert(sz == 2 || sz == 4);
6942
6943    assign( ta, getIReg(4, R_ESP) );
6944    assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
6945
6946    putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
6947    putSReg( sreg, mkexpr(t1) );
6948    DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6949}
6950
6951static
6952void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
6953{
6954   IRTemp t1 = newTemp(Ity_I32);
6955   IRTemp t2 = newTemp(Ity_I32);
6956   assign(t1, getIReg(4,R_ESP));
6957   assign(t2, loadLE(Ity_I32,mkexpr(t1)));
6958   putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
6959   jmp_treg(dres, Ijk_Ret, t2);
6960   vassert(dres->whatNext == Dis_StopHere);
6961}
6962
6963/*------------------------------------------------------------*/
6964/*--- SSE/SSE2/SSE3 helpers                                ---*/
6965/*------------------------------------------------------------*/
6966
6967/* Indicates whether the op requires a rounding-mode argument.  Note
6968   that this covers only vector floating point arithmetic ops, and
6969   omits the scalar ones that need rounding modes.  Note also that
6970   inconsistencies here will get picked up later by the IR sanity
6971   checker, so this isn't correctness-critical. */
6972static Bool requiresRMode ( IROp op )
6973{
6974   switch (op) {
6975      /* 128 bit ops */
6976      case Iop_Add32Fx4: case Iop_Sub32Fx4:
6977      case Iop_Mul32Fx4: case Iop_Div32Fx4:
6978      case Iop_Add64Fx2: case Iop_Sub64Fx2:
6979      case Iop_Mul64Fx2: case Iop_Div64Fx2:
6980         return True;
6981      default:
6982         break;
6983   }
6984   return False;
6985}
6986
6987
6988/* Worker function; do not call directly.
6989   Handles full width G = G `op` E   and   G = (not G) `op` E.
6990*/
6991
6992static UInt dis_SSE_E_to_G_all_wrk (
6993               UChar sorb, Int delta,
6994               const HChar* opname, IROp op,
6995               Bool   invertG
6996            )
6997{
6998   HChar   dis_buf[50];
6999   Int     alen;
7000   IRTemp  addr;
7001   UChar   rm = getIByte(delta);
7002   IRExpr* gpart
7003      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
7004                : getXMMReg(gregOfRM(rm));
7005   if (epartIsReg(rm)) {
7006      putXMMReg(
7007         gregOfRM(rm),
7008         requiresRMode(op)
7009            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
7010                        gpart,
7011                        getXMMReg(eregOfRM(rm)))
7012            : binop(op, gpart,
7013                        getXMMReg(eregOfRM(rm)))
7014      );
7015      DIP("%s %s,%s\n", opname,
7016                        nameXMMReg(eregOfRM(rm)),
7017                        nameXMMReg(gregOfRM(rm)) );
7018      return delta+1;
7019   } else {
7020      addr = disAMode ( &alen, sorb, delta, dis_buf );
7021      putXMMReg(
7022         gregOfRM(rm),
7023         requiresRMode(op)
7024            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
7025                        gpart,
7026                        loadLE(Ity_V128, mkexpr(addr)))
7027            : binop(op, gpart,
7028                        loadLE(Ity_V128, mkexpr(addr)))
7029      );
7030      DIP("%s %s,%s\n", opname,
7031                        dis_buf,
7032                        nameXMMReg(gregOfRM(rm)) );
7033      return delta+alen;
7034   }
7035}
7036
7037
7038/* All lanes SSE binary operation, G = G `op` E. */
7039
7040static
7041UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, const HChar* opname, IROp op )
7042{
7043   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
7044}
7045
7046/* All lanes SSE binary operation, G = (not G) `op` E. */
7047
7048static
7049UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
7050                               const HChar* opname, IROp op )
7051{
7052   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
7053}
7054
7055
7056/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
7057
7058static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
7059                                  const HChar* opname, IROp op )
7060{
7061   HChar   dis_buf[50];
7062   Int     alen;
7063   IRTemp  addr;
7064   UChar   rm = getIByte(delta);
7065   IRExpr* gpart = getXMMReg(gregOfRM(rm));
7066   if (epartIsReg(rm)) {
7067      putXMMReg( gregOfRM(rm),
7068                 binop(op, gpart,
7069                           getXMMReg(eregOfRM(rm))) );
7070      DIP("%s %s,%s\n", opname,
7071                        nameXMMReg(eregOfRM(rm)),
7072                        nameXMMReg(gregOfRM(rm)) );
7073      return delta+1;
7074   } else {
7075      /* We can only do a 32-bit memory read, so the upper 3/4 of the
7076         E operand needs to be made simply of zeroes. */
7077      IRTemp epart = newTemp(Ity_V128);
7078      addr = disAMode ( &alen, sorb, delta, dis_buf );
7079      assign( epart, unop( Iop_32UtoV128,
7080                           loadLE(Ity_I32, mkexpr(addr))) );
7081      putXMMReg( gregOfRM(rm),
7082                 binop(op, gpart, mkexpr(epart)) );
7083      DIP("%s %s,%s\n", opname,
7084                        dis_buf,
7085                        nameXMMReg(gregOfRM(rm)) );
7086      return delta+alen;
7087   }
7088}
7089
7090
7091/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
7092
7093static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
7094                                  const HChar* opname, IROp op )
7095{
7096   HChar   dis_buf[50];
7097   Int     alen;
7098   IRTemp  addr;
7099   UChar   rm = getIByte(delta);
7100   IRExpr* gpart = getXMMReg(gregOfRM(rm));
7101   if (epartIsReg(rm)) {
7102      putXMMReg( gregOfRM(rm),
7103                 binop(op, gpart,
7104                           getXMMReg(eregOfRM(rm))) );
7105      DIP("%s %s,%s\n", opname,
7106                        nameXMMReg(eregOfRM(rm)),
7107                        nameXMMReg(gregOfRM(rm)) );
7108      return delta+1;
7109   } else {
7110      /* We can only do a 64-bit memory read, so the upper half of the
7111         E operand needs to be made simply of zeroes. */
7112      IRTemp epart = newTemp(Ity_V128);
7113      addr = disAMode ( &alen, sorb, delta, dis_buf );
7114      assign( epart, unop( Iop_64UtoV128,
7115                           loadLE(Ity_I64, mkexpr(addr))) );
7116      putXMMReg( gregOfRM(rm),
7117                 binop(op, gpart, mkexpr(epart)) );
7118      DIP("%s %s,%s\n", opname,
7119                        dis_buf,
7120                        nameXMMReg(gregOfRM(rm)) );
7121      return delta+alen;
7122   }
7123}
7124
7125
7126/* All lanes unary SSE operation, G = op(E). */
7127
7128static UInt dis_SSE_E_to_G_unary_all (
7129               UChar sorb, Int delta,
7130               const HChar* opname, IROp op
7131            )
7132{
7133   HChar   dis_buf[50];
7134   Int     alen;
7135   IRTemp  addr;
7136   UChar   rm = getIByte(delta);
7137   // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
7138   // up in the usual way.
7139   Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
7140   if (epartIsReg(rm)) {
7141      IRExpr* src = getXMMReg(eregOfRM(rm));
7142      /* XXXROUNDINGFIXME */
7143      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
7144                              : unop(op, src);
7145      putXMMReg( gregOfRM(rm), res );
7146      DIP("%s %s,%s\n", opname,
7147                        nameXMMReg(eregOfRM(rm)),
7148                        nameXMMReg(gregOfRM(rm)) );
7149      return delta+1;
7150   } else {
7151      addr = disAMode ( &alen, sorb, delta, dis_buf );
7152      IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
7153      /* XXXROUNDINGFIXME */
7154      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
7155                              : unop(op, src);
7156      putXMMReg( gregOfRM(rm), res );
7157      DIP("%s %s,%s\n", opname,
7158                        dis_buf,
7159                        nameXMMReg(gregOfRM(rm)) );
7160      return delta+alen;
7161   }
7162}
7163
7164
7165/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
7166
7167static UInt dis_SSE_E_to_G_unary_lo32 (
7168               UChar sorb, Int delta,
7169               const HChar* opname, IROp op
7170            )
7171{
7172   /* First we need to get the old G value and patch the low 32 bits
7173      of the E operand into it.  Then apply op and write back to G. */
7174   HChar   dis_buf[50];
7175   Int     alen;
7176   IRTemp  addr;
7177   UChar   rm = getIByte(delta);
7178   IRTemp  oldG0 = newTemp(Ity_V128);
7179   IRTemp  oldG1 = newTemp(Ity_V128);
7180
7181   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7182
7183   if (epartIsReg(rm)) {
7184      assign( oldG1,
7185              binop( Iop_SetV128lo32,
7186                     mkexpr(oldG0),
7187                     getXMMRegLane32(eregOfRM(rm), 0)) );
7188      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7189      DIP("%s %s,%s\n", opname,
7190                        nameXMMReg(eregOfRM(rm)),
7191                        nameXMMReg(gregOfRM(rm)) );
7192      return delta+1;
7193   } else {
7194      addr = disAMode ( &alen, sorb, delta, dis_buf );
7195      assign( oldG1,
7196              binop( Iop_SetV128lo32,
7197                     mkexpr(oldG0),
7198                     loadLE(Ity_I32, mkexpr(addr)) ));
7199      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7200      DIP("%s %s,%s\n", opname,
7201                        dis_buf,
7202                        nameXMMReg(gregOfRM(rm)) );
7203      return delta+alen;
7204   }
7205}
7206
7207
7208/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
7209
7210static UInt dis_SSE_E_to_G_unary_lo64 (
7211               UChar sorb, Int delta,
7212               const HChar* opname, IROp op
7213            )
7214{
7215   /* First we need to get the old G value and patch the low 64 bits
7216      of the E operand into it.  Then apply op and write back to G. */
7217   HChar   dis_buf[50];
7218   Int     alen;
7219   IRTemp  addr;
7220   UChar   rm = getIByte(delta);
7221   IRTemp  oldG0 = newTemp(Ity_V128);
7222   IRTemp  oldG1 = newTemp(Ity_V128);
7223
7224   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7225
7226   if (epartIsReg(rm)) {
7227      assign( oldG1,
7228              binop( Iop_SetV128lo64,
7229                     mkexpr(oldG0),
7230                     getXMMRegLane64(eregOfRM(rm), 0)) );
7231      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7232      DIP("%s %s,%s\n", opname,
7233                        nameXMMReg(eregOfRM(rm)),
7234                        nameXMMReg(gregOfRM(rm)) );
7235      return delta+1;
7236   } else {
7237      addr = disAMode ( &alen, sorb, delta, dis_buf );
7238      assign( oldG1,
7239              binop( Iop_SetV128lo64,
7240                     mkexpr(oldG0),
7241                     loadLE(Ity_I64, mkexpr(addr)) ));
7242      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7243      DIP("%s %s,%s\n", opname,
7244                        dis_buf,
7245                        nameXMMReg(gregOfRM(rm)) );
7246      return delta+alen;
7247   }
7248}
7249
7250
7251/* SSE integer binary operation:
7252      G = G `op` E   (eLeft == False)
7253      G = E `op` G   (eLeft == True)
7254*/
7255static UInt dis_SSEint_E_to_G(
7256               UChar sorb, Int delta,
7257               const HChar* opname, IROp op,
7258               Bool   eLeft
7259            )
7260{
7261   HChar   dis_buf[50];
7262   Int     alen;
7263   IRTemp  addr;
7264   UChar   rm = getIByte(delta);
7265   IRExpr* gpart = getXMMReg(gregOfRM(rm));
7266   IRExpr* epart = NULL;
7267   if (epartIsReg(rm)) {
7268      epart = getXMMReg(eregOfRM(rm));
7269      DIP("%s %s,%s\n", opname,
7270                        nameXMMReg(eregOfRM(rm)),
7271                        nameXMMReg(gregOfRM(rm)) );
7272      delta += 1;
7273   } else {
7274      addr  = disAMode ( &alen, sorb, delta, dis_buf );
7275      epart = loadLE(Ity_V128, mkexpr(addr));
7276      DIP("%s %s,%s\n", opname,
7277                        dis_buf,
7278                        nameXMMReg(gregOfRM(rm)) );
7279      delta += alen;
7280   }
7281   putXMMReg( gregOfRM(rm),
7282              eLeft ? binop(op, epart, gpart)
                    : binop(op, gpart, epart) );
7284   return delta;
7285}
7286
7287
7288/* Helper for doing SSE FP comparisons. */
7289
7290static void findSSECmpOp ( Bool* needNot, IROp* op,
7291                           Int imm8, Bool all_lanes, Int sz )
7292{
7293   imm8 &= 7;
7294   *needNot = False;
7295   *op      = Iop_INVALID;
7296   if (imm8 >= 4) {
7297      *needNot = True;
7298      imm8 -= 4;
7299   }
7300
7301   if (sz == 4 && all_lanes) {
7302      switch (imm8) {
7303         case 0: *op = Iop_CmpEQ32Fx4; return;
7304         case 1: *op = Iop_CmpLT32Fx4; return;
7305         case 2: *op = Iop_CmpLE32Fx4; return;
7306         case 3: *op = Iop_CmpUN32Fx4; return;
7307         default: break;
7308      }
7309   }
7310   if (sz == 4 && !all_lanes) {
7311      switch (imm8) {
7312         case 0: *op = Iop_CmpEQ32F0x4; return;
7313         case 1: *op = Iop_CmpLT32F0x4; return;
7314         case 2: *op = Iop_CmpLE32F0x4; return;
7315         case 3: *op = Iop_CmpUN32F0x4; return;
7316         default: break;
7317      }
7318   }
7319   if (sz == 8 && all_lanes) {
7320      switch (imm8) {
7321         case 0: *op = Iop_CmpEQ64Fx2; return;
7322         case 1: *op = Iop_CmpLT64Fx2; return;
7323         case 2: *op = Iop_CmpLE64Fx2; return;
7324         case 3: *op = Iop_CmpUN64Fx2; return;
7325         default: break;
7326      }
7327   }
7328   if (sz == 8 && !all_lanes) {
7329      switch (imm8) {
7330         case 0: *op = Iop_CmpEQ64F0x2; return;
7331         case 1: *op = Iop_CmpLT64F0x2; return;
7332         case 2: *op = Iop_CmpLE64F0x2; return;
7333         case 3: *op = Iop_CmpUN64F0x2; return;
7334         default: break;
7335      }
7336   }
7337   vpanic("findSSECmpOp(x86,guest)");
7338}
7339
7340/* Handles SSE 32F/64F comparisons. */
7341
7342static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
                                const HChar* opname, Bool all_lanes, Int sz )
7344{
7345   HChar   dis_buf[50];
7346   Int     alen, imm8;
7347   IRTemp  addr;
7348   Bool    needNot = False;
7349   IROp    op      = Iop_INVALID;
7350   IRTemp  plain   = newTemp(Ity_V128);
7351   UChar   rm      = getIByte(delta);
7352   UShort  mask    = 0;
7353   vassert(sz == 4 || sz == 8);
7354   if (epartIsReg(rm)) {
7355      imm8 = getIByte(delta+1);
7356      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7357      assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
7358                               getXMMReg(eregOfRM(rm))) );
7359      delta += 2;
7360      DIP("%s $%d,%s,%s\n", opname,
7361                            imm8,
7362                            nameXMMReg(eregOfRM(rm)),
7363                            nameXMMReg(gregOfRM(rm)) );
7364   } else {
7365      addr = disAMode ( &alen, sorb, delta, dis_buf );
7366      imm8 = getIByte(delta+alen);
7367      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7368      assign( plain,
7369              binop(
7370                 op,
7371                 getXMMReg(gregOfRM(rm)),
7372                   all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
7373                 : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
7374                 : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
7375             )
7376      );
7377      delta += alen+1;
7378      DIP("%s $%d,%s,%s\n", opname,
7379                            imm8,
7380                            dis_buf,
7381                            nameXMMReg(gregOfRM(rm)) );
7382   }
7383
7384   if (needNot && all_lanes) {
7385      putXMMReg( gregOfRM(rm),
7386                 unop(Iop_NotV128, mkexpr(plain)) );
7387   }
7388   else
7389   if (needNot && !all_lanes) {
7390      mask = toUShort( sz==4 ? 0x000F : 0x00FF );
7391      putXMMReg( gregOfRM(rm),
7392                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
7393   }
7394   else {
7395      putXMMReg( gregOfRM(rm), mkexpr(plain) );
7396   }
7397
7398   return delta;
7399}
7400
7401
7402/* Vector by scalar shift of G by the amount specified at the bottom
7403   of E. */
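
/* Note on out-of-range amounts: if the amount at the bottom of E is
   >= the lane width, a logical shift gives all zeroes, while an
   arithmetic shift behaves as if the amount were (lane width - 1),
   i.e. every lane becomes a copy of its sign bit.  The ITEs below
   implement that. */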
7404
7405static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
7406                                 const HChar* opname, IROp op )
7407{
7408   HChar   dis_buf[50];
7409   Int     alen, size;
7410   IRTemp  addr;
7411   Bool    shl, shr, sar;
7412   UChar   rm   = getIByte(delta);
7413   IRTemp  g0   = newTemp(Ity_V128);
7414   IRTemp  g1   = newTemp(Ity_V128);
7415   IRTemp  amt  = newTemp(Ity_I32);
7416   IRTemp  amt8 = newTemp(Ity_I8);
7417   if (epartIsReg(rm)) {
7418      assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
7419      DIP("%s %s,%s\n", opname,
7420                        nameXMMReg(eregOfRM(rm)),
7421                        nameXMMReg(gregOfRM(rm)) );
7422      delta++;
7423   } else {
7424      addr = disAMode ( &alen, sorb, delta, dis_buf );
7425      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
7426      DIP("%s %s,%s\n", opname,
7427                        dis_buf,
7428                        nameXMMReg(gregOfRM(rm)) );
7429      delta += alen;
7430   }
7431   assign( g0,   getXMMReg(gregOfRM(rm)) );
7432   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
7433
7434   shl = shr = sar = False;
7435   size = 0;
7436   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
7438      case Iop_ShlN32x4: shl = True; size = 32; break;
7439      case Iop_ShlN64x2: shl = True; size = 64; break;
7440      case Iop_SarN16x8: sar = True; size = 16; break;
7441      case Iop_SarN32x4: sar = True; size = 32; break;
7442      case Iop_ShrN16x8: shr = True; size = 16; break;
7443      case Iop_ShrN32x4: shr = True; size = 32; break;
7444      case Iop_ShrN64x2: shr = True; size = 64; break;
7445      default: vassert(0);
7446   }
7447
7448   if (shl || shr) {
7449     assign(
7450        g1,
7451        IRExpr_ITE(
7452           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
7453           binop(op, mkexpr(g0), mkexpr(amt8)),
7454           mkV128(0x0000)
7455        )
7456     );
7457   } else
7458   if (sar) {
7459     assign(
7460        g1,
7461        IRExpr_ITE(
7462           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
7463           binop(op, mkexpr(g0), mkexpr(amt8)),
7464           binop(op, mkexpr(g0), mkU8(size-1))
7465        )
7466     );
7467   } else {
7468      /*NOTREACHED*/
7469      vassert(0);
7470   }
7471
7472   putXMMReg( gregOfRM(rm), mkexpr(g1) );
7473   return delta;
7474}
7475
7476
7477/* Vector by scalar shift of E by an immediate byte. */
7478
7479static
7480UInt dis_SSE_shiftE_imm ( Int delta, const HChar* opname, IROp op )
7481{
7482   Bool    shl, shr, sar;
7483   UChar   rm   = getIByte(delta);
7484   IRTemp  e0   = newTemp(Ity_V128);
7485   IRTemp  e1   = newTemp(Ity_V128);
7486   UChar   amt, size;
7487   vassert(epartIsReg(rm));
7488   vassert(gregOfRM(rm) == 2
7489           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
7490   amt = getIByte(delta+1);
7491   delta += 2;
7492   DIP("%s $%d,%s\n", opname,
7493                      (Int)amt,
7494                      nameXMMReg(eregOfRM(rm)) );
7495   assign( e0, getXMMReg(eregOfRM(rm)) );
7496
7497   shl = shr = sar = False;
7498   size = 0;
7499   switch (op) {
7500      case Iop_ShlN16x8: shl = True; size = 16; break;
7501      case Iop_ShlN32x4: shl = True; size = 32; break;
7502      case Iop_ShlN64x2: shl = True; size = 64; break;
7503      case Iop_SarN16x8: sar = True; size = 16; break;
7504      case Iop_SarN32x4: sar = True; size = 32; break;
7505      case Iop_ShrN16x8: shr = True; size = 16; break;
7506      case Iop_ShrN32x4: shr = True; size = 32; break;
7507      case Iop_ShrN64x2: shr = True; size = 64; break;
7508      default: vassert(0);
7509   }
7510
7511   if (shl || shr) {
7512      assign( e1, amt >= size
7513                     ? mkV128(0x0000)
7514                     : binop(op, mkexpr(e0), mkU8(amt))
7515      );
7516   } else
7517   if (sar) {
7518      assign( e1, amt >= size
7519                     ? binop(op, mkexpr(e0), mkU8(size-1))
7520                     : binop(op, mkexpr(e0), mkU8(amt))
7521      );
7522   } else {
7523      /*NOTREACHED*/
7524      vassert(0);
7525   }
7526
7527   putXMMReg( eregOfRM(rm), mkexpr(e1) );
7528   return delta;
7529}
7530
7531
7532/* Get the current SSE rounding mode. */
7533
7534static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
7535{
7536   return binop( Iop_And32,
7537                 IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
7538                 mkU32(3) );
7539}
7540
7541static void put_sse_roundingmode ( IRExpr* sseround )
7542{
7543   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
7544   stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
7545}
7546
7547/* Break a 128-bit value up into four 32-bit ints. */
7548
7549static void breakup128to32s ( IRTemp t128,
                              /*OUTs*/
7551                              IRTemp* t3, IRTemp* t2,
7552                              IRTemp* t1, IRTemp* t0 )
7553{
7554   IRTemp hi64 = newTemp(Ity_I64);
7555   IRTemp lo64 = newTemp(Ity_I64);
7556   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
7557   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
7558
7559   vassert(t0 && *t0 == IRTemp_INVALID);
7560   vassert(t1 && *t1 == IRTemp_INVALID);
7561   vassert(t2 && *t2 == IRTemp_INVALID);
7562   vassert(t3 && *t3 == IRTemp_INVALID);
7563
7564   *t0 = newTemp(Ity_I32);
7565   *t1 = newTemp(Ity_I32);
7566   *t2 = newTemp(Ity_I32);
7567   *t3 = newTemp(Ity_I32);
7568   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
7569   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
7570   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
7571   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
7572}
7573
7574/* Construct a 128-bit value from four 32-bit ints. */
7575
7576static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
7577                              IRTemp t1, IRTemp t0 )
7578{
7579   return
7580      binop( Iop_64HLtoV128,
7581             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
7582             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
7583   );
7584}
7585
7586/* Break a 64-bit value up into four 16-bit ints. */
7587
7588static void breakup64to16s ( IRTemp t64,
7589                             /*OUTs*/
7590                             IRTemp* t3, IRTemp* t2,
7591                             IRTemp* t1, IRTemp* t0 )
7592{
7593   IRTemp hi32 = newTemp(Ity_I32);
7594   IRTemp lo32 = newTemp(Ity_I32);
7595   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
7596   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
7597
7598   vassert(t0 && *t0 == IRTemp_INVALID);
7599   vassert(t1 && *t1 == IRTemp_INVALID);
7600   vassert(t2 && *t2 == IRTemp_INVALID);
7601   vassert(t3 && *t3 == IRTemp_INVALID);
7602
7603   *t0 = newTemp(Ity_I16);
7604   *t1 = newTemp(Ity_I16);
7605   *t2 = newTemp(Ity_I16);
7606   *t3 = newTemp(Ity_I16);
7607   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
7608   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
7609   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
7610   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
7611}
7612
7613/* Construct a 64-bit value from four 16-bit ints. */
7614
7615static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
7616                             IRTemp t1, IRTemp t0 )
7617{
7618   return
7619      binop( Iop_32HLto64,
7620             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
7621             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
7622   );
7623}
7624
7625/* Generate IR to set the guest %EFLAGS from the pushfl-format image
7626   in the given 32-bit temporary.  The flags that are set are: O S Z A
7627   C P D ID AC.
7628
7629   In all cases, code to set AC is generated.  However, VEX actually
7630   ignores the AC value and so can optionally emit an emulation
7631   warning when it is enabled.  In this routine, an emulation warning
7632   is only emitted if emit_AC_emwarn is True, in which case
7633   next_insn_EIP must be correct (this allows for correct code
7634   generation for popfl/popfw).  If emit_AC_emwarn is False,
7635   next_insn_EIP is unimportant (this allows for easy if kludgey code
7636   generation for IRET.) */
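
/* (Bit positions used below, matching the hardware pushfl layout: the
   X86G_CC_MASK_* condition-code bits live in the low 12 bits of the
   image in t1, D is bit 10, AC bit 18 and ID bit 21.) */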
7637
7638static
7639void set_EFLAGS_from_value ( IRTemp t1,
7640                             Bool   emit_AC_emwarn,
7641                             Addr32 next_insn_EIP )
7642{
7643   vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
7644
7645   /* t1 is the flag word.  Mask out everything except OSZACP and set
7646      the flags thunk to X86G_CC_OP_COPY. */
7647   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
7648   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
7649   stmt( IRStmt_Put( OFFB_CC_DEP1,
7650                     binop(Iop_And32,
7651                           mkexpr(t1),
7652                           mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
7653                                  | X86G_CC_MASK_A | X86G_CC_MASK_Z
                                  | X86G_CC_MASK_S | X86G_CC_MASK_O )
7655                          )
7656                    )
7657       );
7658   /* Set NDEP even though it isn't used.  This makes redundant-PUT
7659      elimination of previous stores to this field work better. */
7660   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
7661
7662   /* Also need to set the D flag, which is held in bit 10 of t1.
7663      If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
7664   stmt( IRStmt_Put(
7665            OFFB_DFLAG,
7666            IRExpr_ITE(
7667               unop(Iop_32to1,
7668                    binop(Iop_And32,
7669                          binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
7670                          mkU32(1))),
7671               mkU32(0xFFFFFFFF),
7672               mkU32(1)))
7673       );
7674
7675   /* Set the ID flag */
7676   stmt( IRStmt_Put(
7677            OFFB_IDFLAG,
7678            IRExpr_ITE(
7679               unop(Iop_32to1,
7680                    binop(Iop_And32,
7681                          binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
7682                          mkU32(1))),
7683               mkU32(1),
7684               mkU32(0)))
7685       );
7686
7687   /* And set the AC flag.  If setting it to 1, possibly emit an
7688      emulation warning. */
7689   stmt( IRStmt_Put(
7690            OFFB_ACFLAG,
7691            IRExpr_ITE(
7692               unop(Iop_32to1,
7693                    binop(Iop_And32,
7694                          binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
7695                          mkU32(1))),
7696               mkU32(1),
7697               mkU32(0)))
7698       );
7699
7700   if (emit_AC_emwarn) {
7701      put_emwarn( mkU32(EmWarn_X86_acFlag) );
7702      stmt(
7703         IRStmt_Exit(
7704            binop( Iop_CmpNE32,
7705                   binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
7706                   mkU32(0) ),
7707            Ijk_EmWarn,
7708            IRConst_U32( next_insn_EIP ),
7709            OFFB_EIP
7710         )
7711      );
7712   }
7713}
7714
7715
7716/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
7717   values (aa,bb), computes, for each of the 4 16-bit lanes:
7718
7719   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
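
   e.g. for aa_lane = bb_lane = 0x4000 (0.5 in Q15 fixed point):
   0x4000 *s32 0x4000 = 0x10000000; >>u 14 gives 0x4000; adding 1 and
   >>u 1 gives 0x2000, i.e. 0.25, with rounding on the dropped bit.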
7720*/
7721static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
7722{
7723   IRTemp aa      = newTemp(Ity_I64);
7724   IRTemp bb      = newTemp(Ity_I64);
7725   IRTemp aahi32s = newTemp(Ity_I64);
7726   IRTemp aalo32s = newTemp(Ity_I64);
7727   IRTemp bbhi32s = newTemp(Ity_I64);
7728   IRTemp bblo32s = newTemp(Ity_I64);
7729   IRTemp rHi     = newTemp(Ity_I64);
7730   IRTemp rLo     = newTemp(Ity_I64);
7731   IRTemp one32x2 = newTemp(Ity_I64);
7732   assign(aa, aax);
7733   assign(bb, bbx);
7734   assign( aahi32s,
7735           binop(Iop_SarN32x2,
7736                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
7737                 mkU8(16) ));
7738   assign( aalo32s,
7739           binop(Iop_SarN32x2,
7740                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
7741                 mkU8(16) ));
7742   assign( bbhi32s,
7743           binop(Iop_SarN32x2,
7744                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
7745                 mkU8(16) ));
7746   assign( bblo32s,
7747           binop(Iop_SarN32x2,
7748                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
7749                 mkU8(16) ));
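   /* A pair of 32-bit lanes each holding 1, for the rounding add below. */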
7750   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
7751   assign(
7752      rHi,
7753      binop(
7754         Iop_ShrN32x2,
7755         binop(
7756            Iop_Add32x2,
7757            binop(
7758               Iop_ShrN32x2,
7759               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
7760               mkU8(14)
7761            ),
7762            mkexpr(one32x2)
7763         ),
7764         mkU8(1)
7765      )
7766   );
7767   assign(
7768      rLo,
7769      binop(
7770         Iop_ShrN32x2,
7771         binop(
7772            Iop_Add32x2,
7773            binop(
7774               Iop_ShrN32x2,
7775               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
7776               mkU8(14)
7777            ),
7778            mkexpr(one32x2)
7779         ),
7780         mkU8(1)
7781      )
7782   );
7783   return
7784      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
7785}
7786
7787/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
7788   values (aa,bb), computes, for each lane:
7789
7790          if aa_lane < 0 then - bb_lane
7791     else if aa_lane > 0 then bb_lane
7792     else 0
7793*/
7794static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
7795{
7796   IRTemp aa       = newTemp(Ity_I64);
7797   IRTemp bb       = newTemp(Ity_I64);
7798   IRTemp zero     = newTemp(Ity_I64);
7799   IRTemp bbNeg    = newTemp(Ity_I64);
7800   IRTemp negMask  = newTemp(Ity_I64);
7801   IRTemp posMask  = newTemp(Ity_I64);
7802   IROp   opSub    = Iop_INVALID;
7803   IROp   opCmpGTS = Iop_INVALID;
7804
7805   switch (laneszB) {
7806      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
7807      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
7808      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
7809      default: vassert(0);
7810   }
7811
7812   assign( aa,      aax );
7813   assign( bb,      bbx );
7814   assign( zero,    mkU64(0) );
7815   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
7816   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
7817   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
7818
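   /* For a nonzero aa lane exactly one of posMask/negMask is all ones,
      and both are all zeroes when the aa lane is zero, so the OR below
      yields bb_lane, -bb_lane or 0 as required. */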
7819   return
7820      binop(Iop_Or64,
7821            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
7822            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
7823
7824}
7825
7826/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
7827   value aa, computes, for each lane
7828
7829   if aa < 0 then -aa else aa
7830
7831   Note that the result is interpreted as unsigned, so that the
7832   absolute value of the most negative signed input can be
7833   represented.
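
   For example, an 8-bit lane holding 0x80 (-128) produces 0x80, which
   reads as +128 when interpreted as unsigned.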
7834*/
7835static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
7836{
7837   IRTemp aa      = newTemp(Ity_I64);
7838   IRTemp zero    = newTemp(Ity_I64);
7839   IRTemp aaNeg   = newTemp(Ity_I64);
7840   IRTemp negMask = newTemp(Ity_I64);
7841   IRTemp posMask = newTemp(Ity_I64);
7842   IROp   opSub   = Iop_INVALID;
7843   IROp   opSarN  = Iop_INVALID;
7844
7845   switch (laneszB) {
7846      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
7847      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
7848      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
7849      default: vassert(0);
7850   }
7851
7852   assign( aa,      aax );
7853   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
7854   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
7855   assign( zero,    mkU64(0) );
7856   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
7857   return
7858      binop(Iop_Or64,
7859            binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
7860            binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
7861}
7862
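/* Helper for the XMM (128-bit) PALIGNR.  Given two adjacent 64-bit
   chunks hi64:lo64 of the concatenated source and a byte shift in the
   range 1..7, this returns the 64 bits starting byteShift bytes into
   lo64, viz (hi64 << (64 - 8*byteShift)) | (lo64 >>u (8*byteShift)). */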
7863static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
7864                                        IRTemp lo64, Int byteShift )
7865{
7866   vassert(byteShift >= 1 && byteShift <= 7);
7867   return
7868      binop(Iop_Or64,
7869            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
7870            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
7871      );
7872}
7873
7874/* Generate a SIGSEGV followed by a restart of the current instruction
7875   if effective_addr is not 16-aligned.  This is required behaviour
7876   for some SSE3 instructions and all 128-bit SSSE3 instructions.
7877   This assumes that guest_EIP_curr_instr is set correctly! */
7878static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
7879{
7880   stmt(
7881      IRStmt_Exit(
7882         binop(Iop_CmpNE32,
7883               binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
7884               mkU32(0)),
7885         Ijk_SigSEGV,
7886         IRConst_U32(guest_EIP_curr_instr),
7887         OFFB_EIP
7888      )
7889   );
7890}
7891
7892
7893/* Helper for deciding whether a given insn (starting at the opcode
7894   byte) may validly be used with a LOCK prefix.  The following insns
7895   may be used with LOCK when their destination operand is in memory.
7896   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
7897
7898   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
7899   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
7900   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
7901   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
7902   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
7903   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
7904   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
7905
7906   DEC        FE /1,  FF /1
7907   INC        FE /0,  FF /0
7908
7909   NEG        F6 /3,  F7 /3
7910   NOT        F6 /2,  F7 /2
7911
7912   XCHG       86, 87
7913
7914   BTC        0F BB,  0F BA /7
7915   BTR        0F B3,  0F BA /6
7916   BTS        0F AB,  0F BA /5
7917
7918   CMPXCHG    0F B0,  0F B1
7919   CMPXCHG8B  0F C7 /1
7920
7921   XADD       0F C0,  0F C1
7922
7923   ------------------------------
7924
7925   80 /0  =  addb $imm8,  rm8
7926   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
7927   82 /0  =  addb $imm8,  rm8
7928   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
7929
7930   00     =  addb r8,  rm8
7931   01     =  addl r32, rm32  and  addw r16, rm16
7932
7933   Same for ADD OR ADC SBB AND SUB XOR
7934
7935   FE /1  = dec rm8
7936   FF /1  = dec rm32  and  dec rm16
7937
7938   FE /0  = inc rm8
7939   FF /0  = inc rm32  and  inc rm16
7940
7941   F6 /3  = neg rm8
7942   F7 /3  = neg rm32  and  neg rm16
7943
7944   F6 /2  = not rm8
7945   F7 /2  = not rm32  and  not rm16
7946
7947   0F BB     = btcw r16, rm16    and  btcl r32, rm32
7948   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
7949
7950   Same for BTS, BTR
7951*/
7952static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
7953{
7954   switch (opc[0]) {
7955      case 0x00: case 0x01: case 0x08: case 0x09:
7956      case 0x10: case 0x11: case 0x18: case 0x19:
7957      case 0x20: case 0x21: case 0x28: case 0x29:
7958      case 0x30: case 0x31:
7959         if (!epartIsReg(opc[1]))
7960            return True;
7961         break;
7962
7963      case 0x80: case 0x81: case 0x82: case 0x83:
7964         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
7965             && !epartIsReg(opc[1]))
7966            return True;
7967         break;
7968
7969      case 0xFE: case 0xFF:
7970         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
7971             && !epartIsReg(opc[1]))
7972            return True;
7973         break;
7974
7975      case 0xF6: case 0xF7:
7976         if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
7977             && !epartIsReg(opc[1]))
7978            return True;
7979         break;
7980
7981      case 0x86: case 0x87:
7982         if (!epartIsReg(opc[1]))
7983            return True;
7984         break;
7985
7986      case 0x0F: {
7987         switch (opc[1]) {
7988            case 0xBB: case 0xB3: case 0xAB:
7989               if (!epartIsReg(opc[2]))
7990                  return True;
7991               break;
7992            case 0xBA:
7993               if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
7994                   && !epartIsReg(opc[2]))
7995                  return True;
7996               break;
7997            case 0xB0: case 0xB1:
7998               if (!epartIsReg(opc[2]))
7999                  return True;
8000               break;
8001            case 0xC7:
8002               if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
8003                  return True;
8004               break;
8005            case 0xC0: case 0xC1:
8006               if (!epartIsReg(opc[2]))
8007                  return True;
8008               break;
8009            default:
8010               break;
8011         } /* switch (opc[1]) */
8012         break;
8013      }
8014
8015      default:
8016         break;
8017   } /* switch (opc[0]) */
8018
8019   return False;
8020}
8021
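/* Generate IR which byte-swaps the value in t1.  Only Ity_I32 and
   Ity_I16 are handled; the swapped value is returned in a fresh temp
   of the same type. */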
8022static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
8023{
8024   IRTemp t2 = newTemp(ty);
8025   if (ty == Ity_I32) {
8026      assign( t2,
8027         binop(
8028            Iop_Or32,
8029            binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
8030            binop(
8031               Iop_Or32,
8032               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
8033                                mkU32(0x00FF0000)),
8034               binop(Iop_Or32,
8035                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
8036                                      mkU32(0x0000FF00)),
8037                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
8038                                      mkU32(0x000000FF) )
8039            )))
8040      );
8041      return t2;
8042   }
8043   if (ty == Ity_I16) {
8044      assign(t2,
8045             binop(Iop_Or16,
8046                   binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
8047                   binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
8048      return t2;
8049   }
8050   vassert(0);
8051   /*NOTREACHED*/
8052   return IRTemp_INVALID;
8053}
8054
8055/*------------------------------------------------------------*/
8056/*--- Disassemble a single instruction                     ---*/
8057/*------------------------------------------------------------*/
8058
8059/* Disassemble a single instruction into IR.  The instruction is
8060   located in host memory at &guest_code[delta].  *expect_CAS is set
8061   to True if the resulting IR is expected to contain an IRCAS
8062   statement, and False if it's not expected to.  This makes it
8063   possible for the caller of disInstr_X86_WRK to check that
8064   LOCK-prefixed instructions are at least plausibly translated, in
8065   that it becomes possible to check that a (validly) LOCK-prefixed
8066   instruction generates a translation containing an IRCAS, and
8067   instructions without LOCK prefixes don't generate translations
8068   containing an IRCAS.
8069*/
8070static
8071DisResult disInstr_X86_WRK (
8072             /*OUT*/Bool* expect_CAS,
8073             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
8074             Bool         resteerCisOk,
8075             void*        callback_opaque,
8076             Long         delta64,
8077             const VexArchInfo* archinfo,
8078             const VexAbiInfo*  vbi,
8079             Bool         sigill_diag
8080          )
8081{
8082   IRType    ty;
8083   IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
8084   Int       alen;
8085   UChar     opc, modrm, abyte, pre;
8086   UInt      d32;
8087   HChar     dis_buf[50];
8088   Int       am_sz, d_sz, n_prefixes;
8089   DisResult dres;
8090   const UChar* insn; /* used in SSE decoders */
8091
8092   /* The running delta */
8093   Int delta = (Int)delta64;
8094
8095   /* Holds the delta at the start of the insn, so that we can print
8096      consistent error messages for unimplemented insns. */
8097   Int delta_start = delta;
8098
8099   /* sz denotes the nominal data-op size of the insn; we change it to
8100      2 if an 0x66 prefix is seen */
8101   Int sz = 4;
8102
8103   /* sorb holds the segment-override-prefix byte, if any.  Zero if no
8104      prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
8105      indicating the prefix.  */
8106   UChar sorb = 0;
8107
8108   /* Gets set to True if a LOCK prefix is seen. */
8109   Bool pfx_lock = False;
8110
8111   /* Set result defaults. */
8112   dres.whatNext    = Dis_Continue;
8113   dres.len         = 0;
8114   dres.continueAt  = 0;
8115   dres.jk_StopHere = Ijk_INVALID;
8116
8117   *expect_CAS = False;
8118
8119   addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
8120
8121   vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
8122   DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
8123
8124   /* Spot "Special" instructions (see comment at top of file). */
8125   {
8126      const UChar* code = guest_code + delta;
8127      /* Spot the 12-byte preamble:
8128         C1C703   roll $3,  %edi
8129         C1C70D   roll $13, %edi
8130         C1C71D   roll $29, %edi
8131         C1C713   roll $19, %edi
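         (the four rotate amounts sum to 64, so the sequence leaves
         %edi unchanged)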
8132      */
8133      if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
8134          code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
8135          code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
8136          code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
8137         /* Got a "Special" instruction preamble.  Which one is it? */
8138         if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
8139            /* %EDX = client_request ( %EAX ) */
8140            DIP("%%edx = client_request ( %%eax )\n");
8141            delta += 14;
8142            jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
8143            vassert(dres.whatNext == Dis_StopHere);
8144            goto decode_success;
8145         }
8146         else
8147         if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
8148            /* %EAX = guest_NRADDR */
8149            DIP("%%eax = guest_NRADDR\n");
8150            delta += 14;
8151            putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
8152            goto decode_success;
8153         }
8154         else
8155         if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
8156            /* call-noredir *%EAX */
8157            DIP("call-noredir *%%eax\n");
8158            delta += 14;
8159            t1 = newTemp(Ity_I32);
8160            assign(t1, getIReg(4,R_EAX));
8161            t2 = newTemp(Ity_I32);
8162            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
8163            putIReg(4, R_ESP, mkexpr(t2));
8164            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
8165            jmp_treg(&dres, Ijk_NoRedir, t1);
8166            vassert(dres.whatNext == Dis_StopHere);
8167            goto decode_success;
8168         }
8169         else
8170         if (code[12] == 0x87 && code[13] == 0xFF /* xchgl %edi,%edi */) {
8171            /* IR injection */
8172            DIP("IR injection\n");
8173            vex_inject_ir(irsb, Iend_LE);
8174
8175            // Invalidate the current insn. The reason is that the IRop we're
8176            // injecting here can change. In which case the translation has to
8177            // be redone. For ease of handling, we simply invalidate all the
8178            // time.
8179            stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_EIP_curr_instr)));
8180            stmt(IRStmt_Put(OFFB_CMLEN,   mkU32(14)));
8181
8182            delta += 14;
8183
8184            stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
8185            dres.whatNext    = Dis_StopHere;
8186            dres.jk_StopHere = Ijk_InvalICache;
8187            goto decode_success;
8188         }
8189         /* We don't know what it is. */
8190         goto decode_failure;
8191         /*NOTREACHED*/
8192      }
8193   }
8194
8195   /* Handle a couple of weird-ass NOPs that have been observed in the
8196      wild. */
8197   {
8198      const UChar* code = guest_code + delta;
8199      /* Sun's JVM 1.5.0 uses the following as a NOP:
8200         26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
8201      if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
8202          && code[3] == 0x65 && code[4] == 0x90) {
8203         DIP("%%es:%%cs:%%fs:%%gs:nop\n");
8204         delta += 5;
8205         goto decode_success;
8206      }
8207      /* Don't barf on recent binutils padding,
8208         all variants of which are: nopw %cs:0x0(%eax,%eax,1)
8209         66 2e 0f 1f 84 00 00 00 00 00
8210         66 66 2e 0f 1f 84 00 00 00 00 00
8211         66 66 66 2e 0f 1f 84 00 00 00 00 00
8212         66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8213         66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8214         66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8215      */
8216      if (code[0] == 0x66) {
8217         Int data16_cnt;
8218         for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
8219            if (code[data16_cnt] != 0x66)
8220               break;
8221         if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
8222             && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
8223             && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
8224             && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
8225             && code[data16_cnt + 8] == 0x00 ) {
8226            DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
8227            delta += 9 + data16_cnt;
8228            goto decode_success;
8229         }
8230      }
8231   }
8232
8233   /* Normal instruction handling starts here. */
8234
8235   /* Deal with some but not all prefixes:
8236         66(oso)
8237         F0(lock)
8238         2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
8239      Not dealt with (left in place):
8240         F2 F3
8241   */
8242   n_prefixes = 0;
8243   while (True) {
8244      if (n_prefixes > 7) goto decode_failure;
8245      pre = getUChar(delta);
8246      switch (pre) {
8247         case 0x66:
8248            sz = 2;
8249            break;
8250         case 0xF0:
8251            pfx_lock = True;
8252            *expect_CAS = True;
8253            break;
8254         case 0x3E: /* %DS: */
8255         case 0x26: /* %ES: */
8256         case 0x64: /* %FS: */
8257         case 0x65: /* %GS: */
8258            if (sorb != 0)
8259               goto decode_failure; /* only one seg override allowed */
8260            sorb = pre;
8261            break;
8262         case 0x2E: { /* %CS: */
8263            /* 2E prefix on a conditional branch instruction is a
8264               branch-prediction hint, which can safely be ignored.  */
8265            UChar op1 = getIByte(delta+1);
8266            UChar op2 = getIByte(delta+2);
8267            if ((op1 >= 0x70 && op1 <= 0x7F)
8268                || (op1 == 0xE3)
8269                || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
8270               if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
8271            } else {
8272               /* All other CS override cases are not handled */
8273               goto decode_failure;
8274            }
8275            break;
8276         }
8277         case 0x36: /* %SS: */
8278            /* SS override cases are not handled */
8279            goto decode_failure;
8280         default:
8281            goto not_a_prefix;
8282      }
8283      n_prefixes++;
8284      delta++;
8285   }
8286
8287   not_a_prefix:
8288
8289   /* Now we should be looking at the primary opcode byte or the
8290      leading F2 or F3.  Check that any LOCK prefix is actually
8291      allowed. */
8292
8293   if (pfx_lock) {
8294      if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
8295         DIP("lock ");
8296      } else {
8297         *expect_CAS = False;
8298         goto decode_failure;
8299      }
8300   }
8301
8302
8303   /* ---------------------------------------------------- */
8304   /* --- The SSE decoder.                             --- */
8305   /* ---------------------------------------------------- */
8306
8307   /* What did I do to deserve SSE ?  Perhaps I was really bad in a
8308      previous life? */
8309
8310   /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
8311      later section, further on. */
8312
8313   insn = &guest_code[delta];
8314
8315   /* Treat fxsave specially.  It should be doable even on an SSE0
8316      (Pentium-II class) CPU.  Hence be prepared to handle it on
8317      any subarchitecture variant.
8318   */
8319
8320   /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
8321   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8322       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
8323      IRDirty* d;
8324      modrm = getIByte(delta+2);
8325      vassert(sz == 4);
8326      vassert(!epartIsReg(modrm));
8327
8328      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8329      delta += 2+alen;
8330      gen_SEGV_if_not_16_aligned(addr);
8331
8332      DIP("fxsave %s\n", dis_buf);
8333
8334      /* Uses dirty helper:
8335            void x86g_dirtyhelper_FXSAVE ( VexGuestX86State*, UInt ) */
8336      d = unsafeIRDirty_0_N (
8337             0/*regparms*/,
8338             "x86g_dirtyhelper_FXSAVE",
8339             &x86g_dirtyhelper_FXSAVE,
8340             mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
8341          );
8342
8343      /* declare we're writing memory */
8344      d->mFx   = Ifx_Write;
8345      d->mAddr = mkexpr(addr);
8346      d->mSize = 464; /* according to recent Intel docs */
8347
8348      /* declare we're reading guest state */
8349      d->nFxState = 7;
8350      vex_bzero(&d->fxState, sizeof(d->fxState));
8351
8352      d->fxState[0].fx     = Ifx_Read;
8353      d->fxState[0].offset = OFFB_FTOP;
8354      d->fxState[0].size   = sizeof(UInt);
8355
8356      d->fxState[1].fx     = Ifx_Read;
8357      d->fxState[1].offset = OFFB_FPREGS;
8358      d->fxState[1].size   = 8 * sizeof(ULong);
8359
8360      d->fxState[2].fx     = Ifx_Read;
8361      d->fxState[2].offset = OFFB_FPTAGS;
8362      d->fxState[2].size   = 8 * sizeof(UChar);
8363
8364      d->fxState[3].fx     = Ifx_Read;
8365      d->fxState[3].offset = OFFB_FPROUND;
8366      d->fxState[3].size   = sizeof(UInt);
8367
8368      d->fxState[4].fx     = Ifx_Read;
8369      d->fxState[4].offset = OFFB_FC3210;
8370      d->fxState[4].size   = sizeof(UInt);
8371
8372      d->fxState[5].fx     = Ifx_Read;
8373      d->fxState[5].offset = OFFB_XMM0;
8374      d->fxState[5].size   = 8 * sizeof(U128);
8375
8376      d->fxState[6].fx     = Ifx_Read;
8377      d->fxState[6].offset = OFFB_SSEROUND;
8378      d->fxState[6].size   = sizeof(UInt);
8379
8380      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8381         images are packed back-to-back.  If not, the value of
8382         d->fxState[5].size is wrong. */
8383      vassert(16 == sizeof(U128));
8384      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8385
8386      stmt( IRStmt_Dirty(d) );
8387
8388      goto decode_success;
8389   }
8390
8391   /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
8392   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8393       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
8394      IRDirty* d;
8395      modrm = getIByte(delta+2);
8396      vassert(sz == 4);
8397      vassert(!epartIsReg(modrm));
8398
8399      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8400      delta += 2+alen;
8401      gen_SEGV_if_not_16_aligned(addr);
8402
8403      DIP("fxrstor %s\n", dis_buf);
8404
8405      /* Uses dirty helper:
8406            VexEmNote x86g_dirtyhelper_FXRSTOR ( VexGuestX86State*, UInt )
8407         NOTE:
8408            the VexEmNote value is simply ignored (unlike for FRSTOR)
8409      */
8410      d = unsafeIRDirty_0_N (
8411             0/*regparms*/,
8412             "x86g_dirtyhelper_FXRSTOR",
8413             &x86g_dirtyhelper_FXRSTOR,
8414             mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
8415          );
8416
8417      /* declare we're reading memory */
8418      d->mFx   = Ifx_Read;
8419      d->mAddr = mkexpr(addr);
8420      d->mSize = 464; /* according to recent Intel docs */
8421
8422      /* declare we're writing guest state */
8423      d->nFxState = 7;
8424      vex_bzero(&d->fxState, sizeof(d->fxState));
8425
8426      d->fxState[0].fx     = Ifx_Write;
8427      d->fxState[0].offset = OFFB_FTOP;
8428      d->fxState[0].size   = sizeof(UInt);
8429
8430      d->fxState[1].fx     = Ifx_Write;
8431      d->fxState[1].offset = OFFB_FPREGS;
8432      d->fxState[1].size   = 8 * sizeof(ULong);
8433
8434      d->fxState[2].fx     = Ifx_Write;
8435      d->fxState[2].offset = OFFB_FPTAGS;
8436      d->fxState[2].size   = 8 * sizeof(UChar);
8437
8438      d->fxState[3].fx     = Ifx_Write;
8439      d->fxState[3].offset = OFFB_FPROUND;
8440      d->fxState[3].size   = sizeof(UInt);
8441
8442      d->fxState[4].fx     = Ifx_Write;
8443      d->fxState[4].offset = OFFB_FC3210;
8444      d->fxState[4].size   = sizeof(UInt);
8445
8446      d->fxState[5].fx     = Ifx_Write;
8447      d->fxState[5].offset = OFFB_XMM0;
8448      d->fxState[5].size   = 8 * sizeof(U128);
8449
8450      d->fxState[6].fx     = Ifx_Write;
8451      d->fxState[6].offset = OFFB_SSEROUND;
8452      d->fxState[6].size   = sizeof(UInt);
8453
8454      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8455         images are packed back-to-back.  If not, the value of
8456         d->fxState[5].size is wrong. */
8457      vassert(16 == sizeof(U128));
8458      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8459
8460      stmt( IRStmt_Dirty(d) );
8461
8462      goto decode_success;
8463   }
8464
8465   /* ------ SSE decoder main ------ */
8466
8467   /* Skip parts of the decoder which don't apply given the stated
8468      guest subarchitecture. */
8469   if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
8470      goto after_sse_decoders;
8471
8472   /* With mmxext, only some extended MMX instructions are recognized.
8473      The mmxext instructions are MASKMOVQ MOVNTQ PAVGB PAVGW PMAXSW
8474      PMAXUB PMINSW PMINUB PMULHUW PSADBW PSHUFW PEXTRW PINSRW PMOVMSKB
8475      PREFETCHNTA PREFETCHT0 PREFETCHT1 PREFETCHT2 SFENCE
8476
8477      http://support.amd.com/us/Embedded_TechDocs/22466.pdf
8478      https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions */
8479
8480   if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
8481      goto mmxext;
8482
8483   /* Otherwise we must be doing sse1 or sse2, so we can at least try
8484      for SSE1 here. */
8485
8486   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
8487   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
8488      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
8489      goto decode_success;
8490   }
8491
8492   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
8493   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
8494      vassert(sz == 4);
8495      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
8496      goto decode_success;
8497   }
8498
8499   /* 0F 55 = ANDNPS -- G = (not G) and E */
8500   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
8501      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
8502      goto decode_success;
8503   }
8504
8505   /* 0F 54 = ANDPS -- G = G and E */
8506   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
8507      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
8508      goto decode_success;
8509   }
8510
8511   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
8512   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
8513      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
8514      goto decode_success;
8515   }
8516
8517   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
8518   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
8519      vassert(sz == 4);
8520      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
8521      goto decode_success;
8522   }
8523
8524   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
8525   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
8526   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
8527      IRTemp argL = newTemp(Ity_F32);
8528      IRTemp argR = newTemp(Ity_F32);
8529      modrm = getIByte(delta+2);
8530      if (epartIsReg(modrm)) {
8531         assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
8532         delta += 2+1;
8533         DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8534                                  nameXMMReg(gregOfRM(modrm)) );
8535      } else {
8536         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8537         assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
8538         delta += 2+alen;
8539         DIP("[u]comiss %s,%s\n", dis_buf,
8540                                  nameXMMReg(gregOfRM(modrm)) );
8541      }
8542      assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
8543
8544      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
8545      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
8546      stmt( IRStmt_Put(
8547               OFFB_CC_DEP1,
8548               binop( Iop_And32,
8549                      binop(Iop_CmpF64,
8550                            unop(Iop_F32toF64,mkexpr(argL)),
8551                            unop(Iop_F32toF64,mkexpr(argR))),
8552                      mkU32(0x45)
8553          )));
8554      /* Set NDEP even though it isn't used.  This makes redundant-PUT
8555         elimination of previous stores to this field work better. */
8556      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
8557      goto decode_success;
8558   }
8559
8560   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
8561      half xmm */
8562   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
8563      IRTemp arg64 = newTemp(Ity_I64);
8564      IRTemp rmode = newTemp(Ity_I32);
8565      vassert(sz == 4);
8566
8567      modrm = getIByte(delta+2);
8568      do_MMX_preamble();
8569      if (epartIsReg(modrm)) {
8570         assign( arg64, getMMXReg(eregOfRM(modrm)) );
8571         delta += 2+1;
8572         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8573                                 nameXMMReg(gregOfRM(modrm)));
8574      } else {
8575         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8576         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
8577         delta += 2+alen;
8578         DIP("cvtpi2ps %s,%s\n", dis_buf,
8579                                 nameXMMReg(gregOfRM(modrm)) );
8580      }
8581
8582      assign( rmode, get_sse_roundingmode() );
8583
8584      putXMMRegLane32F(
8585         gregOfRM(modrm), 0,
8586         binop(Iop_F64toF32,
8587               mkexpr(rmode),
8588               unop(Iop_I32StoF64,
8589                    unop(Iop_64to32, mkexpr(arg64)) )) );
8590
8591      putXMMRegLane32F(
8592         gregOfRM(modrm), 1,
8593         binop(Iop_F64toF32,
8594               mkexpr(rmode),
8595               unop(Iop_I32StoF64,
8596                    unop(Iop_64HIto32, mkexpr(arg64)) )) );
8597
8598      goto decode_success;
8599   }
8600
8601   /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
8602      quarter xmm */
8603   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
8604      IRTemp arg32 = newTemp(Ity_I32);
8605      IRTemp rmode = newTemp(Ity_I32);
8606      vassert(sz == 4);
8607
8608      modrm = getIByte(delta+3);
8609      if (epartIsReg(modrm)) {
8610         assign( arg32, getIReg(4, eregOfRM(modrm)) );
8611         delta += 3+1;
8612         DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
8613                                 nameXMMReg(gregOfRM(modrm)));
8614      } else {
8615         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8616         assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
8617         delta += 3+alen;
8618         DIP("cvtsi2ss %s,%s\n", dis_buf,
8619                                 nameXMMReg(gregOfRM(modrm)) );
8620      }
8621
8622      assign( rmode, get_sse_roundingmode() );
8623
8624      putXMMRegLane32F(
8625         gregOfRM(modrm), 0,
8626         binop(Iop_F64toF32,
8627               mkexpr(rmode),
8628               unop(Iop_I32StoF64, mkexpr(arg32)) ) );
8629
8630      goto decode_success;
8631   }
8632
8633   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8634      I32 in mmx, according to prevailing SSE rounding mode */
8635   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8636      I32 in mmx, rounding towards zero */
8637   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
8638      IRTemp dst64  = newTemp(Ity_I64);
8639      IRTemp rmode  = newTemp(Ity_I32);
8640      IRTemp f32lo  = newTemp(Ity_F32);
8641      IRTemp f32hi  = newTemp(Ity_F32);
8642      Bool   r2zero = toBool(insn[1] == 0x2C);
8643
8644      do_MMX_preamble();
8645      modrm = getIByte(delta+2);
8646
8647      if (epartIsReg(modrm)) {
8648         delta += 2+1;
8649         assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8650         assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
8651         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8652                                   nameXMMReg(eregOfRM(modrm)),
8653                                   nameMMXReg(gregOfRM(modrm)));
8654      } else {
8655         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8656         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8657         assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
8658                                              mkexpr(addr),
8659                                              mkU32(4) )));
8660         delta += 2+alen;
8661         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8662                                   dis_buf,
8663                                   nameMMXReg(gregOfRM(modrm)));
8664      }
8665
8666      if (r2zero) {
8667         assign(rmode, mkU32((UInt)Irrm_ZERO) );
8668      } else {
8669         assign( rmode, get_sse_roundingmode() );
8670      }
8671
8672      assign(
8673         dst64,
8674         binop( Iop_32HLto64,
8675                binop( Iop_F64toI32S,
8676                       mkexpr(rmode),
8677                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
8678                binop( Iop_F64toI32S,
8679                       mkexpr(rmode),
8680                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
8681              )
8682      );
8683
8684      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
8685      goto decode_success;
8686   }
8687
8688   /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
8689      I32 in ireg, according to prevailing SSE rounding mode */
8690   /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
8691      I32 in ireg, rounding towards zero */
8692   if (insn[0] == 0xF3 && insn[1] == 0x0F
8693       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
8694      IRTemp rmode = newTemp(Ity_I32);
8695      IRTemp f32lo = newTemp(Ity_F32);
8696      Bool   r2zero = toBool(insn[2] == 0x2C);
8697      vassert(sz == 4);
8698
8699      modrm = getIByte(delta+3);
8700      if (epartIsReg(modrm)) {
8701         delta += 3+1;
8702         assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8703         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8704                                   nameXMMReg(eregOfRM(modrm)),
8705                                   nameIReg(4, gregOfRM(modrm)));
8706      } else {
8707         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8708         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8709         delta += 3+alen;
8710         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8711                                   dis_buf,
8712                                   nameIReg(4, gregOfRM(modrm)));
8713      }
8714
8715      if (r2zero) {
8716         assign( rmode, mkU32((UInt)Irrm_ZERO) );
8717      } else {
8718         assign( rmode, get_sse_roundingmode() );
8719      }
8720
8721      putIReg(4, gregOfRM(modrm),
8722                 binop( Iop_F64toI32S,
8723                        mkexpr(rmode),
8724                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
8725      );
8726
8727      goto decode_success;
8728   }
8729
8730   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
8731   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
8732      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
8733      goto decode_success;
8734   }
8735
8736   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
8737   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
8738      vassert(sz == 4);
8739      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
8740      goto decode_success;
8741   }
8742
8743   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
8744   if (insn[0] == 0x0F && insn[1] == 0xAE
8745       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
8746
8747      IRTemp t64 = newTemp(Ity_I64);
8748      IRTemp ew = newTemp(Ity_I32);
8749
8750      modrm = getIByte(delta+2);
8751      vassert(!epartIsReg(modrm));
8752      vassert(sz == 4);
8753
8754      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8755      delta += 2+alen;
8756      DIP("ldmxcsr %s\n", dis_buf);
8757
8758      /* The only thing we observe in %mxcsr is the rounding mode.
8759         Therefore, pass the 32-bit value (SSE native-format control
8760         word) to a clean helper, getting back a 64-bit value, the
8761         lower half of which is the SSEROUND value to store, and the
8762         upper half of which is the emulation-warning token which may
8763         be generated.
8764      */
8765      /* ULong x86g_check_ldmxcsr ( UInt ); */
8766      assign( t64, mkIRExprCCall(
8767                      Ity_I64, 0/*regparms*/,
8768                      "x86g_check_ldmxcsr",
8769                      &x86g_check_ldmxcsr,
8770                      mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
8771                   )
8772            );
8773
8774      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
8775      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
8776      put_emwarn( mkexpr(ew) );
8777      /* Finally, if an emulation warning was reported, side-exit to
8778         the next insn, reporting the warning, so that Valgrind's
8779         dispatcher sees the warning. */
8780      stmt(
8781         IRStmt_Exit(
8782            binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
8783            Ijk_EmWarn,
8784            IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
8785            OFFB_EIP
8786         )
8787      );
8788      goto decode_success;
8789   }
8790
8791
8792   /* mmxext sse1 subset starts here. mmxext only arches will parse
8793      only this subset of the sse1 instructions. */
8794  mmxext:
8795
8796   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8797   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
8798   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
8799      Bool ok = False;
8800      delta = dis_MMX( &ok, sorb, sz, delta+1 );
8801      if (!ok)
8802         goto decode_failure;
8803      goto decode_success;
8804   }
8805
8806   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8807   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
8808      Intel manual does not say anything about the usual business of
8809      the FP reg tags getting trashed whenever an MMX insn happens.
8810      So we just leave them alone.
8811   */
8812   if (insn[0] == 0x0F && insn[1] == 0xE7) {
8813      modrm = getIByte(delta+2);
8814      if (sz == 4 && !epartIsReg(modrm)) {
8815         /* do_MMX_preamble(); Intel docs don't specify this */
8816         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8817         storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
8818         DIP("movntq %s,%s\n", nameMMXReg(gregOfRM(modrm)),
8819                               dis_buf);
8820         delta += 2+alen;
8821         goto decode_success;
8822      }
8823      /* else fall through */
8824   }
8825
8826   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8827   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
8828   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
8829      do_MMX_preamble();
8830      delta = dis_MMXop_regmem_to_reg (
8831                sorb, delta+2, insn[1], "pavgb", False );
8832      goto decode_success;
8833   }
8834
8835   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8836   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
8837   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
8838      do_MMX_preamble();
8839      delta = dis_MMXop_regmem_to_reg (
8840                sorb, delta+2, insn[1], "pavgw", False );
8841      goto decode_success;
8842   }
8843
8844   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8845   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
8846      zero-extend of it in ireg(G). */
8847   if (insn[0] == 0x0F && insn[1] == 0xC5) {
8848      modrm = insn[2];
8849      if (sz == 4 && epartIsReg(modrm)) {
8850         IRTemp sV = newTemp(Ity_I64);
8851         t5 = newTemp(Ity_I16);
8852         do_MMX_preamble();
8853         assign(sV, getMMXReg(eregOfRM(modrm)));
8854         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
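         /* Only the low two bits of the immediate select the lane. */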
8855         switch (insn[3] & 3) {
8856            case 0:  assign(t5, mkexpr(t0)); break;
8857            case 1:  assign(t5, mkexpr(t1)); break;
8858            case 2:  assign(t5, mkexpr(t2)); break;
8859            case 3:  assign(t5, mkexpr(t3)); break;
8860            default: vassert(0); /*NOTREACHED*/
8861         }
8862         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
8863         DIP("pextrw $%d,%s,%s\n",
8864             (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
8865                           nameIReg(4,gregOfRM(modrm)));
8866         delta += 4;
8867         goto decode_success;
8868      }
8869      /* else fall through */
8870   }
8871
8872   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8873   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
8874      put it into the specified lane of mmx(G). */
8875   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
8876      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
8877         mmx reg.  t4 is the new lane value.  t5 is the original
8878         mmx value. t6 is the new mmx value. */
8879      Int lane;
8880      t4 = newTemp(Ity_I16);
8881      t5 = newTemp(Ity_I64);
8882      t6 = newTemp(Ity_I64);
8883      modrm = insn[2];
8884      do_MMX_preamble();
8885
8886      assign(t5, getMMXReg(gregOfRM(modrm)));
8887      breakup64to16s( t5, &t3, &t2, &t1, &t0 );
8888
8889      if (epartIsReg(modrm)) {
8890         assign(t4, getIReg(2, eregOfRM(modrm)));
8891         delta += 3+1;
8892         lane = insn[3+1-1];
8893         DIP("pinsrw $%d,%s,%s\n", lane,
8894                                   nameIReg(2,eregOfRM(modrm)),
8895                                   nameMMXReg(gregOfRM(modrm)));
8896      } else {
8897         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8898         delta += 3+alen;
8899         lane = insn[3+alen-1];
8900         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
8901         DIP("pinsrw $%d,%s,%s\n", lane,
8902                                   dis_buf,
8903                                   nameMMXReg(gregOfRM(modrm)));
8904      }
8905
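      /* Rebuild the 64-bit MMX value with the selected lane replaced
         by t4. */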
8906      switch (lane & 3) {
8907         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
8908         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
8909         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
8910         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
8911         default: vassert(0); /*NOTREACHED*/
8912      }
8913      putMMXReg(gregOfRM(modrm), mkexpr(t6));
8914      goto decode_success;
8915   }
8916
8917   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8918   /* 0F EE = PMAXSW -- 16x4 signed max */
8919   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
8920      do_MMX_preamble();
8921      delta = dis_MMXop_regmem_to_reg (
8922                sorb, delta+2, insn[1], "pmaxsw", False );
8923      goto decode_success;
8924   }
8925
8926   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8927   /* 0F DE = PMAXUB -- 8x8 unsigned max */
8928   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
8929      do_MMX_preamble();
8930      delta = dis_MMXop_regmem_to_reg (
8931                sorb, delta+2, insn[1], "pmaxub", False );
8932      goto decode_success;
8933   }
8934
8935   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8936   /* 0F EA = PMINSW -- 16x4 signed min */
8937   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
8938      do_MMX_preamble();
8939      delta = dis_MMXop_regmem_to_reg (
8940                sorb, delta+2, insn[1], "pminsw", False );
8941      goto decode_success;
8942   }
8943
8944   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8945   /* 0F DA = PMINUB -- 8x8 unsigned min */
8946   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
8947      do_MMX_preamble();
8948      delta = dis_MMXop_regmem_to_reg (
8949                sorb, delta+2, insn[1], "pminub", False );
8950      goto decode_success;
8951   }
8952
8953   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8954   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
8955      mmx(E), turn them into a byte, and put zero-extend of it in
8956      ireg(G). */
8957   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
8958      modrm = insn[2];
8959      if (epartIsReg(modrm)) {
8960         do_MMX_preamble();
8961         t0 = newTemp(Ity_I64);
8962         t1 = newTemp(Ity_I32);
8963         assign(t0, getMMXReg(eregOfRM(modrm)));
8964         assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
8965         putIReg(4, gregOfRM(modrm), mkexpr(t1));
8966         DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8967                                 nameIReg(4,gregOfRM(modrm)));
8968         delta += 3;
8969         goto decode_success;
8970      }
8971      /* else fall through */
8972   }
8973
8974   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8975   /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
8976   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
8977      do_MMX_preamble();
8978      delta = dis_MMXop_regmem_to_reg (
8979                sorb, delta+2, insn[1], "pmulhuw", False );
8980      goto decode_success;
8981   }
8982
8983   /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
8984   /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
8985   /* 0F 18 /2 = PREFETCHT1 */
8986   /* 0F 18 /3 = PREFETCHT2 */
8987   if (insn[0] == 0x0F && insn[1] == 0x18
8988       && !epartIsReg(insn[2])
8989       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
8990      const HChar* hintstr = "??";
8991
8992      modrm = getIByte(delta+2);
8993      vassert(!epartIsReg(modrm));
8994
8995      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8996      delta += 2+alen;
8997
8998      switch (gregOfRM(modrm)) {
8999         case 0: hintstr = "nta"; break;
9000         case 1: hintstr = "t0"; break;
9001         case 2: hintstr = "t1"; break;
9002         case 3: hintstr = "t2"; break;
9003         default: vassert(0); /*NOTREACHED*/
9004      }
9005
9006      DIP("prefetch%s %s\n", hintstr, dis_buf);
9007      goto decode_success;
9008   }
9009
9010   /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
9011   /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
9012   if (insn[0] == 0x0F && insn[1] == 0x0D
9013       && !epartIsReg(insn[2])
9014       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
9015      const HChar* hintstr = "??";
9016
9017      modrm = getIByte(delta+2);
9018      vassert(!epartIsReg(modrm));
9019
9020      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9021      delta += 2+alen;
9022
9023      switch (gregOfRM(modrm)) {
9024         case 0: hintstr = ""; break;
9025         case 1: hintstr = "w"; break;
9026         default: vassert(0); /*NOTREACHED*/
9027      }
9028
9029      DIP("prefetch%s %s\n", hintstr, dis_buf);
9030      goto decode_success;
9031   }
9032
9033   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9034   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
9035   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
9036      do_MMX_preamble();
9037      delta = dis_MMXop_regmem_to_reg (
9038                 sorb, delta+2, insn[1], "psadbw", False );
9039      goto decode_success;
9040   }
9041
9042   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9043   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
9044   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
9045      Int order;
9046      IRTemp sV, dV, s3, s2, s1, s0;
9047      s3 = s2 = s1 = s0 = IRTemp_INVALID;
9048      sV = newTemp(Ity_I64);
9049      dV = newTemp(Ity_I64);
9050      do_MMX_preamble();
9051      modrm = insn[2];
9052      if (epartIsReg(modrm)) {
9053         assign( sV, getMMXReg(eregOfRM(modrm)) );
9054         order = (Int)insn[3];
9055         delta += 2+2;
9056         DIP("pshufw $%d,%s,%s\n", order,
9057                                   nameMMXReg(eregOfRM(modrm)),
9058                                   nameMMXReg(gregOfRM(modrm)));
9059      } else {
9060         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9061         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
9062         order = (Int)insn[2+alen];
9063         delta += 3+alen;
9064         DIP("pshufw $%d,%s,%s\n", order,
9065                                   dis_buf,
9066                                   nameMMXReg(gregOfRM(modrm)));
9067      }
9068      breakup64to16s( sV, &s3, &s2, &s1, &s0 );
9069
9070#     define SEL(n) \
9071                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9072      assign(dV,
9073             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
9074                          SEL((order>>2)&3), SEL((order>>0)&3) )
9075      );
9076      putMMXReg(gregOfRM(modrm), mkexpr(dV));
9077#     undef SEL
9078      goto decode_success;
9079   }
9080
9081   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
9082   if (insn[0] == 0x0F && insn[1] == 0xAE
9083       && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
9084      vassert(sz == 4);
9085      delta += 3;
9086      /* Insert a memory fence.  It's sometimes important that these
9087         are carried through to the generated code. */
9088      stmt( IRStmt_MBE(Imbe_Fence) );
9089      DIP("sfence\n");
9090      goto decode_success;
9091   }
9092
9093   /* End of mmxext sse1 subset. No more sse parsing for mmxext only arches. */
9094   if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
9095      goto after_sse_decoders;
9096
9097
9098   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
9099   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
9100      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
9101      goto decode_success;
9102   }
9103
9104   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
9105   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
9106      vassert(sz == 4);
9107      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
9108      goto decode_success;
9109   }
9110
9111   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
9112   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
9113      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
9114      goto decode_success;
9115   }
9116
9117   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
9118   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
9119      vassert(sz == 4);
9120      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
9121      goto decode_success;
9122   }
9123
9124   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
9125   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
9126   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
9127      modrm = getIByte(delta+2);
9128      if (epartIsReg(modrm)) {
9129         putXMMReg( gregOfRM(modrm),
9130                    getXMMReg( eregOfRM(modrm) ));
9131         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9132                                  nameXMMReg(gregOfRM(modrm)));
9133         delta += 2+1;
9134      } else {
9135         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9136         if (insn[1] == 0x28/*movaps*/)
9137            gen_SEGV_if_not_16_aligned( addr );
9138         putXMMReg( gregOfRM(modrm),
9139                    loadLE(Ity_V128, mkexpr(addr)) );
9140         DIP("mov[ua]ps %s,%s\n", dis_buf,
9141                                  nameXMMReg(gregOfRM(modrm)));
9142         delta += 2+alen;
9143      }
9144      goto decode_success;
9145   }
9146
9147   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
9148   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
9149   if (sz == 4 && insn[0] == 0x0F
9150       && (insn[1] == 0x29 || insn[1] == 0x11)) {
9151      modrm = getIByte(delta+2);
9152      if (epartIsReg(modrm)) {
9153         /* fall through; awaiting test case */
9154      } else {
9155         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9156         if (insn[1] == 0x29/*movaps*/)
9157            gen_SEGV_if_not_16_aligned( addr );
9158         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
9159         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
9160                                  dis_buf );
9161         delta += 2+alen;
9162         goto decode_success;
9163      }
9164   }
9165
9166   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
9167   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
9168   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
9169      modrm = getIByte(delta+2);
9170      if (epartIsReg(modrm)) {
9171         delta += 2+1;
9172         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
9173                          getXMMRegLane64( eregOfRM(modrm), 0 ) );
9174         DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9175                               nameXMMReg(gregOfRM(modrm)));
9176      } else {
9177         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9178         delta += 2+alen;
9179         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
9180                          loadLE(Ity_I64, mkexpr(addr)) );
9181         DIP("movhps %s,%s\n", dis_buf,
9182                               nameXMMReg( gregOfRM(modrm) ));
9183      }
9184      goto decode_success;
9185   }
9186
9187   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
9188   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
9189      if (!epartIsReg(insn[2])) {
9190         delta += 2;
9191         addr = disAMode ( &alen, sorb, delta, dis_buf );
9192         delta += alen;
9193         storeLE( mkexpr(addr),
9194                  getXMMRegLane64( gregOfRM(insn[2]),
9195                                   1/*upper lane*/ ) );
9196         DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
9197                               dis_buf);
9198         goto decode_success;
9199      }
9200      /* else fall through */
9201   }
9202
9203   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
9204   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
9205   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
9206      modrm = getIByte(delta+2);
9207      if (epartIsReg(modrm)) {
9208         delta += 2+1;
9209         putXMMRegLane64( gregOfRM(modrm),
9210                          0/*lower lane*/,
9211                          getXMMRegLane64( eregOfRM(modrm), 1 ));
9212         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
9213                                 nameXMMReg(gregOfRM(modrm)));
9214      } else {
9215         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9216         delta += 2+alen;
9217         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
9218                          loadLE(Ity_I64, mkexpr(addr)) );
9219         DIP("movlps %s, %s\n",
9220             dis_buf, nameXMMReg( gregOfRM(modrm) ));
9221      }
9222      goto decode_success;
9223   }
9224
9225   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
9226   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
9227      if (!epartIsReg(insn[2])) {
9228         delta += 2;
9229         addr = disAMode ( &alen, sorb, delta, dis_buf );
9230         delta += alen;
9231         storeLE( mkexpr(addr),
9232                  getXMMRegLane64( gregOfRM(insn[2]),
9233                                   0/*lower lane*/ ) );
9234         DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
9235                                dis_buf);
9236         goto decode_success;
9237      }
9238      /* else fall through */
9239   }
9240
9241   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
9242      to 4 lowest bits of ireg(G) */
9243   if (insn[0] == 0x0F && insn[1] == 0x50) {
9244      modrm = getIByte(delta+2);
9245      if (sz == 4 && epartIsReg(modrm)) {
9246         Int src;
9247         t0 = newTemp(Ity_I32);
9248         t1 = newTemp(Ity_I32);
9249         t2 = newTemp(Ity_I32);
9250         t3 = newTemp(Ity_I32);
9251         delta += 2+1;
9252         src = eregOfRM(modrm);
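         /* Extract the sign bit (bit 31) of each F32 lane: lane k is
            shifted right by (31-k) so that its sign bit lands at bit
            position k, then masked down to that single bit; the four
            bits are OR'd together into the low 4 bits of ireg(G). */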
9253         assign( t0, binop( Iop_And32,
9254                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
9255                            mkU32(1) ));
9256         assign( t1, binop( Iop_And32,
9257                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
9258                            mkU32(2) ));
9259         assign( t2, binop( Iop_And32,
9260                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
9261                            mkU32(4) ));
9262         assign( t3, binop( Iop_And32,
9263                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
9264                            mkU32(8) ));
9265         putIReg(4, gregOfRM(modrm),
9266                    binop(Iop_Or32,
9267                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
9268                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
9269                         )
9270                 );
9271         DIP("movmskps %s,%s\n", nameXMMReg(src),
9272                                 nameIReg(4, gregOfRM(modrm)));
9273         goto decode_success;
9274      }
9275      /* else fall through */
9276   }
9277
9278   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
9279   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
9280   if (insn[0] == 0x0F && insn[1] == 0x2B) {
9281      modrm = getIByte(delta+2);
9282      if (!epartIsReg(modrm)) {
9283         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9284         gen_SEGV_if_not_16_aligned( addr );
9285         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
9286         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
9287                                 dis_buf,
9288                                 nameXMMReg(gregOfRM(modrm)));
9289         delta += 2+alen;
9290         goto decode_success;
9291      }
9292      /* else fall through */
9293   }
9294
9295   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
9296      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
9297   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
9298      vassert(sz == 4);
9299      modrm = getIByte(delta+3);
9300      if (epartIsReg(modrm)) {
9301         putXMMRegLane32( gregOfRM(modrm), 0,
9302                          getXMMRegLane32( eregOfRM(modrm), 0 ));
9303         DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9304                              nameXMMReg(gregOfRM(modrm)));
9305         delta += 3+1;
9306      } else {
9307         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9308         /* zero bits 127:64 */
9309         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
9310         /* zero bits 63:32 */
9311         putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
9312         /* write bits 31:0 */
9313         putXMMRegLane32( gregOfRM(modrm), 0,
9314                          loadLE(Ity_I32, mkexpr(addr)) );
9315         DIP("movss %s,%s\n", dis_buf,
9316                              nameXMMReg(gregOfRM(modrm)));
9317         delta += 3+alen;
9318      }
9319      goto decode_success;
9320   }
9321
9322   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
9323      or lo 1/4 xmm). */
9324   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
9325      vassert(sz == 4);
9326      modrm = getIByte(delta+3);
9327      if (epartIsReg(modrm)) {
9328         /* fall through, we don't yet have a test case */
9329      } else {
9330         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9331         storeLE( mkexpr(addr),
9332                  getXMMRegLane32(gregOfRM(modrm), 0) );
9333         DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
9334                              dis_buf);
9335         delta += 3+alen;
9336         goto decode_success;
9337      }
9338   }
9339
9340   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
9341   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
9342      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
9343      goto decode_success;
9344   }
9345
9346   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
9347   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
9348      vassert(sz == 4);
9349      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
9350      goto decode_success;
9351   }
9352
9353   /* 0F 56 = ORPS -- G = G or E */
9354   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
9355      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
9356      goto decode_success;
9357   }
9358
9359   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
9360   if (insn[0] == 0x0F && insn[1] == 0x53) {
9361      vassert(sz == 4);
9362      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9363                                        "rcpps", Iop_RecipEst32Fx4 );
9364      goto decode_success;
9365   }
9366
9367   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
9368   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
9369      vassert(sz == 4);
9370      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9371                                         "rcpss", Iop_RecipEst32F0x4 );
9372      goto decode_success;
9373   }
9374
9375   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
9376   if (insn[0] == 0x0F && insn[1] == 0x52) {
9377      vassert(sz == 4);
9378      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9379                                        "rsqrtps", Iop_RSqrtEst32Fx4 );
9380      goto decode_success;
9381   }
9382
9383   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
9384   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
9385      vassert(sz == 4);
9386      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9387                                         "rsqrtss", Iop_RSqrtEst32F0x4 );
9388      goto decode_success;
9389   }
9390
9391   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
9392   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
9393      Int    select;
9394      IRTemp sV, dV;
9395      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9396      sV = newTemp(Ity_V128);
9397      dV = newTemp(Ity_V128);
9398      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9399      modrm = insn[2];
9400      assign( dV, getXMMReg(gregOfRM(modrm)) );
9401
9402      if (epartIsReg(modrm)) {
9403         assign( sV, getXMMReg(eregOfRM(modrm)) );
9404         select = (Int)insn[3];
9405         delta += 2+2;
9406         DIP("shufps $%d,%s,%s\n", select,
9407                                   nameXMMReg(eregOfRM(modrm)),
9408                                   nameXMMReg(gregOfRM(modrm)));
9409      } else {
9410         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9411         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9412         select = (Int)insn[2+alen];
9413         delta += 3+alen;
9414         DIP("shufps $%d,%s,%s\n", select,
9415                                   dis_buf,
9416                                   nameXMMReg(gregOfRM(modrm)));
9417      }
9418
9419      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9420      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9421
9422#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
9423#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9424
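      /* Result lanes 0 and 1 are picked from dV by imm8 bits [1:0] and
         [3:2]; lanes 2 and 3 are picked from sV by bits [5:4] and [7:6].
         For example select = 0x1B gives lane0 = d3, lane1 = d2,
         lane2 = s1, lane3 = s0. */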
9425      putXMMReg(
9426         gregOfRM(modrm),
9427         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
9428                       SELD((select>>2)&3), SELD((select>>0)&3) )
9429      );
9430
9431#     undef SELD
9432#     undef SELS
9433
9434      goto decode_success;
9435   }
9436
9437   /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
9438   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
9439      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9440                                        "sqrtps", Iop_Sqrt32Fx4 );
9441      goto decode_success;
9442   }
9443
9444   /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
9445   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
9446      vassert(sz == 4);
9447      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9448                                         "sqrtss", Iop_Sqrt32F0x4 );
9449      goto decode_success;
9450   }
9451
9452   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
9453   if (insn[0] == 0x0F && insn[1] == 0xAE
9454       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
9455      modrm = getIByte(delta+2);
9456      vassert(sz == 4);
9457      vassert(!epartIsReg(modrm));
9458
9459      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9460      delta += 2+alen;
9461
9462      /* Fake up a native SSE mxcsr word.  The only thing it depends
9463         on is SSEROUND[1:0], so call a clean helper to cook it up.
9464      */
9465      /* UInt x86g_create_mxcsr ( UInt sseround ) */
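      /* Hence only the rounding-control field of the value stored here
         reflects real guest state; the helper fills in the remaining
         MXCSR fields with fixed default values. */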
9466      DIP("stmxcsr %s\n", dis_buf);
9467      storeLE( mkexpr(addr),
9468               mkIRExprCCall(
9469                  Ity_I32, 0/*regp*/,
9470                  "x86g_create_mxcsr", &x86g_create_mxcsr,
9471                  mkIRExprVec_1( get_sse_roundingmode() )
9472               )
9473             );
9474      goto decode_success;
9475   }
9476
9477   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
9478   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
9479      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
9480      goto decode_success;
9481   }
9482
9483   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
9484   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
9485      vassert(sz == 4);
9486      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
9487      goto decode_success;
9488   }
9489
9490   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
9491   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
9492   /* Handled much like SHUFPS: split into 32-bit lanes, reassemble */
9493   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
9494      IRTemp sV, dV;
9495      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9496      Bool hi = toBool(insn[1] == 0x15);
9497      sV = newTemp(Ity_V128);
9498      dV = newTemp(Ity_V128);
9499      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9500      modrm = insn[2];
9501      assign( dV, getXMMReg(gregOfRM(modrm)) );
9502
9503      if (epartIsReg(modrm)) {
9504         assign( sV, getXMMReg(eregOfRM(modrm)) );
9505         delta += 2+1;
9506         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9507                                  nameXMMReg(eregOfRM(modrm)),
9508                                  nameXMMReg(gregOfRM(modrm)));
9509      } else {
9510         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9511         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9512         delta += 2+alen;
9513         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9514                                  dis_buf,
9515                                  nameXMMReg(gregOfRM(modrm)));
9516      }
9517
9518      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9519      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9520
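      /* unpckhps interleaves the two high halves: result lanes 3..0 are
         s3 d3 s2 d2.  unpcklps interleaves the two low halves: result
         lanes 3..0 are s1 d1 s0 d0. */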
9521      if (hi) {
9522         putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
9523      } else {
9524         putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
9525      }
9526
9527      goto decode_success;
9528   }
9529
9530   /* 0F 57 = XORPS -- G = G xor E */
9531   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
9532      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
9533      goto decode_success;
9534   }
9535
9536   /* ---------------------------------------------------- */
9537   /* --- end of the SSE decoder.                      --- */
9538   /* ---------------------------------------------------- */
9539
9540   /* ---------------------------------------------------- */
9541   /* --- start of the SSE2 decoder.                   --- */
9542   /* ---------------------------------------------------- */
9543
9544   /* Skip parts of the decoder which don't apply given the stated
9545      guest subarchitecture. */
9546   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
9547      goto after_sse_decoders; /* no SSE2 capabilities */
9548
9549   insn = &guest_code[delta];
9550
9551   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
9552   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
9553      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
9554      goto decode_success;
9555   }
9556
9557   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
9558   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
9559      vassert(sz == 4);
9560      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
9561      goto decode_success;
9562   }
9563
9564   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
9565   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
9566      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
9567      goto decode_success;
9568   }
9569
9570   /* 66 0F 54 = ANDPD -- G = G and E */
9571   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
9572      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
9573      goto decode_success;
9574   }
9575
9576   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
9577   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
9578      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
9579      goto decode_success;
9580   }
9581
9582   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
9583   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
9584      vassert(sz == 4);
9585      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
9586      goto decode_success;
9587   }
9588
9589   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
9590   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
9591   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
9592      IRTemp argL = newTemp(Ity_F64);
9593      IRTemp argR = newTemp(Ity_F64);
9594      modrm = getIByte(delta+2);
9595      if (epartIsReg(modrm)) {
9596         assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
9597         delta += 2+1;
9598         DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9599                                  nameXMMReg(gregOfRM(modrm)) );
9600      } else {
9601         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9602	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
9603         delta += 2+alen;
9604         DIP("[u]comisd %s,%s\n", dis_buf,
9605                                  nameXMMReg(gregOfRM(modrm)) );
9606      }
9607      assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
9608
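      /* Iop_CmpF64 yields an IRCmpF64Result: 0x00 greater, 0x01 less,
         0x40 equal, 0x45 unordered.  These values coincide with the x86
         positions of CF (bit 0), PF (bit 2) and ZF (bit 6), so masking
         with 0x45 and copying the result straight into the flags thunk
         gives the ZF/PF/CF settings that [U]COMISD defines, with
         OF/SF/AF coming out as zero. */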
9609      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
9610      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
9611      stmt( IRStmt_Put(
9612               OFFB_CC_DEP1,
9613               binop( Iop_And32,
9614                      binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
9615                      mkU32(0x45)
9616          )));
9617      /* Set NDEP even though it isn't used.  This makes redundant-PUT
9618         elimination of previous stores to this field work better. */
9619      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
9620      goto decode_success;
9621   }
9622
9623   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
9624      F64 in xmm(G) */
9625   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
9626      IRTemp arg64 = newTemp(Ity_I64);
9627      vassert(sz == 4);
9628
9629      modrm = getIByte(delta+3);
9630      if (epartIsReg(modrm)) {
9631         assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
9632         delta += 3+1;
9633         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9634                                 nameXMMReg(gregOfRM(modrm)));
9635      } else {
9636         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9637	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9638         delta += 3+alen;
9639         DIP("cvtdq2pd %s,%s\n", dis_buf,
9640                                 nameXMMReg(gregOfRM(modrm)) );
9641      }
9642
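      /* I32 -> F64 conversion is exact, so no rounding mode is needed:
         the low I32 of the source fills the low F64 lane and the high
         I32 fills the high lane. */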
9643      putXMMRegLane64F(
9644         gregOfRM(modrm), 0,
9645         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
9646      );
9647
9648      putXMMRegLane64F(
9649         gregOfRM(modrm), 1,
9650         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
9651      );
9652
9653      goto decode_success;
9654   }
9655
9656   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
9657      xmm(G) */
9658   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
9659      IRTemp argV  = newTemp(Ity_V128);
9660      IRTemp rmode = newTemp(Ity_I32);
9661
9662      modrm = getIByte(delta+2);
9663      if (epartIsReg(modrm)) {
9664         assign( argV, getXMMReg(eregOfRM(modrm)) );
9665         delta += 2+1;
9666         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9667                                 nameXMMReg(gregOfRM(modrm)));
9668      } else {
9669         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9670	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9671         delta += 2+alen;
9672         DIP("cvtdq2ps %s,%s\n", dis_buf,
9673                                 nameXMMReg(gregOfRM(modrm)) );
9674      }
9675
9676      assign( rmode, get_sse_roundingmode() );
9677      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9678
9679#     define CVT(_t)  binop( Iop_F64toF32,                    \
9680                             mkexpr(rmode),                   \
9681                             unop(Iop_I32StoF64,mkexpr(_t)))
9682
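      /* Each I32 lane converts exactly to F64 and is then narrowed to
         F32 under the prevailing SSE rounding mode, so only one rounding
         step occurs, as in the real instruction. */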
9683      putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
9684      putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
9685      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9686      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9687
9688#     undef CVT
9689
9690      goto decode_success;
9691   }
9692
9693   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
9694      lo half xmm(G), and zero upper half */
9695   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
9696      IRTemp argV  = newTemp(Ity_V128);
9697      IRTemp rmode = newTemp(Ity_I32);
9698      vassert(sz == 4);
9699
9700      modrm = getIByte(delta+3);
9701      if (epartIsReg(modrm)) {
9702         assign( argV, getXMMReg(eregOfRM(modrm)) );
9703         delta += 3+1;
9704         DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9705                                 nameXMMReg(gregOfRM(modrm)));
9706      } else {
9707         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9708	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9709         delta += 3+alen;
9710         DIP("cvtpd2dq %s,%s\n", dis_buf,
9711                                 nameXMMReg(gregOfRM(modrm)) );
9712      }
9713
9714      assign( rmode, get_sse_roundingmode() );
9715      t0 = newTemp(Ity_F64);
9716      t1 = newTemp(Ity_F64);
9717      assign( t0, unop(Iop_ReinterpI64asF64,
9718                       unop(Iop_V128to64, mkexpr(argV))) );
9719      assign( t1, unop(Iop_ReinterpI64asF64,
9720                       unop(Iop_V128HIto64, mkexpr(argV))) );
9721
9722#     define CVT(_t)  binop( Iop_F64toI32S,                   \
9723                             mkexpr(rmode),                   \
9724                             mkexpr(_t) )
9725
9726      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
9727      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
9728      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9729      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9730
9731#     undef CVT
9732
9733      goto decode_success;
9734   }
9735
9736   /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9737      I32 in mmx, according to prevailing SSE rounding mode */
9738   /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9739      I32 in mmx, rounding towards zero */
9740   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
9741      IRTemp dst64  = newTemp(Ity_I64);
9742      IRTemp rmode  = newTemp(Ity_I32);
9743      IRTemp f64lo  = newTemp(Ity_F64);
9744      IRTemp f64hi  = newTemp(Ity_F64);
9745      Bool   r2zero = toBool(insn[1] == 0x2C);
9746
9747      do_MMX_preamble();
9748      modrm = getIByte(delta+2);
9749
9750      if (epartIsReg(modrm)) {
9751         delta += 2+1;
9752	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9753	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
9754         DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
9755                                   nameXMMReg(eregOfRM(modrm)),
9756                                   nameMMXReg(gregOfRM(modrm)));
9757      } else {
9758         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9759	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9760	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
9761                                              mkexpr(addr),
9762                                              mkU32(8) )));
9763         delta += 2+alen;
9764         DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
9765                                   dis_buf,
9766                                   nameMMXReg(gregOfRM(modrm)));
9767      }
9768
9769      if (r2zero) {
9770         assign(rmode, mkU32((UInt)Irrm_ZERO) );
9771      } else {
9772         assign( rmode, get_sse_roundingmode() );
9773      }
9774
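      /* Convert each half and pack the two I32 results back into a
         single 64-bit MMX value, with the result from the high F64 in
         the upper 32 bits. */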
9775      assign(
9776         dst64,
9777         binop( Iop_32HLto64,
9778                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
9779                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
9780              )
9781      );
9782
9783      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
9784      goto decode_success;
9785   }
9786
9787   /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
9788      lo half xmm(G), and zero upper half */
9789   /* Note, this is practically identical to CVTPD2DQ.  It would have
9790      been nicer to merge them together, but the insn[] offsets differ
9791      by one. */
9792   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
9793      IRTemp argV  = newTemp(Ity_V128);
9794      IRTemp rmode = newTemp(Ity_I32);
9795
9796      modrm = getIByte(delta+2);
9797      if (epartIsReg(modrm)) {
9798         assign( argV, getXMMReg(eregOfRM(modrm)) );
9799         delta += 2+1;
9800         DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9801                                 nameXMMReg(gregOfRM(modrm)));
9802      } else {
9803         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9804	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9805         delta += 2+alen;
9806         DIP("cvtpd2ps %s,%s\n", dis_buf,
9807                                 nameXMMReg(gregOfRM(modrm)) );
9808      }
9809
9810      assign( rmode, get_sse_roundingmode() );
9811      t0 = newTemp(Ity_F64);
9812      t1 = newTemp(Ity_F64);
9813      assign( t0, unop(Iop_ReinterpI64asF64,
9814                       unop(Iop_V128to64, mkexpr(argV))) );
9815      assign( t1, unop(Iop_ReinterpI64asF64,
9816                       unop(Iop_V128HIto64, mkexpr(argV))) );
9817
9818#     define CVT(_t)  binop( Iop_F64toF32,                    \
9819                             mkexpr(rmode),                   \
9820                             mkexpr(_t) )
9821
9822      putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
9823      putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
9824      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9825      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9826
9827#     undef CVT
9828
9829      goto decode_success;
9830   }
9831
9832   /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
9833      xmm(G) */
9834   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
9835      IRTemp arg64 = newTemp(Ity_I64);
9836
9837      modrm = getIByte(delta+2);
9838      if (epartIsReg(modrm)) {
9839         /* Only switch to MMX mode if the source is an MMX register.
9840            This is inconsistent with all other instructions which
9841            convert between XMM and (M64 or MMX), which always switch
9842            to MMX mode even if the 64-bit operand is M64 and not MMX.
9843            At least, that's what the Intel docs seem to me to say.
9844            Fixes #210264. */
9845         do_MMX_preamble();
9846         assign( arg64, getMMXReg(eregOfRM(modrm)) );
9847         delta += 2+1;
9848         DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
9849                                 nameXMMReg(gregOfRM(modrm)));
9850      } else {
9851         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9852	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9853         delta += 2+alen;
9854         DIP("cvtpi2pd %s,%s\n", dis_buf,
9855                                 nameXMMReg(gregOfRM(modrm)) );
9856      }
9857
9858      putXMMRegLane64F(
9859         gregOfRM(modrm), 0,
9860         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
9861      );
9862
9863      putXMMRegLane64F(
9864         gregOfRM(modrm), 1,
9865         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
9866      );
9867
9868      goto decode_success;
9869   }
9870
9871   /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
9872      xmm(G) */
9873   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
9874      IRTemp argV  = newTemp(Ity_V128);
9875      IRTemp rmode = newTemp(Ity_I32);
9876
9877      modrm = getIByte(delta+2);
9878      if (epartIsReg(modrm)) {
9879         assign( argV, getXMMReg(eregOfRM(modrm)) );
9880         delta += 2+1;
9881         DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9882                                 nameXMMReg(gregOfRM(modrm)));
9883      } else {
9884         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9885	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9886         delta += 2+alen;
9887         DIP("cvtps2dq %s,%s\n", dis_buf,
9888                                 nameXMMReg(gregOfRM(modrm)) );
9889      }
9890
9891      assign( rmode, get_sse_roundingmode() );
9892      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9893
9894      /* This is less than ideal.  If it turns out to be a performance
9895	 bottleneck it can be improved. */
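      /* Convert one lane at a time: reinterpret the 32 bits as F32,
         widen (exactly) to F64, then convert to I32 under the prevailing
         SSE rounding mode. */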
9896#     define CVT(_t)                            \
9897        binop( Iop_F64toI32S,                   \
9898               mkexpr(rmode),                   \
9899               unop( Iop_F32toF64,              \
9900                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9901
9902      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
9903      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
9904      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9905      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9906
9907#     undef CVT
9908
9909      goto decode_success;
9910   }
9911
9912   /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
9913      F64 in xmm(G). */
9914   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
9915      IRTemp f32lo = newTemp(Ity_F32);
9916      IRTemp f32hi = newTemp(Ity_F32);
9917
9918      modrm = getIByte(delta+2);
9919      if (epartIsReg(modrm)) {
9920         assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
9921         assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
9922         delta += 2+1;
9923         DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9924                                 nameXMMReg(gregOfRM(modrm)));
9925      } else {
9926         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9927	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
9928	 assign( f32hi, loadLE(Ity_F32,
9929                               binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
9930         delta += 2+alen;
9931         DIP("cvtps2pd %s,%s\n", dis_buf,
9932                                 nameXMMReg(gregOfRM(modrm)) );
9933      }
9934
9935      putXMMRegLane64F( gregOfRM(modrm), 1,
9936                        unop(Iop_F32toF64, mkexpr(f32hi)) );
9937      putXMMRegLane64F( gregOfRM(modrm), 0,
9938                        unop(Iop_F32toF64, mkexpr(f32lo)) );
9939
9940      goto decode_success;
9941   }
9942
9943   /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
9944      I32 in ireg, according to prevailing SSE rounding mode */
9945   /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
9946      I32 in ireg, rounding towards zero */
9947   if (insn[0] == 0xF2 && insn[1] == 0x0F
9948       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
9949      IRTemp rmode = newTemp(Ity_I32);
9950      IRTemp f64lo = newTemp(Ity_F64);
9951      Bool   r2zero = toBool(insn[2] == 0x2C);
9952      vassert(sz == 4);
9953
9954      modrm = getIByte(delta+3);
9955      if (epartIsReg(modrm)) {
9956         delta += 3+1;
9957	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9958         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9959                                   nameXMMReg(eregOfRM(modrm)),
9960                                   nameIReg(4, gregOfRM(modrm)));
9961      } else {
9962         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9963	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9964         delta += 3+alen;
9965         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9966                                   dis_buf,
9967                                   nameIReg(4, gregOfRM(modrm)));
9968      }
9969
9970      if (r2zero) {
9971         assign( rmode, mkU32((UInt)Irrm_ZERO) );
9972      } else {
9973         assign( rmode, get_sse_roundingmode() );
9974      }
9975
9976      putIReg(4, gregOfRM(modrm),
9977                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
9978
9979      goto decode_success;
9980   }
9981
9982   /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
9983      low 1/4 xmm(G), according to prevailing SSE rounding mode */
9984   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
9985      IRTemp rmode = newTemp(Ity_I32);
9986      IRTemp f64lo = newTemp(Ity_F64);
9987      vassert(sz == 4);
9988
9989      modrm = getIByte(delta+3);
9990      if (epartIsReg(modrm)) {
9991         delta += 3+1;
9992	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9993         DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9994                                 nameXMMReg(gregOfRM(modrm)));
9995      } else {
9996         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9997	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9998         delta += 3+alen;
9999         DIP("cvtsd2ss %s,%s\n", dis_buf,
10000                                 nameXMMReg(gregOfRM(modrm)));
10001      }
10002
10003      assign( rmode, get_sse_roundingmode() );
10004      putXMMRegLane32F(
10005         gregOfRM(modrm), 0,
10006         binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
10007      );
10008
10009      goto decode_success;
10010   }
10011
10012   /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
10013      half xmm */
10014   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
10015      IRTemp arg32 = newTemp(Ity_I32);
10016      vassert(sz == 4);
10017
10018      modrm = getIByte(delta+3);
10019      if (epartIsReg(modrm)) {
10020         assign( arg32, getIReg(4, eregOfRM(modrm)) );
10021         delta += 3+1;
10022         DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
10023                                 nameXMMReg(gregOfRM(modrm)));
10024      } else {
10025         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10026	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
10027         delta += 3+alen;
10028         DIP("cvtsi2sd %s,%s\n", dis_buf,
10029                                 nameXMMReg(gregOfRM(modrm)) );
10030      }
10031
10032      putXMMRegLane64F(
10033         gregOfRM(modrm), 0,
10034         unop(Iop_I32StoF64, mkexpr(arg32)) );
10035
10036      goto decode_success;
10037   }
10038
10039   /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
10040      low half xmm(G) */
10041   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
10042      IRTemp f32lo = newTemp(Ity_F32);
10043      vassert(sz == 4);
10044
10045      modrm = getIByte(delta+3);
10046      if (epartIsReg(modrm)) {
10047         delta += 3+1;
10048	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
10049         DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10050                                 nameXMMReg(gregOfRM(modrm)));
10051      } else {
10052         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10053	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
10054         delta += 3+alen;
10055         DIP("cvtss2sd %s,%s\n", dis_buf,
10056                                 nameXMMReg(gregOfRM(modrm)));
10057      }
10058
10059      putXMMRegLane64F( gregOfRM(modrm), 0,
10060                        unop( Iop_F32toF64, mkexpr(f32lo) ) );
10061
10062      goto decode_success;
10063   }
10064
10065   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
10066      lo half xmm(G), and zero upper half, rounding towards zero */
10067   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
10068      IRTemp argV  = newTemp(Ity_V128);
10069      IRTemp rmode = newTemp(Ity_I32);
10070
10071      modrm = getIByte(delta+2);
10072      if (epartIsReg(modrm)) {
10073         assign( argV, getXMMReg(eregOfRM(modrm)) );
10074         delta += 2+1;
10075         DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10076                                  nameXMMReg(gregOfRM(modrm)));
10077      } else {
10078         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10079	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10080         delta += 2+alen;
10081         DIP("cvttpd2dq %s,%s\n", dis_buf,
10082                                  nameXMMReg(gregOfRM(modrm)) );
10083      }
10084
10085      assign( rmode, mkU32((UInt)Irrm_ZERO) );
10086
10087      t0 = newTemp(Ity_F64);
10088      t1 = newTemp(Ity_F64);
10089      assign( t0, unop(Iop_ReinterpI64asF64,
10090                       unop(Iop_V128to64, mkexpr(argV))) );
10091      assign( t1, unop(Iop_ReinterpI64asF64,
10092                       unop(Iop_V128HIto64, mkexpr(argV))) );
10093
10094#     define CVT(_t)  binop( Iop_F64toI32S,                   \
10095                             mkexpr(rmode),                   \
10096                             mkexpr(_t) )
10097
10098      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
10099      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
10100      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
10101      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
10102
10103#     undef CVT
10104
10105      goto decode_success;
10106   }
10107
10108   /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
10109      xmm(G), rounding towards zero */
10110   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
10111      IRTemp argV  = newTemp(Ity_V128);
10112      IRTemp rmode = newTemp(Ity_I32);
10113      vassert(sz == 4);
10114
10115      modrm = getIByte(delta+3);
10116      if (epartIsReg(modrm)) {
10117         assign( argV, getXMMReg(eregOfRM(modrm)) );
10118         delta += 3+1;
10119         DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10120                                  nameXMMReg(gregOfRM(modrm)));
10121      } else {
10122         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10123	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10124         delta += 3+alen;
10125         DIP("cvttps2dq %s,%s\n", dis_buf,
10126                                  nameXMMReg(gregOfRM(modrm)) );
10127      }
10128
10129      assign( rmode, mkU32((UInt)Irrm_ZERO) );
10130      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
10131
10132      /* This is less than ideal.  If it turns out to be a performance
10133	 bottleneck it can be improved. */
10134#     define CVT(_t)                            \
10135        binop( Iop_F64toI32S,                   \
10136               mkexpr(rmode),                   \
10137               unop( Iop_F32toF64,              \
10138                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
10139
10140      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
10141      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
10142      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
10143      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
10144
10145#     undef CVT
10146
10147      goto decode_success;
10148   }
10149
10150   /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
10151   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
10152      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
10153      goto decode_success;
10154   }
10155
10156   /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
10157   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
10158      vassert(sz == 4);
10159      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
10160      goto decode_success;
10161   }
10162
10163   /* 0F AE /5 = LFENCE -- flush pending operations to memory */
10164   /* 0F AE /6 = MFENCE -- flush pending operations to memory */
10165   if (insn[0] == 0x0F && insn[1] == 0xAE
10166       && epartIsReg(insn[2])
10167       && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
10168      vassert(sz == 4);
10169      delta += 3;
10170      /* Insert a memory fence.  It's sometimes important that these
10171         are carried through to the generated code. */
10172      stmt( IRStmt_MBE(Imbe_Fence) );
10173      DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
10174      goto decode_success;
10175   }
10176
10177   /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
10178   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
10179      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
10180      goto decode_success;
10181   }
10182
10183   /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
10184   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
10185      vassert(sz == 4);
10186      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
10187      goto decode_success;
10188   }
10189
10190   /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
10191   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
10192      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
10193      goto decode_success;
10194   }
10195
10196   /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
10197   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
10198      vassert(sz == 4);
10199      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
10200      goto decode_success;
10201   }
10202
10203   /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
10204   /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
10205   /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
10206   if (sz == 2 && insn[0] == 0x0F
10207       && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
10208      const HChar* wot = insn[1]==0x28 ? "apd" :
10209                         insn[1]==0x10 ? "upd" : "dqa";
10210      modrm = getIByte(delta+2);
10211      if (epartIsReg(modrm)) {
10212         putXMMReg( gregOfRM(modrm),
10213                    getXMMReg( eregOfRM(modrm) ));
10214         DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
10215                                   nameXMMReg(gregOfRM(modrm)));
10216         delta += 2+1;
10217      } else {
10218         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10219         if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
10220            gen_SEGV_if_not_16_aligned( addr );
10221         putXMMReg( gregOfRM(modrm),
10222                    loadLE(Ity_V128, mkexpr(addr)) );
10223         DIP("mov%s %s,%s\n", wot, dis_buf,
10224                                   nameXMMReg(gregOfRM(modrm)));
10225         delta += 2+alen;
10226      }
10227      goto decode_success;
10228   }
10229
10230   /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
10231   /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
10232   if (sz == 2 && insn[0] == 0x0F
10233       && (insn[1] == 0x29 || insn[1] == 0x11)) {
10234      const HChar* wot = insn[1]==0x29 ? "apd" : "upd";
10235      modrm = getIByte(delta+2);
10236      if (epartIsReg(modrm)) {
10237         /* fall through; awaiting test case */
10238      } else {
10239         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10240         if (insn[1] == 0x29/*movapd*/)
10241            gen_SEGV_if_not_16_aligned( addr );
10242         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10243         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
10244                                   dis_buf );
10245         delta += 2+alen;
10246         goto decode_success;
10247      }
10248   }
10249
10250   /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
10251   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
10252      modrm = getIByte(delta+2);
10253      if (epartIsReg(modrm)) {
10254         delta += 2+1;
10255         putXMMReg(
10256            gregOfRM(modrm),
10257            unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
10258         );
10259         DIP("movd %s, %s\n",
10260             nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
10261      } else {
10262         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10263         delta += 2+alen;
10264         putXMMReg(
10265            gregOfRM(modrm),
10266            unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
10267         );
10268         DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
10269      }
10270      goto decode_success;
10271   }
10272
10273   /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
10274   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
10275      modrm = getIByte(delta+2);
10276      if (epartIsReg(modrm)) {
10277         delta += 2+1;
10278         putIReg( 4, eregOfRM(modrm),
10279                  getXMMRegLane32(gregOfRM(modrm), 0) );
10280         DIP("movd %s, %s\n",
10281             nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
10282      } else {
10283         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10284         delta += 2+alen;
10285         storeLE( mkexpr(addr),
10286                  getXMMRegLane32(gregOfRM(modrm), 0) );
10287         DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10288      }
10289      goto decode_success;
10290   }
10291
10292   /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
10293   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
10294      modrm = getIByte(delta+2);
10295      if (epartIsReg(modrm)) {
10296         delta += 2+1;
10297         putXMMReg( eregOfRM(modrm),
10298                    getXMMReg(gregOfRM(modrm)) );
10299         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10300                                nameXMMReg(eregOfRM(modrm)));
10301      } else {
10302         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10303         delta += 2+alen;
10304         gen_SEGV_if_not_16_aligned( addr );
10305         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10306         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10307      }
10308      goto decode_success;
10309   }
10310
10311   /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
10312   /* Unfortunately can't simply use the MOVDQA case since the
10313      prefix lengths are different (66 vs F3) */
10314   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
10315      vassert(sz == 4);
10316      modrm = getIByte(delta+3);
10317      if (epartIsReg(modrm)) {
10318         putXMMReg( gregOfRM(modrm),
10319                    getXMMReg( eregOfRM(modrm) ));
10320         DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10321                               nameXMMReg(gregOfRM(modrm)));
10322         delta += 3+1;
10323      } else {
10324         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10325         putXMMReg( gregOfRM(modrm),
10326                    loadLE(Ity_V128, mkexpr(addr)) );
10327         DIP("movdqu %s,%s\n", dis_buf,
10328                               nameXMMReg(gregOfRM(modrm)));
10329         delta += 3+alen;
10330      }
10331      goto decode_success;
10332   }
10333
10334   /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
10335   /* Unfortunately can't simply use the MOVDQA case since the
10336      prefix lengths are different (66 vs F3) */
10337   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
10338      vassert(sz == 4);
10339      modrm = getIByte(delta+3);
10340      if (epartIsReg(modrm)) {
10341         delta += 3+1;
10342         putXMMReg( eregOfRM(modrm),
10343                    getXMMReg(gregOfRM(modrm)) );
10344         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10345                                nameXMMReg(eregOfRM(modrm)));
10346      } else {
10347         addr = disAMode( &alen, sorb, delta+3, dis_buf );
10348         delta += 3+alen;
10349         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10350         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10351      }
10352      goto decode_success;
10353   }
10354
10355   /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
10356   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
10357      vassert(sz == 4);
10358      modrm = getIByte(delta+3);
10359      if (epartIsReg(modrm)) {
10360         do_MMX_preamble();
10361         putMMXReg( gregOfRM(modrm),
10362                    getXMMRegLane64( eregOfRM(modrm), 0 ));
10363         DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10364                                nameMMXReg(gregOfRM(modrm)));
10365         delta += 3+1;
10366         goto decode_success;
10367      } else {
10368         /* fall through, apparently no mem case for this insn */
10369      }
10370   }
10371
10372   /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
10373   /* This seems identical to MOVHPS.  This instruction encoding is
10374      completely crazy. */
10375   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
10376      modrm = getIByte(delta+2);
10377      if (epartIsReg(modrm)) {
10378         /* fall through; apparently reg-reg is not possible */
10379      } else {
10380         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10381         delta += 2+alen;
10382         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
10383                          loadLE(Ity_I64, mkexpr(addr)) );
10384         DIP("movhpd %s,%s\n", dis_buf,
10385                               nameXMMReg( gregOfRM(modrm) ));
10386         goto decode_success;
10387      }
10388   }
10389
10390   /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
10391   /* Again, this seems identical to MOVHPS. */
10392   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
10393      if (!epartIsReg(insn[2])) {
10394         delta += 2;
10395         addr = disAMode ( &alen, sorb, delta, dis_buf );
10396         delta += alen;
10397         storeLE( mkexpr(addr),
10398                  getXMMRegLane64( gregOfRM(insn[2]),
10399                                   1/*upper lane*/ ) );
10400         DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
10401                               dis_buf);
10402         goto decode_success;
10403      }
10404      /* else fall through */
10405   }
10406
10407   /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
10408   /* Identical to MOVLPS ? */
10409   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
10410      modrm = getIByte(delta+2);
10411      if (epartIsReg(modrm)) {
10412         /* fall through; apparently reg-reg is not possible */
10413      } else {
10414         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10415         delta += 2+alen;
10416         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
10417                          loadLE(Ity_I64, mkexpr(addr)) );
10418         DIP("movlpd %s, %s\n",
10419             dis_buf, nameXMMReg( gregOfRM(modrm) ));
10420         goto decode_success;
10421      }
10422   }
10423
10424   /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
10425   /* Identical to MOVLPS ? */
10426   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
10427      if (!epartIsReg(insn[2])) {
10428         delta += 2;
10429         addr = disAMode ( &alen, sorb, delta, dis_buf );
10430         delta += alen;
10431         storeLE( mkexpr(addr),
10432                  getXMMRegLane64( gregOfRM(insn[2]),
10433                                   0/*lower lane*/ ) );
10434         DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
10435                                dis_buf);
10436         goto decode_success;
10437      }
10438      /* else fall through */
10439   }
10440
10441   /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
10442      2 lowest bits of ireg(G) */
10443   if (insn[0] == 0x0F && insn[1] == 0x50) {
10444      modrm = getIByte(delta+2);
10445      if (sz == 2 && epartIsReg(modrm)) {
10446         Int src;
10447         t0 = newTemp(Ity_I32);
10448         t1 = newTemp(Ity_I32);
10449         delta += 2+1;
10450         src = eregOfRM(modrm);
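         /* The sign of each F64 lane lives in bit 31 of its upper 32-bit
            half, i.e. in 32-bit lanes 1 and 3 of the register.  Shift
            those bits down to positions 0 and 1 respectively and OR them
            together. */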
10451         assign( t0, binop( Iop_And32,
10452                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
10453                            mkU32(1) ));
10454         assign( t1, binop( Iop_And32,
10455                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
10456                            mkU32(2) ));
10457         putIReg(4, gregOfRM(modrm),
10458                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
10459                 );
10460         DIP("movmskpd %s,%s\n", nameXMMReg(src),
10461                                 nameIReg(4, gregOfRM(modrm)));
10462         goto decode_success;
10463      }
10464      /* else fall through */
10465   }
10466
10467   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
10468   if (insn[0] == 0x0F && insn[1] == 0xF7) {
10469      modrm = getIByte(delta+2);
10470      if (sz == 2 && epartIsReg(modrm)) {
10471         IRTemp regD    = newTemp(Ity_V128);
10472         IRTemp mask    = newTemp(Ity_V128);
10473         IRTemp olddata = newTemp(Ity_V128);
10474         IRTemp newdata = newTemp(Ity_V128);
10475                addr    = newTemp(Ity_I32);
10476
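         /* The store address is implicit: (seg:)EDI, honouring any
            segment-override prefix that may be present. */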
10477         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
10478         assign( regD, getXMMReg( gregOfRM(modrm) ));
10479
10480         /* Unfortunately can't do the obvious thing with SarN8x16
10481            here since that can't be re-emitted as SSE2 code - no such
10482            insn. */
10483	 assign(
10484            mask,
10485            binop(Iop_64HLtoV128,
10486                  binop(Iop_SarN8x8,
10487                        getXMMRegLane64( eregOfRM(modrm), 1 ),
10488                        mkU8(7) ),
10489                  binop(Iop_SarN8x8,
10490                        getXMMRegLane64( eregOfRM(modrm), 0 ),
10491                        mkU8(7) ) ));
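         /* After the two SarN8x8 ops every mask byte is either 0x00 or
            0xFF, since arithmetic-shifting a byte right by 7 copies its
            sign bit into all 8 bit positions.  The merge below keeps the
            old memory byte where the mask is 0x00 and takes the byte
            from regD where it is 0xFF. */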
10492         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
10493         assign( newdata,
10494                 binop(Iop_OrV128,
10495                       binop(Iop_AndV128,
10496                             mkexpr(regD),
10497                             mkexpr(mask) ),
10498                       binop(Iop_AndV128,
10499                             mkexpr(olddata),
10500                             unop(Iop_NotV128, mkexpr(mask)))) );
10501         storeLE( mkexpr(addr), mkexpr(newdata) );
10502
10503         delta += 2+1;
10504         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
10505                                   nameXMMReg( gregOfRM(modrm) ) );
10506         goto decode_success;
10507      }
10508      /* else fall through */
10509   }
10510
10511   /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
10512   if (insn[0] == 0x0F && insn[1] == 0xE7) {
10513      modrm = getIByte(delta+2);
10514      if (sz == 2 && !epartIsReg(modrm)) {
10515         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10516         gen_SEGV_if_not_16_aligned( addr );
10517         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10518         DIP("movntdq %s,%s\n", dis_buf,
10519                                nameXMMReg(gregOfRM(modrm)));
10520         delta += 2+alen;
10521         goto decode_success;
10522      }
10523      /* else fall through */
10524   }
10525
10526   /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
10527   if (insn[0] == 0x0F && insn[1] == 0xC3) {
10528      vassert(sz == 4);
10529      modrm = getIByte(delta+2);
10530      if (!epartIsReg(modrm)) {
10531         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10532         storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
10533         DIP("movnti %s,%s\n", dis_buf,
10534                               nameIReg(4, gregOfRM(modrm)));
10535         delta += 2+alen;
10536         goto decode_success;
10537      }
10538      /* else fall through */
10539   }
10540
10541   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
10542      or lo half xmm).  */
10543   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
10544      modrm = getIByte(delta+2);
10545      if (epartIsReg(modrm)) {
10546         /* fall through, awaiting test case */
10547         /* dst: lo half copied, hi half zeroed */
10548      } else {
10549         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10550         storeLE( mkexpr(addr),
10551                  getXMMRegLane64( gregOfRM(modrm), 0 ));
10552         DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
10553         delta += 2+alen;
10554         goto decode_success;
10555      }
10556   }
10557
10558   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
10559      hi half). */
10560   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
10561      vassert(sz == 4);
10562      modrm = getIByte(delta+3);
10563      if (epartIsReg(modrm)) {
10564         do_MMX_preamble();
10565         putXMMReg( gregOfRM(modrm),
10566                    unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
10567         DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10568                                nameXMMReg(gregOfRM(modrm)));
10569         delta += 3+1;
10570         goto decode_success;
10571      } else {
10572         /* fall through, apparently no mem case for this insn */
10573      }
10574   }
10575
10576   /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
10577      G (lo half xmm).  Upper half of G is zeroed out. */
10578   /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
10579      G (lo half xmm).  If E is mem, upper half of G is zeroed out.
10580      If E is reg, upper half of G is unchanged. */
10581   if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
10582       || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
10583      vassert(sz == 4);
10584      modrm = getIByte(delta+3);
10585      if (epartIsReg(modrm)) {
10586         putXMMRegLane64( gregOfRM(modrm), 0,
10587                          getXMMRegLane64( eregOfRM(modrm), 0 ));
10588         if (insn[0] == 0xF3/*MOVQ*/) {
10589            /* zero bits 127:64 */
10590            putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10591         }
         DIP("%s %s,%s\n", insn[0]==0xF3 ? "movq" : "movsd",
             nameXMMReg(eregOfRM(modrm)),
             nameXMMReg(gregOfRM(modrm)));
10594         delta += 3+1;
10595      } else {
10596         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10597         /* zero bits 127:64 */
10598         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10599         /* write bits 63:0 */
10600         putXMMRegLane64( gregOfRM(modrm), 0,
10601                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("%s %s,%s\n", insn[0]==0xF3 ? "movq" : "movsd",
             dis_buf, nameXMMReg(gregOfRM(modrm)));
10604         delta += 3+alen;
10605      }
10606      goto decode_success;
10607   }
10608
10609   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
10610      or lo half xmm). */
10611   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
10612      vassert(sz == 4);
10613      modrm = getIByte(delta+3);
10614      if (epartIsReg(modrm)) {
10615         putXMMRegLane64( eregOfRM(modrm), 0,
10616                          getXMMRegLane64( gregOfRM(modrm), 0 ));
10617         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10618                              nameXMMReg(eregOfRM(modrm)));
10619         delta += 3+1;
10620      } else {
10621         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10622         storeLE( mkexpr(addr),
10623                  getXMMRegLane64(gregOfRM(modrm), 0) );
10624         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10625                              dis_buf);
10626         delta += 3+alen;
10627      }
10628      goto decode_success;
10629   }
10630
10631   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
10632   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
10633      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
10634      goto decode_success;
10635   }
10636
10637   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
10638   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
10639      vassert(sz == 4);
10640      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
10641      goto decode_success;
10642   }
10643
   /* 66 0F 56 = ORPD -- G = G or E */
10645   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
10646      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
10647      goto decode_success;
10648   }
10649
10650   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
10651   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
10652      Int    select;
10653      IRTemp sV = newTemp(Ity_V128);
10654      IRTemp dV = newTemp(Ity_V128);
10655      IRTemp s1 = newTemp(Ity_I64);
10656      IRTemp s0 = newTemp(Ity_I64);
10657      IRTemp d1 = newTemp(Ity_I64);
10658      IRTemp d0 = newTemp(Ity_I64);
10659
10660      modrm = insn[2];
10661      assign( dV, getXMMReg(gregOfRM(modrm)) );
10662
10663      if (epartIsReg(modrm)) {
10664         assign( sV, getXMMReg(eregOfRM(modrm)) );
10665         select = (Int)insn[3];
10666         delta += 2+2;
10667         DIP("shufpd $%d,%s,%s\n", select,
10668                                   nameXMMReg(eregOfRM(modrm)),
10669                                   nameXMMReg(gregOfRM(modrm)));
10670      } else {
10671         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10672         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10673         select = (Int)insn[2+alen];
10674         delta += 3+alen;
10675         DIP("shufpd $%d,%s,%s\n", select,
10676                                   dis_buf,
10677                                   nameXMMReg(gregOfRM(modrm)));
10678      }
10679
10680      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10681      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10682      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10683      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10684
10685#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
10686#     define SELS(n) mkexpr((n)==0 ? s0 : s1)
10687
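      /* Result lo64 comes from the destination, selected by imm bit
         0; result hi64 comes from the source, selected by imm bit 1.
         E.g. imm = 1 gives s0:d1 (hi:lo). */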
10688      putXMMReg(
10689         gregOfRM(modrm),
10690         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
10691      );
10692
10693#     undef SELD
10694#     undef SELS
10695
10696      goto decode_success;
10697   }
10698
10699   /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
10700   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
10701      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
10702                                        "sqrtpd", Iop_Sqrt64Fx2 );
10703      goto decode_success;
10704   }
10705
10706   /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
10707   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
10708      vassert(sz == 4);
10709      delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
10710                                         "sqrtsd", Iop_Sqrt64F0x2 );
10711      goto decode_success;
10712   }
10713
10714   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
10715   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
10716      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
10717      goto decode_success;
10718   }
10719
10720   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
10721   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
10722      vassert(sz == 4);
10723      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
10724      goto decode_success;
10725   }
10726
10727   /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
10728   /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   /* These just appear to be special cases of SHUFPD */
10730   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
10731      IRTemp s1 = newTemp(Ity_I64);
10732      IRTemp s0 = newTemp(Ity_I64);
10733      IRTemp d1 = newTemp(Ity_I64);
10734      IRTemp d0 = newTemp(Ity_I64);
10735      IRTemp sV = newTemp(Ity_V128);
10736      IRTemp dV = newTemp(Ity_V128);
10737      Bool   hi = toBool(insn[1] == 0x15);
10738
10739      modrm = insn[2];
10740      assign( dV, getXMMReg(gregOfRM(modrm)) );
10741
10742      if (epartIsReg(modrm)) {
10743         assign( sV, getXMMReg(eregOfRM(modrm)) );
10744         delta += 2+1;
         DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
10746                                  nameXMMReg(eregOfRM(modrm)),
10747                                  nameXMMReg(gregOfRM(modrm)));
10748      } else {
10749         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10750         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10751         delta += 2+alen;
         DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
10753                                  dis_buf,
10754                                  nameXMMReg(gregOfRM(modrm)));
10755      }
10756
10757      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10758      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10759      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10760      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10761
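      /* UNPCKHPD: result = s1:d1 (hi:lo); UNPCKLPD: result = s0:d0. */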
10762      if (hi) {
10763         putXMMReg( gregOfRM(modrm),
10764                    binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
10765      } else {
10766         putXMMReg( gregOfRM(modrm),
10767                    binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
10768      }
10769
10770      goto decode_success;
10771   }
10772
   /* 66 0F 57 = XORPD -- G = G xor E */
10774   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
10775      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
10776      goto decode_success;
10777   }
10778
10779   /* 66 0F 6B = PACKSSDW */
10780   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
10781      delta = dis_SSEint_E_to_G( sorb, delta+2,
10782                                 "packssdw",
10783                                 Iop_QNarrowBin32Sto16Sx8, True );
10784      goto decode_success;
10785   }
10786
10787   /* 66 0F 63 = PACKSSWB */
10788   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
10789      delta = dis_SSEint_E_to_G( sorb, delta+2,
10790                                 "packsswb",
10791                                 Iop_QNarrowBin16Sto8Sx16, True );
10792      goto decode_success;
10793   }
10794
10795   /* 66 0F 67 = PACKUSWB */
10796   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
10797      delta = dis_SSEint_E_to_G( sorb, delta+2,
10798                                 "packuswb",
10799                                 Iop_QNarrowBin16Sto8Ux16, True );
10800      goto decode_success;
10801   }
10802
10803   /* 66 0F FC = PADDB */
10804   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
10805      delta = dis_SSEint_E_to_G( sorb, delta+2,
10806                                 "paddb", Iop_Add8x16, False );
10807      goto decode_success;
10808   }
10809
10810   /* 66 0F FE = PADDD */
10811   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
10812      delta = dis_SSEint_E_to_G( sorb, delta+2,
10813                                 "paddd", Iop_Add32x4, False );
10814      goto decode_success;
10815   }
10816
10817   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
10818   /* 0F D4 = PADDQ -- add 64x1 */
10819   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
10820      do_MMX_preamble();
10821      delta = dis_MMXop_regmem_to_reg (
10822                sorb, delta+2, insn[1], "paddq", False );
10823      goto decode_success;
10824   }
10825
10826   /* 66 0F D4 = PADDQ */
10827   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
10828      delta = dis_SSEint_E_to_G( sorb, delta+2,
10829                                 "paddq", Iop_Add64x2, False );
10830      goto decode_success;
10831   }
10832
10833   /* 66 0F FD = PADDW */
10834   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
10835      delta = dis_SSEint_E_to_G( sorb, delta+2,
10836                                 "paddw", Iop_Add16x8, False );
10837      goto decode_success;
10838   }
10839
10840   /* 66 0F EC = PADDSB */
10841   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
10842      delta = dis_SSEint_E_to_G( sorb, delta+2,
10843                                 "paddsb", Iop_QAdd8Sx16, False );
10844      goto decode_success;
10845   }
10846
10847   /* 66 0F ED = PADDSW */
10848   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
10849      delta = dis_SSEint_E_to_G( sorb, delta+2,
10850                                 "paddsw", Iop_QAdd16Sx8, False );
10851      goto decode_success;
10852   }
10853
10854   /* 66 0F DC = PADDUSB */
10855   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
10856      delta = dis_SSEint_E_to_G( sorb, delta+2,
10857                                 "paddusb", Iop_QAdd8Ux16, False );
10858      goto decode_success;
10859   }
10860
10861   /* 66 0F DD = PADDUSW */
10862   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
10863      delta = dis_SSEint_E_to_G( sorb, delta+2,
10864                                 "paddusw", Iop_QAdd16Ux8, False );
10865      goto decode_success;
10866   }
10867
10868   /* 66 0F DB = PAND */
10869   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
10870      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
10871      goto decode_success;
10872   }
10873
10874   /* 66 0F DF = PANDN */
10875   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
10876      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
10877      goto decode_success;
10878   }
10879
10880   /* 66 0F E0 = PAVGB */
10881   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
10882      delta = dis_SSEint_E_to_G( sorb, delta+2,
10883                                 "pavgb", Iop_Avg8Ux16, False );
10884      goto decode_success;
10885   }
10886
10887   /* 66 0F E3 = PAVGW */
10888   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
10889      delta = dis_SSEint_E_to_G( sorb, delta+2,
10890                                 "pavgw", Iop_Avg16Ux8, False );
10891      goto decode_success;
10892   }
10893
10894   /* 66 0F 74 = PCMPEQB */
10895   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
10896      delta = dis_SSEint_E_to_G( sorb, delta+2,
10897                                 "pcmpeqb", Iop_CmpEQ8x16, False );
10898      goto decode_success;
10899   }
10900
10901   /* 66 0F 76 = PCMPEQD */
10902   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
10903      delta = dis_SSEint_E_to_G( sorb, delta+2,
10904                                 "pcmpeqd", Iop_CmpEQ32x4, False );
10905      goto decode_success;
10906   }
10907
10908   /* 66 0F 75 = PCMPEQW */
10909   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
10910      delta = dis_SSEint_E_to_G( sorb, delta+2,
10911                                 "pcmpeqw", Iop_CmpEQ16x8, False );
10912      goto decode_success;
10913   }
10914
10915   /* 66 0F 64 = PCMPGTB */
10916   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
10917      delta = dis_SSEint_E_to_G( sorb, delta+2,
10918                                 "pcmpgtb", Iop_CmpGT8Sx16, False );
10919      goto decode_success;
10920   }
10921
10922   /* 66 0F 66 = PCMPGTD */
10923   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
10924      delta = dis_SSEint_E_to_G( sorb, delta+2,
10925                                 "pcmpgtd", Iop_CmpGT32Sx4, False );
10926      goto decode_success;
10927   }
10928
10929   /* 66 0F 65 = PCMPGTW */
10930   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
10931      delta = dis_SSEint_E_to_G( sorb, delta+2,
10932                                 "pcmpgtw", Iop_CmpGT16Sx8, False );
10933      goto decode_success;
10934   }
10935
10936   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
10937      zero-extend of it in ireg(G). */
10938   if (insn[0] == 0x0F && insn[1] == 0xC5) {
10939      modrm = insn[2];
10940      if (sz == 2 && epartIsReg(modrm)) {
10941         t5 = newTemp(Ity_V128);
10942         t4 = newTemp(Ity_I16);
10943         assign(t5, getXMMReg(eregOfRM(modrm)));
10944         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
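         /* The low 3 bits of the immediate select one of the eight
            16-bit lanes: even values take the low half of a 32-bit
            chunk, odd values the high half. */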
10945         switch (insn[3] & 7) {
10946            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
10947            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
10948            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
10949            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
10950            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
10951            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
10952            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
10953            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
10954            default: vassert(0); /*NOTREACHED*/
10955         }
10956         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
10957         DIP("pextrw $%d,%s,%s\n",
10958             (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
10959                           nameIReg(4,gregOfRM(modrm)));
10960         delta += 4;
10961         goto decode_success;
10962      }
10963      /* else fall through */
10964   }
10965
10966   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
10967      put it into the specified lane of xmm(G). */
10968   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
10969      Int lane;
10970      t4 = newTemp(Ity_I16);
10971      modrm = insn[2];
10972
10973      if (epartIsReg(modrm)) {
10974         assign(t4, getIReg(2, eregOfRM(modrm)));
10975         delta += 3+1;
10976         lane = insn[3+1-1];
10977         DIP("pinsrw $%d,%s,%s\n", lane,
10978                                   nameIReg(2,eregOfRM(modrm)),
10979                                   nameXMMReg(gregOfRM(modrm)));
10980      } else {
10981         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10982         delta += 3+alen;
10983         lane = insn[3+alen-1];
10984         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
10985         DIP("pinsrw $%d,%s,%s\n", lane,
10986                                   dis_buf,
10987                                   nameXMMReg(gregOfRM(modrm)));
10988      }
10989
10990      putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
10991      goto decode_success;
10992   }
10993
10994   /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
10995      E(xmm or mem) to G(xmm) */
10996   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
10997      IRTemp s1V  = newTemp(Ity_V128);
10998      IRTemp s2V  = newTemp(Ity_V128);
10999      IRTemp dV   = newTemp(Ity_V128);
11000      IRTemp s1Hi = newTemp(Ity_I64);
11001      IRTemp s1Lo = newTemp(Ity_I64);
11002      IRTemp s2Hi = newTemp(Ity_I64);
11003      IRTemp s2Lo = newTemp(Ity_I64);
11004      IRTemp dHi  = newTemp(Ity_I64);
11005      IRTemp dLo  = newTemp(Ity_I64);
11006      modrm = insn[2];
11007      if (epartIsReg(modrm)) {
11008         assign( s1V, getXMMReg(eregOfRM(modrm)) );
11009         delta += 2+1;
11010         DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11011                                nameXMMReg(gregOfRM(modrm)));
11012      } else {
11013         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11014         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
11015         delta += 2+alen;
11016         DIP("pmaddwd %s,%s\n", dis_buf,
11017                                nameXMMReg(gregOfRM(modrm)));
11018      }
11019      assign( s2V, getXMMReg(gregOfRM(modrm)) );
11020      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
11021      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
11022      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
11023      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
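      /* Rather than expressing this with vector IR ops, evaluate each
         64-bit half with the MMX pmaddwd helper and glue the two
         results back together. */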
11024      assign( dHi, mkIRExprCCall(
11025                      Ity_I64, 0/*regparms*/,
11026                      "x86g_calculate_mmx_pmaddwd",
11027                      &x86g_calculate_mmx_pmaddwd,
11028                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
11029                   ));
11030      assign( dLo, mkIRExprCCall(
11031                      Ity_I64, 0/*regparms*/,
11032                      "x86g_calculate_mmx_pmaddwd",
11033                      &x86g_calculate_mmx_pmaddwd,
11034                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
11035                   ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
11037      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11038      goto decode_success;
11039   }
11040
11041   /* 66 0F EE = PMAXSW -- 16x8 signed max */
11042   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
11043      delta = dis_SSEint_E_to_G( sorb, delta+2,
11044                                 "pmaxsw", Iop_Max16Sx8, False );
11045      goto decode_success;
11046   }
11047
11048   /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
11049   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
11050      delta = dis_SSEint_E_to_G( sorb, delta+2,
11051                                 "pmaxub", Iop_Max8Ux16, False );
11052      goto decode_success;
11053   }
11054
11055   /* 66 0F EA = PMINSW -- 16x8 signed min */
11056   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
11057      delta = dis_SSEint_E_to_G( sorb, delta+2,
11058                                 "pminsw", Iop_Min16Sx8, False );
11059      goto decode_success;
11060   }
11061
11062   /* 66 0F DA = PMINUB -- 8x16 unsigned min */
11063   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
11064      delta = dis_SSEint_E_to_G( sorb, delta+2,
11065                                 "pminub", Iop_Min8Ux16, False );
11066      goto decode_success;
11067   }
11068
11069   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
11070      in xmm(E), turn them into a byte, and put zero-extend of it in
11071      ireg(G). */
11072   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
11073      modrm = insn[2];
11074      if (epartIsReg(modrm)) {
11075         t0 = newTemp(Ity_I64);
11076         t1 = newTemp(Ity_I64);
11077         assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
11078         assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
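         /* GetMSBs8x8 collects the sign bit of each of the 8 bytes in
            a 64-bit half into one byte; the two bytes are then pasted
            together and zero-extended to form the 16-bit mask. */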
11079         t5 = newTemp(Ity_I32);
11080         assign(t5,
11081                unop(Iop_16Uto32,
11082                     binop(Iop_8HLto16,
11083                           unop(Iop_GetMSBs8x8, mkexpr(t1)),
11084                           unop(Iop_GetMSBs8x8, mkexpr(t0)))));
11085         putIReg(4, gregOfRM(modrm), mkexpr(t5));
11086         DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11087                                 nameIReg(4,gregOfRM(modrm)));
11088         delta += 3;
11089         goto decode_success;
11090      }
11091      /* else fall through */
11092   }
11093
11094   /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
11095   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
11096      delta = dis_SSEint_E_to_G( sorb, delta+2,
11097                                 "pmulhuw", Iop_MulHi16Ux8, False );
11098      goto decode_success;
11099   }
11100
11101   /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
11102   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
11103      delta = dis_SSEint_E_to_G( sorb, delta+2,
11104                                 "pmulhw", Iop_MulHi16Sx8, False );
11105      goto decode_success;
11106   }
11107
   /* 66 0F D5 = PMULLW -- 16x8 multiply */
11109   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
11110      delta = dis_SSEint_E_to_G( sorb, delta+2,
11111                                 "pmullw", Iop_Mul16x8, False );
11112      goto decode_success;
11113   }
11114
11115   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
11116   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
11117      0 to form 64-bit result */
11118   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
11119      IRTemp sV = newTemp(Ity_I64);
11120      IRTemp dV = newTemp(Ity_I64);
11121      t1 = newTemp(Ity_I32);
11122      t0 = newTemp(Ity_I32);
11123      modrm = insn[2];
11124
11125      do_MMX_preamble();
11126      assign( dV, getMMXReg(gregOfRM(modrm)) );
11127
11128      if (epartIsReg(modrm)) {
11129         assign( sV, getMMXReg(eregOfRM(modrm)) );
11130         delta += 2+1;
11131         DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
11132                                nameMMXReg(gregOfRM(modrm)));
11133      } else {
11134         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11135         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
11136         delta += 2+alen;
11137         DIP("pmuludq %s,%s\n", dis_buf,
11138                                nameMMXReg(gregOfRM(modrm)));
11139      }
11140
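      /* Only lane 0 of each operand participates: take the low 32
         bits of each 64-bit MMX value and form the full 64-bit
         unsigned product. */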
11141      assign( t0, unop(Iop_64to32, mkexpr(dV)) );
11142      assign( t1, unop(Iop_64to32, mkexpr(sV)) );
11143      putMMXReg( gregOfRM(modrm),
11144                 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
11145      goto decode_success;
11146   }
11147
11148   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
11149      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
11150      half */
11151   /* This is a really poor translation -- could be improved if
11152      performance critical */
11153   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
11154      IRTemp sV, dV;
11155      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11156      sV = newTemp(Ity_V128);
11157      dV = newTemp(Ity_V128);
11158      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11159      t1 = newTemp(Ity_I64);
11160      t0 = newTemp(Ity_I64);
11161      modrm = insn[2];
11162      assign( dV, getXMMReg(gregOfRM(modrm)) );
11163
11164      if (epartIsReg(modrm)) {
11165         assign( sV, getXMMReg(eregOfRM(modrm)) );
11166         delta += 2+1;
11167         DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11168                                nameXMMReg(gregOfRM(modrm)));
11169      } else {
11170         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11171         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11172         delta += 2+alen;
11173         DIP("pmuludq %s,%s\n", dis_buf,
11174                                nameXMMReg(gregOfRM(modrm)));
11175      }
11176
11177      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
11178      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11179
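      /* Lane 0 of each operand forms the low 64-bit product and lane
         2 forms the high 64-bit product; lanes 1 and 3 are ignored. */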
11180      assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
11181      putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
11182      assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
11183      putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
11184      goto decode_success;
11185   }
11186
11187   /* 66 0F EB = POR */
11188   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
11189      delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
11190      goto decode_success;
11191   }
11192
11193   /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
11194      from E(xmm or mem) to G(xmm) */
11195   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
11196      IRTemp s1V  = newTemp(Ity_V128);
11197      IRTemp s2V  = newTemp(Ity_V128);
11198      IRTemp dV   = newTemp(Ity_V128);
11199      IRTemp s1Hi = newTemp(Ity_I64);
11200      IRTemp s1Lo = newTemp(Ity_I64);
11201      IRTemp s2Hi = newTemp(Ity_I64);
11202      IRTemp s2Lo = newTemp(Ity_I64);
11203      IRTemp dHi  = newTemp(Ity_I64);
11204      IRTemp dLo  = newTemp(Ity_I64);
11205      modrm = insn[2];
11206      if (epartIsReg(modrm)) {
11207         assign( s1V, getXMMReg(eregOfRM(modrm)) );
11208         delta += 2+1;
11209         DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11210                               nameXMMReg(gregOfRM(modrm)));
11211      } else {
11212         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11213         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
11214         delta += 2+alen;
11215         DIP("psadbw %s,%s\n", dis_buf,
11216                               nameXMMReg(gregOfRM(modrm)));
11217      }
11218      assign( s2V, getXMMReg(gregOfRM(modrm)) );
11219      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
11220      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
11221      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
11222      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
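      /* As with PMADDWD above, call the MMX psadbw helper once per
         64-bit half and reassemble the two results. */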
11223      assign( dHi, mkIRExprCCall(
11224                      Ity_I64, 0/*regparms*/,
11225                      "x86g_calculate_mmx_psadbw",
11226                      &x86g_calculate_mmx_psadbw,
11227                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
11228                   ));
11229      assign( dLo, mkIRExprCCall(
11230                      Ity_I64, 0/*regparms*/,
11231                      "x86g_calculate_mmx_psadbw",
11232                      &x86g_calculate_mmx_psadbw,
11233                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
11234                   ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
11236      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11237      goto decode_success;
11238   }
11239
11240   /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
11241   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
11242      Int order;
11243      IRTemp sV, dV, s3, s2, s1, s0;
11244      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11245      sV = newTemp(Ity_V128);
11246      dV = newTemp(Ity_V128);
11247      modrm = insn[2];
11248      if (epartIsReg(modrm)) {
11249         assign( sV, getXMMReg(eregOfRM(modrm)) );
11250         order = (Int)insn[3];
11251         delta += 2+2;
11252         DIP("pshufd $%d,%s,%s\n", order,
11253                                   nameXMMReg(eregOfRM(modrm)),
11254                                   nameXMMReg(gregOfRM(modrm)));
11255      } else {
11256         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11257         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[2+alen];
11259         delta += 3+alen;
11260         DIP("pshufd $%d,%s,%s\n", order,
11261                                   dis_buf,
11262                                   nameXMMReg(gregOfRM(modrm)));
11263      }
11264      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11265
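      /* Each 2-bit field of the immediate picks one of the four
         source lanes: bits 1:0 select result lane 0, ..., bits 7:6
         select result lane 3.  E.g. order = 0x1B reverses the four
         lanes. */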
11266#     define SEL(n) \
11267                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11268      assign(dV,
             mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
11270                           SEL((order>>2)&3), SEL((order>>0)&3) )
11271      );
11272      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11273#     undef SEL
11274      goto decode_success;
11275   }
11276
11277   /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
11278      mem) to G(xmm), and copy lower half */
11279   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
11280      Int order;
11281      IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
11282      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11283      sV   = newTemp(Ity_V128);
11284      dV   = newTemp(Ity_V128);
11285      sVhi = newTemp(Ity_I64);
11286      dVhi = newTemp(Ity_I64);
11287      modrm = insn[3];
11288      if (epartIsReg(modrm)) {
11289         assign( sV, getXMMReg(eregOfRM(modrm)) );
11290         order = (Int)insn[4];
11291         delta += 4+1;
11292         DIP("pshufhw $%d,%s,%s\n", order,
11293                                    nameXMMReg(eregOfRM(modrm)),
11294                                    nameXMMReg(gregOfRM(modrm)));
11295      } else {
11296         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11297         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[3+alen];
11299         delta += 4+alen;
11300         DIP("pshufhw $%d,%s,%s\n", order,
11301                                    dis_buf,
11302                                    nameXMMReg(gregOfRM(modrm)));
11303      }
11304      assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
11305      breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
11306
11307#     define SEL(n) \
11308                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11309      assign(dVhi,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11311                          SEL((order>>2)&3), SEL((order>>0)&3) )
11312      );
11313      assign(dV, binop( Iop_64HLtoV128,
11314                        mkexpr(dVhi),
11315                        unop(Iop_V128to64, mkexpr(sV))) );
11316      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11317#     undef SEL
11318      goto decode_success;
11319   }
11320
11321   /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
11322      mem) to G(xmm), and copy upper half */
11323   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
11324      Int order;
11325      IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
11326      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11327      sV   = newTemp(Ity_V128);
11328      dV   = newTemp(Ity_V128);
11329      sVlo = newTemp(Ity_I64);
11330      dVlo = newTemp(Ity_I64);
11331      modrm = insn[3];
11332      if (epartIsReg(modrm)) {
11333         assign( sV, getXMMReg(eregOfRM(modrm)) );
11334         order = (Int)insn[4];
11335         delta += 4+1;
11336         DIP("pshuflw $%d,%s,%s\n", order,
11337                                    nameXMMReg(eregOfRM(modrm)),
11338                                    nameXMMReg(gregOfRM(modrm)));
11339      } else {
11340         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11341         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[3+alen];
11343         delta += 4+alen;
11344         DIP("pshuflw $%d,%s,%s\n", order,
11345                                    dis_buf,
11346                                    nameXMMReg(gregOfRM(modrm)));
11347      }
11348      assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
11349      breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
11350
11351#     define SEL(n) \
11352                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11353      assign(dVlo,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11355                          SEL((order>>2)&3), SEL((order>>0)&3) )
11356      );
11357      assign(dV, binop( Iop_64HLtoV128,
11358                        unop(Iop_V128HIto64, mkexpr(sV)),
11359                        mkexpr(dVlo) ) );
11360      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11361#     undef SEL
11362      goto decode_success;
11363   }
11364
11365   /* 66 0F 72 /6 ib = PSLLD by immediate */
11366   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11367       && epartIsReg(insn[2])
11368       && gregOfRM(insn[2]) == 6) {
11369      delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
11370      goto decode_success;
11371   }
11372
11373   /* 66 0F F2 = PSLLD by E */
11374   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
11375      delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
11376      goto decode_success;
11377   }
11378
11379   /* 66 0F 73 /7 ib = PSLLDQ by immediate */
11380   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11381       && epartIsReg(insn[2])
11382       && gregOfRM(insn[2]) == 7) {
11383      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11384      Int    imm = (Int)insn[3];
11385      Int    reg = eregOfRM(insn[2]);
11386      DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
11387      vassert(imm >= 0 && imm <= 255);
11388      delta += 4;
11389
11390      sV    = newTemp(Ity_V128);
11391      dV    = newTemp(Ity_V128);
11392      hi64  = newTemp(Ity_I64);
11393      lo64  = newTemp(Ity_I64);
11394      hi64r = newTemp(Ity_I64);
11395      lo64r = newTemp(Ity_I64);
11396
11397      if (imm >= 16) {
11398         putXMMReg(reg, mkV128(0x0000));
11399         goto decode_success;
11400      }
11401
11402      assign( sV, getXMMReg(reg) );
11403      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11404      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
11405
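      /* Byte-granular left shift, done on the two 64-bit halves.  For
         0 < imm < 8 the top bytes of the low half carry into the
         bottom of the high half, e.g. imm = 3 gives
            lo' = lo << 24,  hi' = (hi << 24) | (lo >> 40).
         imm == 8 moves the low half into the high half; for imm > 8
         only the shifted low half survives, in the high position. */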
11406      if (imm == 0) {
11407         assign( lo64r, mkexpr(lo64) );
11408         assign( hi64r, mkexpr(hi64) );
11409      }
11410      else
11411      if (imm == 8) {
11412         assign( lo64r, mkU64(0) );
11413         assign( hi64r, mkexpr(lo64) );
11414      }
11415      else
11416      if (imm > 8) {
11417         assign( lo64r, mkU64(0) );
11418         assign( hi64r, binop( Iop_Shl64,
11419                               mkexpr(lo64),
11420                               mkU8( 8*(imm-8) ) ));
11421      } else {
11422         assign( lo64r, binop( Iop_Shl64,
11423                               mkexpr(lo64),
11424                               mkU8(8 * imm) ));
11425         assign( hi64r,
11426                 binop( Iop_Or64,
11427                        binop(Iop_Shl64, mkexpr(hi64),
11428                                         mkU8(8 * imm)),
11429                        binop(Iop_Shr64, mkexpr(lo64),
11430                                         mkU8(8 * (8 - imm)) )
11431                      )
11432               );
11433      }
11434      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11435      putXMMReg(reg, mkexpr(dV));
11436      goto decode_success;
11437   }
11438
11439   /* 66 0F 73 /6 ib = PSLLQ by immediate */
11440   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11441       && epartIsReg(insn[2])
11442       && gregOfRM(insn[2]) == 6) {
11443      delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
11444      goto decode_success;
11445   }
11446
11447   /* 66 0F F3 = PSLLQ by E */
11448   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
11449      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
11450      goto decode_success;
11451   }
11452
11453   /* 66 0F 71 /6 ib = PSLLW by immediate */
11454   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11455       && epartIsReg(insn[2])
11456       && gregOfRM(insn[2]) == 6) {
11457      delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
11458      goto decode_success;
11459   }
11460
11461   /* 66 0F F1 = PSLLW by E */
11462   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
11463      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
11464      goto decode_success;
11465   }
11466
11467   /* 66 0F 72 /4 ib = PSRAD by immediate */
11468   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11469       && epartIsReg(insn[2])
11470       && gregOfRM(insn[2]) == 4) {
11471      delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
11472      goto decode_success;
11473   }
11474
11475   /* 66 0F E2 = PSRAD by E */
11476   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
11477      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
11478      goto decode_success;
11479   }
11480
11481   /* 66 0F 71 /4 ib = PSRAW by immediate */
11482   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11483       && epartIsReg(insn[2])
11484       && gregOfRM(insn[2]) == 4) {
11485      delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
11486      goto decode_success;
11487   }
11488
11489   /* 66 0F E1 = PSRAW by E */
11490   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
11491      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
11492      goto decode_success;
11493   }
11494
11495   /* 66 0F 72 /2 ib = PSRLD by immediate */
11496   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11497       && epartIsReg(insn[2])
11498       && gregOfRM(insn[2]) == 2) {
11499      delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
11500      goto decode_success;
11501   }
11502
11503   /* 66 0F D2 = PSRLD by E */
11504   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
11505      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
11506      goto decode_success;
11507   }
11508
11509   /* 66 0F 73 /3 ib = PSRLDQ by immediate */
11510   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11511       && epartIsReg(insn[2])
11512       && gregOfRM(insn[2]) == 3) {
11513      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11514      Int    imm = (Int)insn[3];
11515      Int    reg = eregOfRM(insn[2]);
11516      DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
11517      vassert(imm >= 0 && imm <= 255);
11518      delta += 4;
11519
11520      sV    = newTemp(Ity_V128);
11521      dV    = newTemp(Ity_V128);
11522      hi64  = newTemp(Ity_I64);
11523      lo64  = newTemp(Ity_I64);
11524      hi64r = newTemp(Ity_I64);
11525      lo64r = newTemp(Ity_I64);
11526
11527      if (imm >= 16) {
11528         putXMMReg(reg, mkV128(0x0000));
11529         goto decode_success;
11530      }
11531
11532      assign( sV, getXMMReg(reg) );
11533      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11534      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
11535
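      /* Mirror image of PSLLDQ above: byte-granular right shift, with
         the bottom bytes of the high half carried into the top of the
         low half when 0 < imm < 8. */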
11536      if (imm == 0) {
11537         assign( lo64r, mkexpr(lo64) );
11538         assign( hi64r, mkexpr(hi64) );
11539      }
11540      else
11541      if (imm == 8) {
11542         assign( hi64r, mkU64(0) );
11543         assign( lo64r, mkexpr(hi64) );
11544      }
11545      else
11546      if (imm > 8) {
11547         assign( hi64r, mkU64(0) );
11548         assign( lo64r, binop( Iop_Shr64,
11549                               mkexpr(hi64),
11550                               mkU8( 8*(imm-8) ) ));
11551      } else {
11552         assign( hi64r, binop( Iop_Shr64,
11553                               mkexpr(hi64),
11554                               mkU8(8 * imm) ));
11555         assign( lo64r,
11556                 binop( Iop_Or64,
11557                        binop(Iop_Shr64, mkexpr(lo64),
11558                                         mkU8(8 * imm)),
11559                        binop(Iop_Shl64, mkexpr(hi64),
11560                                         mkU8(8 * (8 - imm)) )
11561                      )
11562               );
11563      }
11564
11565      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11566      putXMMReg(reg, mkexpr(dV));
11567      goto decode_success;
11568   }
11569
11570   /* 66 0F 73 /2 ib = PSRLQ by immediate */
11571   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11572       && epartIsReg(insn[2])
11573       && gregOfRM(insn[2]) == 2) {
11574      delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
11575      goto decode_success;
11576   }
11577
11578   /* 66 0F D3 = PSRLQ by E */
11579   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
11580      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
11581      goto decode_success;
11582   }
11583
11584   /* 66 0F 71 /2 ib = PSRLW by immediate */
11585   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11586       && epartIsReg(insn[2])
11587       && gregOfRM(insn[2]) == 2) {
11588      delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
11589      goto decode_success;
11590   }
11591
11592   /* 66 0F D1 = PSRLW by E */
11593   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
11594      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
11595      goto decode_success;
11596   }
11597
11598   /* 66 0F F8 = PSUBB */
11599   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
11600      delta = dis_SSEint_E_to_G( sorb, delta+2,
11601                                 "psubb", Iop_Sub8x16, False );
11602      goto decode_success;
11603   }
11604
11605   /* 66 0F FA = PSUBD */
11606   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
11607      delta = dis_SSEint_E_to_G( sorb, delta+2,
11608                                 "psubd", Iop_Sub32x4, False );
11609      goto decode_success;
11610   }
11611
11612   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
11613   /* 0F FB = PSUBQ -- sub 64x1 */
11614   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
11615      do_MMX_preamble();
11616      delta = dis_MMXop_regmem_to_reg (
11617                sorb, delta+2, insn[1], "psubq", False );
11618      goto decode_success;
11619   }
11620
11621   /* 66 0F FB = PSUBQ */
11622   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
11623      delta = dis_SSEint_E_to_G( sorb, delta+2,
11624                                 "psubq", Iop_Sub64x2, False );
11625      goto decode_success;
11626   }
11627
11628   /* 66 0F F9 = PSUBW */
11629   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
11630      delta = dis_SSEint_E_to_G( sorb, delta+2,
11631                                 "psubw", Iop_Sub16x8, False );
11632      goto decode_success;
11633   }
11634
11635   /* 66 0F E8 = PSUBSB */
11636   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
11637      delta = dis_SSEint_E_to_G( sorb, delta+2,
11638                                 "psubsb", Iop_QSub8Sx16, False );
11639      goto decode_success;
11640   }
11641
11642   /* 66 0F E9 = PSUBSW */
11643   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
11644      delta = dis_SSEint_E_to_G( sorb, delta+2,
11645                                 "psubsw", Iop_QSub16Sx8, False );
11646      goto decode_success;
11647   }
11648
   /* 66 0F D8 = PSUBUSB */
11650   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
11651      delta = dis_SSEint_E_to_G( sorb, delta+2,
11652                                 "psubusb", Iop_QSub8Ux16, False );
11653      goto decode_success;
11654   }
11655
   /* 66 0F D9 = PSUBUSW */
11657   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
11658      delta = dis_SSEint_E_to_G( sorb, delta+2,
11659                                 "psubusw", Iop_QSub16Ux8, False );
11660      goto decode_success;
11661   }
11662
11663   /* 66 0F 68 = PUNPCKHBW */
11664   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
11665      delta = dis_SSEint_E_to_G( sorb, delta+2,
11666                                 "punpckhbw",
11667                                 Iop_InterleaveHI8x16, True );
11668      goto decode_success;
11669   }
11670
11671   /* 66 0F 6A = PUNPCKHDQ */
11672   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
11673      delta = dis_SSEint_E_to_G( sorb, delta+2,
11674                                 "punpckhdq",
11675                                 Iop_InterleaveHI32x4, True );
11676      goto decode_success;
11677   }
11678
11679   /* 66 0F 6D = PUNPCKHQDQ */
11680   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
11681      delta = dis_SSEint_E_to_G( sorb, delta+2,
11682                                 "punpckhqdq",
11683                                 Iop_InterleaveHI64x2, True );
11684      goto decode_success;
11685   }
11686
11687   /* 66 0F 69 = PUNPCKHWD */
11688   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
11689      delta = dis_SSEint_E_to_G( sorb, delta+2,
11690                                 "punpckhwd",
11691                                 Iop_InterleaveHI16x8, True );
11692      goto decode_success;
11693   }
11694
11695   /* 66 0F 60 = PUNPCKLBW */
11696   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
11697      delta = dis_SSEint_E_to_G( sorb, delta+2,
11698                                 "punpcklbw",
11699                                 Iop_InterleaveLO8x16, True );
11700      goto decode_success;
11701   }
11702
11703   /* 66 0F 62 = PUNPCKLDQ */
11704   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
11705      delta = dis_SSEint_E_to_G( sorb, delta+2,
11706                                 "punpckldq",
11707                                 Iop_InterleaveLO32x4, True );
11708      goto decode_success;
11709   }
11710
11711   /* 66 0F 6C = PUNPCKLQDQ */
11712   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
11713      delta = dis_SSEint_E_to_G( sorb, delta+2,
11714                                 "punpcklqdq",
11715                                 Iop_InterleaveLO64x2, True );
11716      goto decode_success;
11717   }
11718
11719   /* 66 0F 61 = PUNPCKLWD */
11720   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
11721      delta = dis_SSEint_E_to_G( sorb, delta+2,
11722                                 "punpcklwd",
11723                                 Iop_InterleaveLO16x8, True );
11724      goto decode_success;
11725   }
11726
11727   /* 66 0F EF = PXOR */
11728   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
11729      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
11730      goto decode_success;
11731   }
11732
11733//--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
11734//--    if (insn[0] == 0x0F && insn[1] == 0xAE
11735//--        && (!epartIsReg(insn[2]))
11736//--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
11737//--       Bool store = gregOfRM(insn[2]) == 0;
11738//--       vg_assert(sz == 4);
11739//--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
11740//--       t1   = LOW24(pair);
11741//--       eip += 2+HI8(pair);
11742//--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
11743//--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
11744//--                   Lit16, (UShort)insn[2],
11745//--                   TempReg, t1 );
11746//--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
11747//--       goto decode_success;
11748//--    }
11749
11750   /* 0F AE /7 = CLFLUSH -- flush cache line */
11751   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
11752       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
11753
11754      /* This is something of a hack.  We need to know the size of the
11755         cache line containing addr.  Since we don't (easily), assume
11756         256 on the basis that no real cache would have a line that
11757         big.  It's safe to invalidate more stuff than we need, just
11758         inefficient. */
11759      UInt lineszB = 256;
11760
11761      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11762      delta += 2+alen;
11763
11764      /* Round addr down to the start of the containing block. */
11765      stmt( IRStmt_Put(
11766               OFFB_CMSTART,
11767               binop( Iop_And32,
11768                      mkexpr(addr),
11769                      mkU32( ~(lineszB-1) ))) );
11770
11771      stmt( IRStmt_Put(OFFB_CMLEN, mkU32(lineszB) ) );
11772
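      /* The two Puts above record the range to be invalidated; the
         Ijk_InvalICache exit then returns to the dispatcher, which is
         expected to discard any cached translations overlapping that
         range before resuming at the next insn. */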
11773      jmp_lit(&dres, Ijk_InvalICache, (Addr32)(guest_EIP_bbstart+delta));
11774
11775      DIP("clflush %s\n", dis_buf);
11776      goto decode_success;
11777   }
11778
11779   /* ---------------------------------------------------- */
11780   /* --- end of the SSE2 decoder.                     --- */
11781   /* ---------------------------------------------------- */
11782
11783   /* ---------------------------------------------------- */
11784   /* --- start of the SSE3 decoder.                   --- */
11785   /* ---------------------------------------------------- */
11786
11787   /* Skip parts of the decoder which don't apply given the stated
11788      guest subarchitecture. */
11789   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3))
11790      goto after_sse_decoders; /* no SSE3 capabilities */
11791
11792   insn = &guest_code[delta];
11793
11794   /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
11795      duplicating some lanes (2:2:0:0). */
11796   /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
11797      duplicating some lanes (3:3:1:1). */
11798   if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
11799       && (insn[2] == 0x12 || insn[2] == 0x16)) {
11800      IRTemp s3, s2, s1, s0;
11801      IRTemp sV  = newTemp(Ity_V128);
11802      Bool   isH = insn[2] == 0x16;
11803      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11804
11805      modrm = insn[3];
11806      if (epartIsReg(modrm)) {
11807         assign( sV, getXMMReg( eregOfRM(modrm)) );
11808         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
11809                                  nameXMMReg(eregOfRM(modrm)),
11810                                  nameXMMReg(gregOfRM(modrm)));
11811         delta += 3+1;
11812      } else {
11813         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11814         gen_SEGV_if_not_16_aligned( addr );
11815         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11816         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
             dis_buf,
11818             nameXMMReg(gregOfRM(modrm)));
11819         delta += 3+alen;
11820      }
11821
11822      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11823      putXMMReg( gregOfRM(modrm),
11824                 isH ? mk128from32s( s3, s3, s1, s1 )
11825                     : mk128from32s( s2, s2, s0, s0 ) );
11826      goto decode_success;
11827   }
11828
   /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (1:0:1:0). */
11831   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
11832      IRTemp sV = newTemp(Ity_V128);
11833      IRTemp d0 = newTemp(Ity_I64);
11834
11835      modrm = insn[3];
11836      if (epartIsReg(modrm)) {
11837         assign( sV, getXMMReg( eregOfRM(modrm)) );
11838         DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11839                                nameXMMReg(gregOfRM(modrm)));
11840         delta += 3+1;
11841         assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
11842      } else {
11843         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11844         assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
11845         DIP("movddup %s,%s\n", dis_buf,
11846                                nameXMMReg(gregOfRM(modrm)));
11847         delta += 3+alen;
11848      }
11849
11850      putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
11851      goto decode_success;
11852   }
11853
11854   /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
11855   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
11856      IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11857      IRTemp eV   = newTemp(Ity_V128);
11858      IRTemp gV   = newTemp(Ity_V128);
11859      IRTemp addV = newTemp(Ity_V128);
11860      IRTemp subV = newTemp(Ity_V128);
11861      IRTemp rm     = newTemp(Ity_I32);
11862      a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11863
11864      modrm = insn[3];
11865      if (epartIsReg(modrm)) {
11866         assign( eV, getXMMReg( eregOfRM(modrm)) );
11867         DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11868                                 nameXMMReg(gregOfRM(modrm)));
11869         delta += 3+1;
11870      } else {
11871         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11872         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11873         DIP("addsubps %s,%s\n", dis_buf,
11874                                 nameXMMReg(gregOfRM(modrm)));
11875         delta += 3+alen;
11876      }
11877
11878      assign( gV, getXMMReg(gregOfRM(modrm)) );
11879
11880      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11881      assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
11882      assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
11883
11884      breakup128to32s( addV, &a3, &a2, &a1, &a0 );
11885      breakup128to32s( subV, &s3, &s2, &s1, &s0 );
11886
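      /* Lanes 0 and 2 of the result take the subtract results, lanes
         1 and 3 the add results. */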
11887      putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
11888      goto decode_success;
11889   }
11890
   /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
11892   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
11893      IRTemp eV   = newTemp(Ity_V128);
11894      IRTemp gV   = newTemp(Ity_V128);
11895      IRTemp addV = newTemp(Ity_V128);
11896      IRTemp subV = newTemp(Ity_V128);
11897      IRTemp a1     = newTemp(Ity_I64);
11898      IRTemp s0     = newTemp(Ity_I64);
11899      IRTemp rm     = newTemp(Ity_I32);
11900
11901      modrm = insn[2];
11902      if (epartIsReg(modrm)) {
11903         assign( eV, getXMMReg( eregOfRM(modrm)) );
11904         DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11905                                 nameXMMReg(gregOfRM(modrm)));
11906         delta += 2+1;
11907      } else {
11908         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11909         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11910         DIP("addsubpd %s,%s\n", dis_buf,
11911                                 nameXMMReg(gregOfRM(modrm)));
11912         delta += 2+alen;
11913      }
11914
11915      assign( gV, getXMMReg(gregOfRM(modrm)) );
11916
11917      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11918      assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
11919      assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
11920
11921      assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
11922      assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
11923
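      /* Result: high lane = g1+e1, low lane = g0-e0, the pattern
         ADDSUBPD specifies. */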
11924      putXMMReg( gregOfRM(modrm),
11925                 binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
11926      goto decode_success;
11927   }
11928
11929   /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
11930   /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
11931   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
11932       && (insn[2] == 0x7C || insn[2] == 0x7D)) {
11933      IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
11934      IRTemp eV     = newTemp(Ity_V128);
11935      IRTemp gV     = newTemp(Ity_V128);
11936      IRTemp leftV  = newTemp(Ity_V128);
11937      IRTemp rightV = newTemp(Ity_V128);
11938      IRTemp rm     = newTemp(Ity_I32);
11939      Bool   isAdd  = insn[2] == 0x7C;
11940      const HChar* str = isAdd ? "add" : "sub";
11941      e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
11942
11943      modrm = insn[3];
11944      if (epartIsReg(modrm)) {
11945         assign( eV, getXMMReg( eregOfRM(modrm)) );
11946         DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11947                                   nameXMMReg(gregOfRM(modrm)));
11948         delta += 3+1;
11949      } else {
11950         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11951         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11952         DIP("h%sps %s,%s\n", str, dis_buf,
11953                                   nameXMMReg(gregOfRM(modrm)));
11954         delta += 3+alen;
11955      }
11956
11957      assign( gV, getXMMReg(gregOfRM(modrm)) );
11958
11959      breakup128to32s( eV, &e3, &e2, &e1, &e0 );
11960      breakup128to32s( gV, &g3, &g2, &g1, &g0 );
11961
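      /* Regroup the operands: leftV takes the even-numbered lane of
         each adjacent pair and rightV the odd-numbered one, with E's
         pairs in the top half and G's in the bottom half.  A single
         lane-wise add/sub of leftV and rightV then yields all four
         horizontal results in the layout HADDPS/HSUBPS require. */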
11962      assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
11963      assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
11964
11965      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11966      putXMMReg( gregOfRM(modrm),
11967                 triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
11968                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
11969      goto decode_success;
11970   }
11971
11972   /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
11973   /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
11974   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
11975      IRTemp e1     = newTemp(Ity_I64);
11976      IRTemp e0     = newTemp(Ity_I64);
11977      IRTemp g1     = newTemp(Ity_I64);
11978      IRTemp g0     = newTemp(Ity_I64);
11979      IRTemp eV     = newTemp(Ity_V128);
11980      IRTemp gV     = newTemp(Ity_V128);
11981      IRTemp leftV  = newTemp(Ity_V128);
11982      IRTemp rightV = newTemp(Ity_V128);
11983      IRTemp rm     = newTemp(Ity_I32);
11984      Bool   isAdd  = insn[1] == 0x7C;
11985      const HChar* str = isAdd ? "add" : "sub";
11986
11987      modrm = insn[2];
11988      if (epartIsReg(modrm)) {
11989         assign( eV, getXMMReg( eregOfRM(modrm)) );
11990         DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11991                                   nameXMMReg(gregOfRM(modrm)));
11992         delta += 2+1;
11993      } else {
11994         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11995         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11996         DIP("h%spd %s,%s\n", str, dis_buf,
11997                              nameXMMReg(gregOfRM(modrm)));
11998         delta += 2+alen;
11999      }
12000
12001      assign( gV, getXMMReg(gregOfRM(modrm)) );
12002
12003      assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
12004      assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
12005      assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
12006      assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
12007
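      /* leftV = [e0 g0], rightV = [e1 g1]; one lane-wise add/sub then
         gives E's horizontal result in the high lane and G's in the
         low lane, as HADDPD/HSUBPD require. */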
12008      assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
12009      assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
12010
12011      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
12012      putXMMReg( gregOfRM(modrm),
12013                 triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
12014                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
12015      goto decode_success;
12016   }
12017
12018   /* F2 0F F0 = LDDQU -- unaligned 128-bit move from E (mem only) to G (xmm). */
12019   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
12020      modrm = getIByte(delta+3);
12021      if (epartIsReg(modrm)) {
12022         goto decode_failure;
12023      } else {
12024         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12025         putXMMReg( gregOfRM(modrm),
12026                    loadLE(Ity_V128, mkexpr(addr)) );
12027         DIP("lddqu %s,%s\n", dis_buf,
12028                              nameXMMReg(gregOfRM(modrm)));
12029         delta += 3+alen;
12030      }
12031      goto decode_success;
12032   }
12033
12034   /* ---------------------------------------------------- */
12035   /* --- end of the SSE3 decoder.                     --- */
12036   /* ---------------------------------------------------- */
12037
12038   /* ---------------------------------------------------- */
12039   /* --- start of the SSSE3 decoder.                  --- */
12040   /* ---------------------------------------------------- */
12041
12042   /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
12043      Unsigned Bytes (MMX) */
12044   if (sz == 4
12045       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
12046      IRTemp sV        = newTemp(Ity_I64);
12047      IRTemp dV        = newTemp(Ity_I64);
12048      IRTemp sVoddsSX  = newTemp(Ity_I64);
12049      IRTemp sVevensSX = newTemp(Ity_I64);
12050      IRTemp dVoddsZX  = newTemp(Ity_I64);
12051      IRTemp dVevensZX = newTemp(Ity_I64);
12052
12053      modrm = insn[3];
12054      do_MMX_preamble();
12055      assign( dV, getMMXReg(gregOfRM(modrm)) );
12056
12057      if (epartIsReg(modrm)) {
12058         assign( sV, getMMXReg(eregOfRM(modrm)) );
12059         delta += 3+1;
12060         DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12061                                  nameMMXReg(gregOfRM(modrm)));
12062      } else {
12063         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12064         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12065         delta += 3+alen;
12066         DIP("pmaddubsw %s,%s\n", dis_buf,
12067                                  nameMMXReg(gregOfRM(modrm)));
12068      }
12069
12070      /* compute dV unsigned x sV signed */
12071      assign( sVoddsSX,
12072              binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
12073      assign( sVevensSX,
12074              binop(Iop_SarN16x4,
12075                    binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
12076                    mkU8(8)) );
12077      assign( dVoddsZX,
12078              binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
12079      assign( dVevensZX,
12080              binop(Iop_ShrN16x4,
12081                    binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
12082                    mkU8(8)) );
12083
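      /* Each signed 16-bit result lane is the signed-saturating sum of
         two byte products -- zero-extended dV byte times sign-extended
         sV byte -- taken from the even and odd byte of that lane. */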
12084      putMMXReg(
12085         gregOfRM(modrm),
12086         binop(Iop_QAdd16Sx4,
12087               binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
12088               binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
12089         )
12090      );
12091      goto decode_success;
12092   }
12093
12094   /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
12095      Unsigned Bytes (XMM) */
12096   if (sz == 2
12097       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
12098      IRTemp sV        = newTemp(Ity_V128);
12099      IRTemp dV        = newTemp(Ity_V128);
12100      IRTemp sVoddsSX  = newTemp(Ity_V128);
12101      IRTemp sVevensSX = newTemp(Ity_V128);
12102      IRTemp dVoddsZX  = newTemp(Ity_V128);
12103      IRTemp dVevensZX = newTemp(Ity_V128);
12104
12105      modrm = insn[3];
12106      assign( dV, getXMMReg(gregOfRM(modrm)) );
12107
12108      if (epartIsReg(modrm)) {
12109         assign( sV, getXMMReg(eregOfRM(modrm)) );
12110         delta += 3+1;
12111         DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12112                                  nameXMMReg(gregOfRM(modrm)));
12113      } else {
12114         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12115         gen_SEGV_if_not_16_aligned( addr );
12116         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12117         delta += 3+alen;
12118         DIP("pmaddubsw %s,%s\n", dis_buf,
12119                                  nameXMMReg(gregOfRM(modrm)));
12120      }
12121
12122      /* compute dV unsigned x sV signed */
12123      assign( sVoddsSX,
12124              binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
12125      assign( sVevensSX,
12126              binop(Iop_SarN16x8,
12127                    binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
12128                    mkU8(8)) );
12129      assign( dVoddsZX,
12130              binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
12131      assign( dVevensZX,
12132              binop(Iop_ShrN16x8,
12133                    binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
12134                    mkU8(8)) );
12135
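      /* Same odd/even extract-and-multiply scheme as the MMX case
         above, just applied to eight 16-bit lanes rather than four. */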
12136      putXMMReg(
12137         gregOfRM(modrm),
12138         binop(Iop_QAdd16Sx8,
12139               binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
12140               binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
12141         )
12142      );
12143      goto decode_success;
12144   }
12145
12146   /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
12147   /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
12148      mmx) and G to G (mmx). */
12149   /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
12150      mmx) and G to G (mmx). */
12151   /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
12152      to G (mmx). */
12153   /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
12154      to G (mmx). */
12155   /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
12156      to G (mmx). */
12157   /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
12158      to G (mmx). */
12159
12160   if (sz == 4
12161       && insn[0] == 0x0F && insn[1] == 0x38
12162       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
12163           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
12164      const HChar* str = "???";
12165      IROp   opV64  = Iop_INVALID;
12166      IROp   opCatO = Iop_CatOddLanes16x4;
12167      IROp   opCatE = Iop_CatEvenLanes16x4;
12168      IRTemp sV     = newTemp(Ity_I64);
12169      IRTemp dV     = newTemp(Ity_I64);
12170
12171      modrm = insn[3];
12172
12173      switch (insn[2]) {
12174         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
12175         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
12176         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
12177         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
12178         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
12179         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
12180         default: vassert(0);
12181      }
12182      if (insn[2] == 0x02 || insn[2] == 0x06) {
12183         opCatO = Iop_InterleaveHI32x2;
12184         opCatE = Iop_InterleaveLO32x2;
12185      }
12186
12187      do_MMX_preamble();
12188      assign( dV, getMMXReg(gregOfRM(modrm)) );
12189
12190      if (epartIsReg(modrm)) {
12191         assign( sV, getMMXReg(eregOfRM(modrm)) );
12192         delta += 3+1;
12193         DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12194                                  nameMMXReg(gregOfRM(modrm)));
12195      } else {
12196         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12197         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12198         delta += 3+alen;
12199         DIP("ph%s %s,%s\n", str, dis_buf,
12200                                  nameMMXReg(gregOfRM(modrm)));
12201      }
12202
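      /* The horizontal op is done as a vertical op: opCatE gathers the
         even-numbered lanes (E's in the high half, G's in the low half)
         and opCatO the odd-numbered ones, so one lane-wise add/sub
         produces all the pairwise results in the required order. */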
12203      putMMXReg(
12204         gregOfRM(modrm),
12205         binop(opV64,
12206               binop(opCatE,mkexpr(sV),mkexpr(dV)),
12207               binop(opCatO,mkexpr(sV),mkexpr(dV))
12208         )
12209      );
12210      goto decode_success;
12211   }
12212
12213   /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
12214      xmm) and G to G (xmm). */
12215   /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
12216      xmm) and G to G (xmm). */
12217   /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
12218      G to G (xmm). */
12219   /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
12220      G to G (xmm). */
12221   /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
12222      G to G (xmm). */
12223   /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
12224      G to G (xmm). */
12225
12226   if (sz == 2
12227       && insn[0] == 0x0F && insn[1] == 0x38
12228       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
12229           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
12230      const HChar* str = "???";
12231      IROp   opV64  = Iop_INVALID;
12232      IROp   opCatO = Iop_CatOddLanes16x4;
12233      IROp   opCatE = Iop_CatEvenLanes16x4;
12234      IRTemp sV     = newTemp(Ity_V128);
12235      IRTemp dV     = newTemp(Ity_V128);
12236      IRTemp sHi    = newTemp(Ity_I64);
12237      IRTemp sLo    = newTemp(Ity_I64);
12238      IRTemp dHi    = newTemp(Ity_I64);
12239      IRTemp dLo    = newTemp(Ity_I64);
12240
12241      modrm = insn[3];
12242
12243      switch (insn[2]) {
12244         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
12245         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
12246         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
12247         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
12248         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
12249         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
12250         default: vassert(0);
12251      }
12252      if (insn[2] == 0x02 || insn[2] == 0x06) {
12253         opCatO = Iop_InterleaveHI32x2;
12254         opCatE = Iop_InterleaveLO32x2;
12255      }
12256
12257      assign( dV, getXMMReg(gregOfRM(modrm)) );
12258
12259      if (epartIsReg(modrm)) {
12260         assign( sV, getXMMReg( eregOfRM(modrm)) );
12261         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12262                                  nameXMMReg(gregOfRM(modrm)));
12263         delta += 3+1;
12264      } else {
12265         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12266         gen_SEGV_if_not_16_aligned( addr );
12267         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12268         DIP("ph%s %s,%s\n", str, dis_buf,
12269                             nameXMMReg(gregOfRM(modrm)));
12270         delta += 3+alen;
12271      }
12272
12273      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12274      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12275      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12276      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12277
12278      /* This isn't a particularly efficient way to compute the
12279         result, but at least it avoids a proliferation of IROps,
12280         hence avoids complicating all the backends. */
12281      putXMMReg(
12282         gregOfRM(modrm),
12283         binop(Iop_64HLtoV128,
12284               binop(opV64,
12285                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
12286                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
12287               ),
12288               binop(opV64,
12289                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
12290                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
12291               )
12292         )
12293      );
12294      goto decode_success;
12295   }
12296
12297   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
12298      (MMX) */
12299   if (sz == 4
12300       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12301      IRTemp sV = newTemp(Ity_I64);
12302      IRTemp dV = newTemp(Ity_I64);
12303
12304      modrm = insn[3];
12305      do_MMX_preamble();
12306      assign( dV, getMMXReg(gregOfRM(modrm)) );
12307
12308      if (epartIsReg(modrm)) {
12309         assign( sV, getMMXReg(eregOfRM(modrm)) );
12310         delta += 3+1;
12311         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12312                                 nameMMXReg(gregOfRM(modrm)));
12313      } else {
12314         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12315         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12316         delta += 3+alen;
12317         DIP("pmulhrsw %s,%s\n", dis_buf,
12318                                 nameMMXReg(gregOfRM(modrm)));
12319      }
12320
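      /* Per 16-bit lane, PMULHRSW computes (((s*d) >> 14) + 1) >> 1,
         i.e. the high half of the product rounded to nearest; that is
         what dis_PMULHRSW_helper builds. */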
12321      putMMXReg(
12322         gregOfRM(modrm),
12323         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
12324      );
12325      goto decode_success;
12326   }
12327
12328   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
12329      Scale (XMM) */
12330   if (sz == 2
12331       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12332      IRTemp sV  = newTemp(Ity_V128);
12333      IRTemp dV  = newTemp(Ity_V128);
12334      IRTemp sHi = newTemp(Ity_I64);
12335      IRTemp sLo = newTemp(Ity_I64);
12336      IRTemp dHi = newTemp(Ity_I64);
12337      IRTemp dLo = newTemp(Ity_I64);
12338
12339      modrm = insn[3];
12340      assign( dV, getXMMReg(gregOfRM(modrm)) );
12341
12342      if (epartIsReg(modrm)) {
12343         assign( sV, getXMMReg(eregOfRM(modrm)) );
12344         delta += 3+1;
12345         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12346                                 nameXMMReg(gregOfRM(modrm)));
12347      } else {
12348         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12349         gen_SEGV_if_not_16_aligned( addr );
12350         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12351         delta += 3+alen;
12352         DIP("pmulhrsw %s,%s\n", dis_buf,
12353                                 nameXMMReg(gregOfRM(modrm)));
12354      }
12355
12356      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12357      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12358      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12359      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12360
12361      putXMMReg(
12362         gregOfRM(modrm),
12363         binop(Iop_64HLtoV128,
12364               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
12365               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
12366         )
12367      );
12368      goto decode_success;
12369   }
12370
12371   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
12372   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
12373   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
12374   if (sz == 4
12375       && insn[0] == 0x0F && insn[1] == 0x38
12376       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12377      IRTemp sV      = newTemp(Ity_I64);
12378      IRTemp dV      = newTemp(Ity_I64);
12379      const HChar* str = "???";
12380      Int    laneszB = 0;
12381
12382      switch (insn[2]) {
12383         case 0x08: laneszB = 1; str = "b"; break;
12384         case 0x09: laneszB = 2; str = "w"; break;
12385         case 0x0A: laneszB = 4; str = "d"; break;
12386         default: vassert(0);
12387      }
12388
12389      modrm = insn[3];
12390      do_MMX_preamble();
12391      assign( dV, getMMXReg(gregOfRM(modrm)) );
12392
12393      if (epartIsReg(modrm)) {
12394         assign( sV, getMMXReg(eregOfRM(modrm)) );
12395         delta += 3+1;
12396         DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12397                                     nameMMXReg(gregOfRM(modrm)));
12398      } else {
12399         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12400         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12401         delta += 3+alen;
12402         DIP("psign%s %s,%s\n", str, dis_buf,
12403                                     nameMMXReg(gregOfRM(modrm)));
12404      }
12405
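      /* PSIGN: each dV lane is negated if the corresponding sV lane is
         negative, zeroed if the sV lane is zero, and left unchanged
         otherwise. */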
12406      putMMXReg(
12407         gregOfRM(modrm),
12408         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
12409      );
12410      goto decode_success;
12411   }
12412
12413   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
12414   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
12415   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
12416   if (sz == 2
12417       && insn[0] == 0x0F && insn[1] == 0x38
12418       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12419      IRTemp sV      = newTemp(Ity_V128);
12420      IRTemp dV      = newTemp(Ity_V128);
12421      IRTemp sHi     = newTemp(Ity_I64);
12422      IRTemp sLo     = newTemp(Ity_I64);
12423      IRTemp dHi     = newTemp(Ity_I64);
12424      IRTemp dLo     = newTemp(Ity_I64);
12425      const HChar* str = "???";
12426      Int    laneszB = 0;
12427
12428      switch (insn[2]) {
12429         case 0x08: laneszB = 1; str = "b"; break;
12430         case 0x09: laneszB = 2; str = "w"; break;
12431         case 0x0A: laneszB = 4; str = "d"; break;
12432         default: vassert(0);
12433      }
12434
12435      modrm = insn[3];
12436      assign( dV, getXMMReg(gregOfRM(modrm)) );
12437
12438      if (epartIsReg(modrm)) {
12439         assign( sV, getXMMReg(eregOfRM(modrm)) );
12440         delta += 3+1;
12441         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12442                                     nameXMMReg(gregOfRM(modrm)));
12443      } else {
12444         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12445         gen_SEGV_if_not_16_aligned( addr );
12446         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12447         delta += 3+alen;
12448         DIP("psign%s %s,%s\n", str, dis_buf,
12449                                     nameXMMReg(gregOfRM(modrm)));
12450      }
12451
12452      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12453      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12454      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12455      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12456
12457      putXMMReg(
12458         gregOfRM(modrm),
12459         binop(Iop_64HLtoV128,
12460               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
12461               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
12462         )
12463      );
12464      goto decode_success;
12465   }
12466
12467   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
12468   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
12469   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
12470   if (sz == 4
12471       && insn[0] == 0x0F && insn[1] == 0x38
12472       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12473      IRTemp sV      = newTemp(Ity_I64);
12474      const HChar* str = "???";
12475      Int    laneszB = 0;
12476
12477      switch (insn[2]) {
12478         case 0x1C: laneszB = 1; str = "b"; break;
12479         case 0x1D: laneszB = 2; str = "w"; break;
12480         case 0x1E: laneszB = 4; str = "d"; break;
12481         default: vassert(0);
12482      }
12483
12484      modrm = insn[3];
12485      do_MMX_preamble();
12486
12487      if (epartIsReg(modrm)) {
12488         assign( sV, getMMXReg(eregOfRM(modrm)) );
12489         delta += 3+1;
12490         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12491                                    nameMMXReg(gregOfRM(modrm)));
12492      } else {
12493         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12494         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12495         delta += 3+alen;
12496         DIP("pabs%s %s,%s\n", str, dis_buf,
12497                                    nameMMXReg(gregOfRM(modrm)));
12498      }
12499
12500      putMMXReg(
12501         gregOfRM(modrm),
12502         dis_PABS_helper( mkexpr(sV), laneszB )
12503      );
12504      goto decode_success;
12505   }
12506
12507   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
12508   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
12509   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
12510   if (sz == 2
12511       && insn[0] == 0x0F && insn[1] == 0x38
12512       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12513      IRTemp sV      = newTemp(Ity_V128);
12514      IRTemp sHi     = newTemp(Ity_I64);
12515      IRTemp sLo     = newTemp(Ity_I64);
12516      const HChar* str = "???";
12517      Int    laneszB = 0;
12518
12519      switch (insn[2]) {
12520         case 0x1C: laneszB = 1; str = "b"; break;
12521         case 0x1D: laneszB = 2; str = "w"; break;
12522         case 0x1E: laneszB = 4; str = "d"; break;
12523         default: vassert(0);
12524      }
12525
12526      modrm = insn[3];
12527
12528      if (epartIsReg(modrm)) {
12529         assign( sV, getXMMReg(eregOfRM(modrm)) );
12530         delta += 3+1;
12531         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12532                                    nameXMMReg(gregOfRM(modrm)));
12533      } else {
12534         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12535         gen_SEGV_if_not_16_aligned( addr );
12536         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12537         delta += 3+alen;
12538         DIP("pabs%s %s,%s\n", str, dis_buf,
12539                                    nameXMMReg(gregOfRM(modrm)));
12540      }
12541
12542      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12543      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12544
12545      putXMMReg(
12546         gregOfRM(modrm),
12547         binop(Iop_64HLtoV128,
12548               dis_PABS_helper( mkexpr(sHi), laneszB ),
12549               dis_PABS_helper( mkexpr(sLo), laneszB )
12550         )
12551      );
12552      goto decode_success;
12553   }
12554
12555   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
12556   if (sz == 4
12557       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12558      IRTemp sV  = newTemp(Ity_I64);
12559      IRTemp dV  = newTemp(Ity_I64);
12560      IRTemp res = newTemp(Ity_I64);
12561
12562      modrm = insn[3];
12563      do_MMX_preamble();
12564      assign( dV, getMMXReg(gregOfRM(modrm)) );
12565
12566      if (epartIsReg(modrm)) {
12567         assign( sV, getMMXReg(eregOfRM(modrm)) );
12568         d32 = (UInt)insn[3+1];
12569         delta += 3+1+1;
12570         DIP("palignr $%u,%s,%s\n",  d32,
12571                                     nameMMXReg(eregOfRM(modrm)),
12572                                     nameMMXReg(gregOfRM(modrm)));
12573      } else {
12574         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12575         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12576         d32 = (UInt)insn[3+alen];
12577         delta += 3+alen+1;
12578         DIP("palignr $%u,%s,%s\n", d32,
12579                                   dis_buf,
12580                                   nameMMXReg(gregOfRM(modrm)));
12581      }
12582
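      /* Conceptually palignr forms the 16-byte value dV:sV (dV in the
         high 8 bytes), shifts it right by d32 bytes and keeps the low 8
         bytes.  E.g. d32 == 3 selects bytes 3..10 of the concatenation,
         and any d32 >= 16 shifts everything out, giving zero. */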
12583      if (d32 == 0) {
12584         assign( res, mkexpr(sV) );
12585      }
12586      else if (d32 >= 1 && d32 <= 7) {
12587         assign(res,
12588                binop(Iop_Or64,
12589                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
12590                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
12591                     )));
12592      }
12593      else if (d32 == 8) {
12594        assign( res, mkexpr(dV) );
12595      }
12596      else if (d32 >= 9 && d32 <= 15) {
12597         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
12598      }
12599      else if (d32 >= 16 && d32 <= 255) {
12600         assign( res, mkU64(0) );
12601      }
12602      else
12603         vassert(0);
12604
12605      putMMXReg( gregOfRM(modrm), mkexpr(res) );
12606      goto decode_success;
12607   }
12608
12609   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
12610   if (sz == 2
12611       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12612      IRTemp sV  = newTemp(Ity_V128);
12613      IRTemp dV  = newTemp(Ity_V128);
12614      IRTemp sHi = newTemp(Ity_I64);
12615      IRTemp sLo = newTemp(Ity_I64);
12616      IRTemp dHi = newTemp(Ity_I64);
12617      IRTemp dLo = newTemp(Ity_I64);
12618      IRTemp rHi = newTemp(Ity_I64);
12619      IRTemp rLo = newTemp(Ity_I64);
12620
12621      modrm = insn[3];
12622      assign( dV, getXMMReg(gregOfRM(modrm)) );
12623
12624      if (epartIsReg(modrm)) {
12625         assign( sV, getXMMReg(eregOfRM(modrm)) );
12626         d32 = (UInt)insn[3+1];
12627         delta += 3+1+1;
12628         DIP("palignr $%u,%s,%s\n", d32,
12629                                    nameXMMReg(eregOfRM(modrm)),
12630                                    nameXMMReg(gregOfRM(modrm)));
12631      } else {
12632         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12633         gen_SEGV_if_not_16_aligned( addr );
12634         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12635         d32 = (UInt)insn[3+alen];
12636         delta += 3+alen+1;
12637         DIP("palignr $%u,%s,%s\n", d32,
12638                                    dis_buf,
12639                                    nameXMMReg(gregOfRM(modrm)));
12640      }
12641
12642      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12643      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12644      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12645      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12646
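      /* Same idea as the MMX case: shift the 32-byte value dV:sV right
         by d32 bytes and keep the low 16 bytes, built 64 bits at a time
         with dis_PALIGNR_XMM_helper. */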
12647      if (d32 == 0) {
12648         assign( rHi, mkexpr(sHi) );
12649         assign( rLo, mkexpr(sLo) );
12650      }
12651      else if (d32 >= 1 && d32 <= 7) {
12652         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
12653         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
12654      }
12655      else if (d32 == 8) {
12656         assign( rHi, mkexpr(dLo) );
12657         assign( rLo, mkexpr(sHi) );
12658      }
12659      else if (d32 >= 9 && d32 <= 15) {
12660         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
12661         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
12662      }
12663      else if (d32 == 16) {
12664         assign( rHi, mkexpr(dHi) );
12665         assign( rLo, mkexpr(dLo) );
12666      }
12667      else if (d32 >= 17 && d32 <= 23) {
12668         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
12669         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
12670      }
12671      else if (d32 == 24) {
12672         assign( rHi, mkU64(0) );
12673         assign( rLo, mkexpr(dHi) );
12674      }
12675      else if (d32 >= 25 && d32 <= 31) {
12676         assign( rHi, mkU64(0) );
12677         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
12678      }
12679      else if (d32 >= 32 && d32 <= 255) {
12680         assign( rHi, mkU64(0) );
12681         assign( rLo, mkU64(0) );
12682      }
12683      else
12684         vassert(0);
12685
12686      putXMMReg(
12687         gregOfRM(modrm),
12688         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12689      );
12690      goto decode_success;
12691   }
12692
12693   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
12694   if (sz == 4
12695       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12696      IRTemp sV      = newTemp(Ity_I64);
12697      IRTemp dV      = newTemp(Ity_I64);
12698
12699      modrm = insn[3];
12700      do_MMX_preamble();
12701      assign( dV, getMMXReg(gregOfRM(modrm)) );
12702
12703      if (epartIsReg(modrm)) {
12704         assign( sV, getMMXReg(eregOfRM(modrm)) );
12705         delta += 3+1;
12706         DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12707                               nameMMXReg(gregOfRM(modrm)));
12708      } else {
12709         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12710         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12711         delta += 3+alen;
12712         DIP("pshufb %s,%s\n", dis_buf,
12713                               nameMMXReg(gregOfRM(modrm)));
12714      }
12715
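      /* Each result byte i is dV[sV[i] & 7], or zero if bit 7 of sV[i]
         is set.  Perm8x8 does the byte select; the Sar/Not term builds
         the zeroing mask from the index sign bits. */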
12716      putMMXReg(
12717         gregOfRM(modrm),
12718         binop(
12719            Iop_And64,
12720            /* permute the lanes */
12721            binop(
12722               Iop_Perm8x8,
12723               mkexpr(dV),
12724               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
12725            ),
12726            /* mask off lanes which have (index & 0x80) == 0x80 */
12727            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
12728         )
12729      );
12730      goto decode_success;
12731   }
12732
12733   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
12734   if (sz == 2
12735       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12736      IRTemp sV         = newTemp(Ity_V128);
12737      IRTemp dV         = newTemp(Ity_V128);
12738      IRTemp sHi        = newTemp(Ity_I64);
12739      IRTemp sLo        = newTemp(Ity_I64);
12740      IRTemp dHi        = newTemp(Ity_I64);
12741      IRTemp dLo        = newTemp(Ity_I64);
12742      IRTemp rHi        = newTemp(Ity_I64);
12743      IRTemp rLo        = newTemp(Ity_I64);
12744      IRTemp sevens     = newTemp(Ity_I64);
12745      IRTemp mask0x80hi = newTemp(Ity_I64);
12746      IRTemp mask0x80lo = newTemp(Ity_I64);
12747      IRTemp maskBit3hi = newTemp(Ity_I64);
12748      IRTemp maskBit3lo = newTemp(Ity_I64);
12749      IRTemp sAnd7hi    = newTemp(Ity_I64);
12750      IRTemp sAnd7lo    = newTemp(Ity_I64);
12751      IRTemp permdHi    = newTemp(Ity_I64);
12752      IRTemp permdLo    = newTemp(Ity_I64);
12753
12754      modrm = insn[3];
12755      assign( dV, getXMMReg(gregOfRM(modrm)) );
12756
12757      if (epartIsReg(modrm)) {
12758         assign( sV, getXMMReg(eregOfRM(modrm)) );
12759         delta += 3+1;
12760         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12761                               nameXMMReg(gregOfRM(modrm)));
12762      } else {
12763         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12764         gen_SEGV_if_not_16_aligned( addr );
12765         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12766         delta += 3+alen;
12767         DIP("pshufb %s,%s\n", dis_buf,
12768                               nameXMMReg(gregOfRM(modrm)));
12769      }
12770
12771      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12772      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12773      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12774      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12775
12776      assign( sevens, mkU64(0x0707070707070707ULL) );
12777
12778      /*
12779      mask0x80hi = Not(SarN8x8(sHi,7))
12780      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
12781      sAnd7hi    = And(sHi,sevens)
12782      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
12783                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
12784      rHi        = And(permdHi,mask0x80hi)
12785      */
12786      assign(
12787         mask0x80hi,
12788         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
12789
12790      assign(
12791         maskBit3hi,
12792         binop(Iop_SarN8x8,
12793               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
12794               mkU8(7)));
12795
12796      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
12797
12798      assign(
12799         permdHi,
12800         binop(
12801            Iop_Or64,
12802            binop(Iop_And64,
12803                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
12804                  mkexpr(maskBit3hi)),
12805            binop(Iop_And64,
12806                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
12807                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));
12808
12809      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
12810
12811      /* And the same for the lower half of the result.  What fun. */
12812
12813      assign(
12814         mask0x80lo,
12815         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
12816
12817      assign(
12818         maskBit3lo,
12819         binop(Iop_SarN8x8,
12820               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
12821               mkU8(7)));
12822
12823      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
12824
12825      assign(
12826         permdLo,
12827         binop(
12828            Iop_Or64,
12829            binop(Iop_And64,
12830                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
12831                  mkexpr(maskBit3lo)),
12832            binop(Iop_And64,
12833                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
12834                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));
12835
12836      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
12837
12838      putXMMReg(
12839         gregOfRM(modrm),
12840         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12841      );
12842      goto decode_success;
12843   }
12844
12845   /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
12846   /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
12847   if ((sz == 2 || sz == 4)
12848       && insn[0] == 0x0F && insn[1] == 0x38
12849       && (insn[2] == 0xF0 || insn[2] == 0xF1)
12850       && !epartIsReg(insn[3])) {
12851
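      /* MOVBE is a load (F0) or store (F1) with the byte order
         reversed, so it is modelled as an ordinary access combined with
         a byte swap.  Register-to-register forms do not exist, hence
         the !epartIsReg guard above. */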
12852      modrm = insn[3];
12853      addr = disAMode(&alen, sorb, delta + 3, dis_buf);
12854      delta += 3 + alen;
12855      ty = szToITy(sz);
12856      IRTemp src = newTemp(ty);
12857
12858      if (insn[2] == 0xF0) { /* LOAD */
12859         assign(src, loadLE(ty, mkexpr(addr)));
12860         IRTemp dst = math_BSWAP(src, ty);
12861         putIReg(sz, gregOfRM(modrm), mkexpr(dst));
12862         DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
12863      } else { /* STORE */
12864         assign(src, getIReg(sz, gregOfRM(modrm)));
12865         IRTemp dst = math_BSWAP(src, ty);
12866         storeLE(mkexpr(addr), mkexpr(dst));
12867         DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
12868      }
12869      goto decode_success;
12870   }
12871
12872   /* ---------------------------------------------------- */
12873   /* --- end of the SSSE3 decoder.                    --- */
12874   /* ---------------------------------------------------- */
12875
12876   /* ---------------------------------------------------- */
12877   /* --- start of the SSE4 decoder                    --- */
12878   /* ---------------------------------------------------- */
12879
12880   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
12881      (Partial implementation -- only cases where the rounding mode is
12882      specified directly by the immediate byte are handled.)
12883      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
12884      (Limitations ditto)
12885   */
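   /* Any immediate with bits above 1 set -- which, as far as the
      SSE4.1 encoding goes, requests MXCSR-based rounding or
      precision-exception suppression -- is rejected below, in line
      with the partial-implementation note above. */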
12886   if (sz == 2
12887       && insn[0] == 0x0F && insn[1] == 0x3A
12888       && (insn[2] == 0x0B || insn[2] == 0x0A)) {
12889
12890      Bool   isD = insn[2] == 0x0B;
12891      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
12892      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
12893      Int    imm = 0;
12894
12895      modrm = insn[3];
12896
12897      if (epartIsReg(modrm)) {
12898         assign( src,
12899                 isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
12900                     : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
12901         imm = insn[3+1];
12902         if (imm & ~3) goto decode_failure;
12903         delta += 3+1+1;
12904         DIP( "rounds%c $%d,%s,%s\n",
12905              isD ? 'd' : 's',
12906              imm, nameXMMReg( eregOfRM(modrm) ),
12907                   nameXMMReg( gregOfRM(modrm) ) );
12908      } else {
12909         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12910         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
12911         imm = insn[3+alen];
12912         if (imm & ~3) goto decode_failure;
12913         delta += 3+alen+1;
12914         DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's',
12915              imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
12916      }
12917
12918      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
12919         that encoding is the same as the encoding for IRRoundingMode,
12920         we can use that value directly in the IR as a rounding
12921         mode. */
12922      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
12923                  mkU32(imm & 3), mkexpr(src)) );
12924
12925      if (isD)
12926         putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
12927      else
12928         putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
12929
12930      goto decode_success;
12931   }
12932
12933   /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
12934      which we can only decode if we're sure this is an AMD cpu that
12935      supports LZCNT, since otherwise it's BSR, which behaves
12936      differently. */
12937   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
12938       && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
12939      vassert(sz == 2 || sz == 4);
12940      /*IRType*/ ty  = szToITy(sz);
12941      IRTemp     src = newTemp(ty);
12942      modrm = insn[3];
12943      if (epartIsReg(modrm)) {
12944         assign(src, getIReg(sz, eregOfRM(modrm)));
12945         delta += 3+1;
12946         DIP("lzcnt%c %s, %s\n", nameISize(sz),
12947             nameIReg(sz, eregOfRM(modrm)),
12948             nameIReg(sz, gregOfRM(modrm)));
12949      } else {
12950         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12951         assign(src, loadLE(ty, mkexpr(addr)));
12952         delta += 3+alen;
12953         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
12954             nameIReg(sz, gregOfRM(modrm)));
12955      }
12956
12957      IRTemp res = gen_LZCNT(ty, src);
12958      putIReg(sz, gregOfRM(modrm), mkexpr(res));
12959
12960      // Update flags.  This is pretty lame .. perhaps can do better
12961      // if this turns out to be performance critical.
12962      // O S A P are cleared.  Z is set if RESULT == 0.
12963      // C is set if SRC is zero.
12964      IRTemp src32 = newTemp(Ity_I32);
12965      IRTemp res32 = newTemp(Ity_I32);
12966      assign(src32, widenUto32(mkexpr(src)));
12967      assign(res32, widenUto32(mkexpr(res)));
12968
12969      IRTemp oszacp = newTemp(Ity_I32);
12970      assign(
12971         oszacp,
12972         binop(Iop_Or32,
12973               binop(Iop_Shl32,
12974                     unop(Iop_1Uto32,
12975                          binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
12976                     mkU8(X86G_CC_SHIFT_Z)),
12977               binop(Iop_Shl32,
12978                     unop(Iop_1Uto32,
12979                          binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
12980                     mkU8(X86G_CC_SHIFT_C))
12981         )
12982      );
12983
12984      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12985      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12986      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12987      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
12988
12989      goto decode_success;
12990   }
12991
12992   /* ---------------------------------------------------- */
12993   /* --- end of the SSE4 decoder                      --- */
12994   /* ---------------------------------------------------- */
12995
12996   after_sse_decoders:
12997
12998   /* ---------------------------------------------------- */
12999   /* --- deal with misc 0x67 pfxs (addr size override) -- */
13000   /* ---------------------------------------------------- */
13001
13002   /* 67 E3 = JCXZ (for JECXZ see below) */
13003   if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
13004      delta += 2;
13005      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13006      delta ++;
13007      stmt( IRStmt_Exit(
13008               binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
13009               Ijk_Boring,
13010               IRConst_U32(d32),
13011               OFFB_EIP
13012            ));
13013      DIP("jcxz 0x%x\n", d32);
13014      goto decode_success;
13015   }
13016
13017   /* ---------------------------------------------------- */
13018   /* --- start of the baseline insn decoder            -- */
13019   /* ---------------------------------------------------- */
13020
13021   /* Get the primary opcode. */
13022   opc = getIByte(delta); delta++;
13023
13024   /* We get here if the current insn isn't SSE, or this CPU doesn't
13025      support SSE. */
13026
13027   switch (opc) {
13028
13029   /* ------------------------ Control flow --------------- */
13030
13031   case 0xC2: /* RET imm16 */
13032      d32 = getUDisp16(delta);
13033      delta += 2;
13034      dis_ret(&dres, d32);
13035      DIP("ret %u\n", d32);
13036      break;
13037   case 0xC3: /* RET */
13038      dis_ret(&dres, 0);
13039      DIP("ret\n");
13040      break;
13041
13042   case 0xCF: /* IRET */
13043      /* Note, this is an extremely kludgey and limited implementation
13044         of iret.  All it really does is:
13045            popl %EIP; popl %CS; popl %EFLAGS.
13046         %CS is set but ignored (as it is in (eg) popw %cs). */
13047      t1 = newTemp(Ity_I32); /* ESP */
13048      t2 = newTemp(Ity_I32); /* new EIP */
13049      t3 = newTemp(Ity_I32); /* new CS */
13050      t4 = newTemp(Ity_I32); /* new EFLAGS */
13051      assign(t1, getIReg(4,R_ESP));
13052      assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
13053      assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
13054      assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
13055      /* Get stuff off stack */
13056      putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
13057      /* set %CS (which is ignored anyway) */
13058      putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
13059      /* set %EFLAGS */
13060      set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
13061      /* goto new EIP value */
13062      jmp_treg(&dres, Ijk_Ret, t2);
13063      vassert(dres.whatNext == Dis_StopHere);
13064      DIP("iret (very kludgey)\n");
13065      break;
13066
13067   case 0xE8: /* CALL J4 */
13068      d32 = getUDisp32(delta); delta += 4;
13069      d32 += (guest_EIP_bbstart+delta);
13070      /* (guest_EIP_bbstart+delta) == return-to addr, d32 == call-to addr */
13071      if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
13072                                         && getIByte(delta) <= 0x5F) {
13073         /* Specially treat the position-independent-code idiom
13074                 call X
13075              X: popl %reg
13076            as
13077                 movl %eip, %reg.
13078            since this generates better code, but for no other reason. */
13079         Int archReg = getIByte(delta) - 0x58;
13080         /* vex_printf("-- fPIC thingy\n"); */
13081         putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
13082         delta++; /* Step over the POP */
13083         DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
13084      } else {
13085         /* The normal sequence for a call. */
13086         t1 = newTemp(Ity_I32);
13087         assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
13088         putIReg(4, R_ESP, mkexpr(t1));
13089         storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
13090         if (resteerOkFn( callback_opaque, (Addr32)d32 )) {
13091            /* follow into the call target. */
13092            dres.whatNext   = Dis_ResteerU;
13093            dres.continueAt = (Addr32)d32;
13094         } else {
13095            jmp_lit(&dres, Ijk_Call, d32);
13096            vassert(dres.whatNext == Dis_StopHere);
13097         }
13098         DIP("call 0x%x\n",d32);
13099      }
13100      break;
13101
13102//--    case 0xC8: /* ENTER */
13103//--       d32 = getUDisp16(eip); eip += 2;
13104//--       abyte = getIByte(delta); delta++;
13105//--
13106//--       vg_assert(sz == 4);
13107//--       vg_assert(abyte == 0);
13108//--
13109//--       t1 = newTemp(cb); t2 = newTemp(cb);
13110//--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
13111//--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
13112//--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
13113//--       uLiteral(cb, sz);
13114//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
13115//--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
13116//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
13117//--       if (d32) {
13118//--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
13119//--          uLiteral(cb, d32);
13120//--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
13121//--       }
13122//--       DIP("enter 0x%x, 0x%x", d32, abyte);
13123//--       break;
13124
13125   case 0xC9: /* LEAVE */
13126      vassert(sz == 4);
13127      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
13128      assign(t1, getIReg(4,R_EBP));
13129      /* First PUT ESP looks redundant, but need it because ESP must
13130         always be up-to-date for Memcheck to work... */
13131      putIReg(4, R_ESP, mkexpr(t1));
13132      assign(t2, loadLE(Ity_I32,mkexpr(t1)));
13133      putIReg(4, R_EBP, mkexpr(t2));
13134      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
13135      DIP("leave\n");
13136      break;
13137
13138   /* ---------------- Misc weird-ass insns --------------- */
13139
13140   case 0x27: /* DAA */
13141   case 0x2F: /* DAS */
13142   case 0x37: /* AAA */
13143   case 0x3F: /* AAS */
13144      /* An ugly implementation for some ugly instructions.  Oh
13145         well. */
13146      if (sz != 4) goto decode_failure;
13147      t1 = newTemp(Ity_I32);
13148      t2 = newTemp(Ity_I32);
13149      /* Make up a 32-bit value (t1), with the old value of AX in the
13150         bottom 16 bits, and the old OSZACP bitmask in the upper 16
13151         bits. */
13152      assign(t1,
13153             binop(Iop_16HLto32,
13154                   unop(Iop_32to16,
13155                        mk_x86g_calculate_eflags_all()),
13156                   getIReg(2, R_EAX)
13157            ));
13158      /* Call the helper fn, to get a new AX and OSZACP value, and
13159         poke both back into the guest state.  Also pass the helper
13160         the actual opcode so it knows which of the 4 instructions it
13161         is doing the computation for. */
13162      vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
13163      assign(t2,
13164              mkIRExprCCall(
13165                 Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
13166                 &x86g_calculate_daa_das_aaa_aas,
13167                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
13168            ));
13169     putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
13170
13171     stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13172     stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13173     stmt( IRStmt_Put( OFFB_CC_DEP1,
13174                       binop(Iop_And32,
13175                             binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
13176                             mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
13177                                    | X86G_CC_MASK_A | X86G_CC_MASK_Z
13178                                    | X86G_CC_MASK_S| X86G_CC_MASK_O )
13179                            )
13180                      )
13181         );
13182     /* Set NDEP even though it isn't used.  This makes redundant-PUT
13183        elimination of previous stores to this field work better. */
13184     stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13185     switch (opc) {
13186        case 0x27: DIP("daa\n"); break;
13187        case 0x2F: DIP("das\n"); break;
13188        case 0x37: DIP("aaa\n"); break;
13189        case 0x3F: DIP("aas\n"); break;
13190        default: vassert(0);
13191     }
13192     break;
13193
13194   case 0xD4: /* AAM */
13195   case 0xD5: /* AAD */
13196      d32 = getIByte(delta); delta++;
13197      if (sz != 4 || d32 != 10) goto decode_failure;
13198      t1 = newTemp(Ity_I32);
13199      t2 = newTemp(Ity_I32);
13200      /* Make up a 32-bit value (t1), with the old value of AX in the
13201         bottom 16 bits, and the old OSZACP bitmask in the upper 16
13202         bits. */
13203      assign(t1,
13204             binop(Iop_16HLto32,
13205                   unop(Iop_32to16,
13206                        mk_x86g_calculate_eflags_all()),
13207                   getIReg(2, R_EAX)
13208            ));
13209      /* Call the helper fn, to get a new AX and OSZACP value, and
13210         poke both back into the guest state.  Also pass the helper
13211         the actual opcode so it knows which of the 2 instructions it
13212         is doing the computation for. */
13213      assign(t2,
13214              mkIRExprCCall(
13215                 Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
13216                 &x86g_calculate_aad_aam,
13217                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
13218            ));
13219      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
13220
13221      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13222      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13223      stmt( IRStmt_Put( OFFB_CC_DEP1,
13224                        binop(Iop_And32,
13225                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
13226                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
13227                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
13228                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
13229                             )
13230                       )
13231          );
13232      /* Set NDEP even though it isn't used.  This makes
13233         redundant-PUT elimination of previous stores to this field
13234         work better. */
13235      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13236
13237      DIP(opc == 0xD4 ? "aam\n" : "aad\n");
13238      break;
13239
13240   /* ------------------------ CWD/CDQ -------------------- */
13241
13242   case 0x98: /* CBW/CWDE */
13243      if (sz == 4) {
13244         putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
13245         DIP("cwde\n");
13246      } else {
13247         vassert(sz == 2);
13248         putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
13249         DIP("cbw\n");
13250      }
13251      break;
13252
13253   case 0x99: /* CWD/CDQ */
13254      ty = szToITy(sz);
13255      putIReg(sz, R_EDX,
13256                  binop(mkSizedOp(ty,Iop_Sar8),
13257                        getIReg(sz, R_EAX),
13258                        mkU8(sz == 2 ? 15 : 31)) );
13259      DIP(sz == 2 ? "cwd\n" : "cdq\n");
13260      break;
13261
13262   /* ------------------------ FPU ops -------------------- */
13263
13264   case 0x9E: /* SAHF */
13265      codegen_SAHF();
13266      DIP("sahf\n");
13267      break;
13268
13269   case 0x9F: /* LAHF */
13270      codegen_LAHF();
13271      DIP("lahf\n");
13272      break;
13273
13274   case 0x9B: /* FWAIT */
13275      /* ignore? */
13276      DIP("fwait\n");
13277      break;
13278
13279   case 0xD8:
13280   case 0xD9:
13281   case 0xDA:
13282   case 0xDB:
13283   case 0xDC:
13284   case 0xDD:
13285   case 0xDE:
13286   case 0xDF: {
13287      Int  delta0    = delta;
13288      Bool decode_OK = False;
13289      delta = dis_FPU ( &decode_OK, sorb, delta );
13290      if (!decode_OK) {
13291         delta = delta0;
13292         goto decode_failure;
13293      }
13294      break;
13295   }
13296
13297   /* ------------------------ INC & DEC ------------------ */
13298
13299   case 0x40: /* INC eAX */
13300   case 0x41: /* INC eCX */
13301   case 0x42: /* INC eDX */
13302   case 0x43: /* INC eBX */
13303   case 0x44: /* INC eSP */
13304   case 0x45: /* INC eBP */
13305   case 0x46: /* INC eSI */
13306   case 0x47: /* INC eDI */
13307      vassert(sz == 2 || sz == 4);
13308      ty = szToITy(sz);
13309      t1 = newTemp(ty);
13310      assign( t1, binop(mkSizedOp(ty,Iop_Add8),
13311                        getIReg(sz, (UInt)(opc - 0x40)),
13312                        mkU(ty,1)) );
13313      setFlags_INC_DEC( True, t1, ty );
13314      putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
13315      DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
13316      break;
13317
13318   case 0x48: /* DEC eAX */
13319   case 0x49: /* DEC eCX */
13320   case 0x4A: /* DEC eDX */
13321   case 0x4B: /* DEC eBX */
13322   case 0x4C: /* DEC eSP */
13323   case 0x4D: /* DEC eBP */
13324   case 0x4E: /* DEC eSI */
13325   case 0x4F: /* DEC eDI */
13326      vassert(sz == 2 || sz == 4);
13327      ty = szToITy(sz);
13328      t1 = newTemp(ty);
13329      assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
13330                        getIReg(sz, (UInt)(opc - 0x48)),
13331                        mkU(ty,1)) );
13332      setFlags_INC_DEC( False, t1, ty );
13333      putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
13334      DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
13335      break;
13336
13337   /* ------------------------ INT ------------------------ */
13338
13339   case 0xCC: /* INT 3 */
13340      jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
13341      vassert(dres.whatNext == Dis_StopHere);
13342      DIP("int $0x3\n");
13343      break;
13344
13345   case 0xCD: /* INT imm8 */
13346      d32 = getIByte(delta); delta++;
13347
13348      /* For any of the cases where we emit a jump (that is, for all
13349         currently handled cases), it's important that all ArchRegs
13350         carry their up-to-date value at this point.  So we declare an
13351         end-of-block here, which forces any TempRegs caching ArchRegs
13352         to be flushed. */
13353
      /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
         restart of this instruction (hence the "-2" two lines below,
         to get the restart EIP to be this instruction).  This is
         probably Linux-specific and it would be more correct to only
         do this if the VexAbiInfo says that is what we should do.
         This used to handle just 0x40-0x43; Jikes RVM uses a larger
         range (0x3F-0x49), and this allows some slack as well. */
13361      if (d32 >= 0x3F && d32 <= 0x4F) {
13362         jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
13363         vassert(dres.whatNext == Dis_StopHere);
13364         DIP("int $0x%x\n", d32);
13365         break;
13366      }
13367
      /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
         (darwin syscalls), int $0x91 (Solaris syscalls) and int $0xD2
         (Solaris fasttrap syscalls).  As part of this, note where we
         are, so we can back up the guest to this point if the syscall
         needs to be restarted. */
13373      IRJumpKind jump_kind;
13374      switch (d32) {
13375      case 0x80:
13376         jump_kind = Ijk_Sys_int128;
13377         break;
13378      case 0x81:
13379         jump_kind = Ijk_Sys_int129;
13380         break;
13381      case 0x82:
13382         jump_kind = Ijk_Sys_int130;
13383         break;
13384      case 0x91:
13385         jump_kind = Ijk_Sys_int145;
13386         break;
13387      case 0xD2:
13388         jump_kind = Ijk_Sys_int210;
13389         break;
13390      default:
13391         /* none of the above */
13392         goto decode_failure;
13393      }
13394
13395      stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13396                        mkU32(guest_EIP_curr_instr) ) );
13397      jmp_lit(&dres, jump_kind, ((Addr32)guest_EIP_bbstart)+delta);
13398      vassert(dres.whatNext == Dis_StopHere);
13399      DIP("int $0x%x\n", d32);
13400      break;
13401
13402   /* ------------------------ Jcond, byte offset --------- */
13403
13404   case 0xEB: /* Jb (jump, byte offset) */
13405      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13406      delta++;
13407      if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
13408         dres.whatNext   = Dis_ResteerU;
13409         dres.continueAt = (Addr32)d32;
13410      } else {
13411         jmp_lit(&dres, Ijk_Boring, d32);
13412         vassert(dres.whatNext == Dis_StopHere);
13413      }
13414      DIP("jmp-8 0x%x\n", d32);
13415      break;
13416
13417   case 0xE9: /* Jv (jump, 16/32 offset) */
13418      vassert(sz == 4); /* JRS added 2004 July 11 */
13419      d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
13420      delta += sz;
13421      if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
13422         dres.whatNext   = Dis_ResteerU;
13423         dres.continueAt = (Addr32)d32;
13424      } else {
13425         jmp_lit(&dres, Ijk_Boring, d32);
13426         vassert(dres.whatNext == Dis_StopHere);
13427      }
13428      DIP("jmp 0x%x\n", d32);
13429      break;
13430
13431   case 0x70:
13432   case 0x71:
13433   case 0x72: /* JBb/JNAEb (jump below) */
13434   case 0x73: /* JNBb/JAEb (jump not below) */
13435   case 0x74: /* JZb/JEb (jump zero) */
13436   case 0x75: /* JNZb/JNEb (jump not zero) */
13437   case 0x76: /* JBEb/JNAb (jump below or equal) */
13438   case 0x77: /* JNBEb/JAb (jump not below or equal) */
13439   case 0x78: /* JSb (jump negative) */
   case 0x79: /* JNSb (jump not negative) */
13441   case 0x7A: /* JP (jump parity even) */
13442   case 0x7B: /* JNP/JPO (jump parity odd) */
13443   case 0x7C: /* JLb/JNGEb (jump less) */
13444   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
13445   case 0x7E: /* JLEb/JNGb (jump less or equal) */
13446   case 0x7F: /* JGb/JNLEb (jump greater) */
13447    { Int    jmpDelta;
13448      const HChar* comment  = "";
13449      jmpDelta = (Int)getSDisp8(delta);
13450      vassert(-128 <= jmpDelta && jmpDelta < 128);
13451      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
13452      delta++;
13453      if (resteerCisOk
13454          && vex_control.guest_chase_cond
13455          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13456          && jmpDelta < 0
13457          && resteerOkFn( callback_opaque, (Addr32)d32) ) {
13458         /* Speculation: assume this backward branch is taken.  So we
13459            need to emit a side-exit to the insn following this one,
13460            on the negation of the condition, and continue at the
13461            branch target address (d32).  If we wind up back at the
13462            first instruction of the trace, just stop; it's better to
13463            let the IR loop unroller handle that case. */
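         /* For example, for a backward "jl <target>" this emits the
            side-exit "if (!l) goto <insn after the jump>" and then
            carries on translating at <target>.  (Flipping bit 0 of the
            condition code negates it, since x86 condition codes come
            in complementary pairs.) */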
13464         stmt( IRStmt_Exit(
13465                  mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
13466                  Ijk_Boring,
13467                  IRConst_U32(guest_EIP_bbstart+delta),
13468                  OFFB_EIP ) );
13469         dres.whatNext   = Dis_ResteerC;
13470         dres.continueAt = (Addr32)d32;
13471         comment = "(assumed taken)";
13472      }
13473      else
13474      if (resteerCisOk
13475          && vex_control.guest_chase_cond
13476          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13477          && jmpDelta >= 0
13478          && resteerOkFn( callback_opaque,
13479                          (Addr32)(guest_EIP_bbstart+delta)) ) {
13480         /* Speculation: assume this forward branch is not taken.  So
13481            we need to emit a side-exit to d32 (the dest) and continue
13482            disassembling at the insn immediately following this
13483            one. */
13484         stmt( IRStmt_Exit(
13485                  mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
13486                  Ijk_Boring,
13487                  IRConst_U32(d32),
13488                  OFFB_EIP ) );
13489         dres.whatNext   = Dis_ResteerC;
13490         dres.continueAt = guest_EIP_bbstart + delta;
13491         comment = "(assumed not taken)";
13492      }
13493      else {
13494         /* Conservative default translation - end the block at this
13495            point. */
13496         jcc_01( &dres, (X86Condcode)(opc - 0x70),
13497                 (Addr32)(guest_EIP_bbstart+delta), d32);
13498         vassert(dres.whatNext == Dis_StopHere);
13499      }
13500      DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
13501      break;
13502    }
13503
13504   case 0xE3: /* JECXZ (for JCXZ see above) */
13505      if (sz != 4) goto decode_failure;
13506      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13507      delta ++;
13508      stmt( IRStmt_Exit(
13509               binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
13510            Ijk_Boring,
13511            IRConst_U32(d32),
13512            OFFB_EIP
13513          ));
13514      DIP("jecxz 0x%x\n", d32);
13515      break;
13516
13517   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
13518   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
13519   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
13520    { /* Again, the docs say this uses ECX/CX as a count depending on
13521         the address size override, not the operand one.  Since we
13522         don't handle address size overrides, I guess that means
13523         ECX. */
13524      IRExpr* zbit  = NULL;
13525      IRExpr* count = NULL;
13526      IRExpr* cond  = NULL;
13527      const HChar* xtra = NULL;
13528
13529      if (sz != 4) goto decode_failure;
13530      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13531      delta++;
13532      putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
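      /* Note that the decremented value is what gets tested below,
         matching the hardware: LOOP with ECX initially 1 falls
         through, while ECX initially 0 wraps to 0xFFFFFFFF and the
         jump is (still) taken. */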
13533
13534      count = getIReg(4,R_ECX);
13535      cond = binop(Iop_CmpNE32, count, mkU32(0));
13536      switch (opc) {
13537         case 0xE2:
13538            xtra = "";
13539            break;
13540         case 0xE1:
13541            xtra = "e";
13542            zbit = mk_x86g_calculate_condition( X86CondZ );
            cond = mkAnd1(cond, zbit);
13544            break;
13545         case 0xE0:
13546            xtra = "ne";
13547            zbit = mk_x86g_calculate_condition( X86CondNZ );
            cond = mkAnd1(cond, zbit);
13549            break;
13550         default:
            vassert(0);
13552      }
13553      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
13554
13555      DIP("loop%s 0x%x\n", xtra, d32);
13556      break;
13557    }
13558
13559   /* ------------------------ IMUL ----------------------- */
13560
13561   case 0x69: /* IMUL Iv, Ev, Gv */
13562      delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
13563      break;
13564   case 0x6B: /* IMUL Ib, Ev, Gv */
13565      delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
13566      break;
13567
13568   /* ------------------------ MOV ------------------------ */
13569
13570   case 0x88: /* MOV Gb,Eb */
13571      delta = dis_mov_G_E(sorb, 1, delta);
13572      break;
13573
13574   case 0x89: /* MOV Gv,Ev */
13575      delta = dis_mov_G_E(sorb, sz, delta);
13576      break;
13577
13578   case 0x8A: /* MOV Eb,Gb */
13579      delta = dis_mov_E_G(sorb, 1, delta);
13580      break;
13581
13582   case 0x8B: /* MOV Ev,Gv */
13583      delta = dis_mov_E_G(sorb, sz, delta);
13584      break;
13585
13586   case 0x8D: /* LEA M,Gv */
13587      if (sz != 4)
13588         goto decode_failure;
13589      modrm = getIByte(delta);
13590      if (epartIsReg(modrm))
13591         goto decode_failure;
13592      /* NOTE!  this is the one place where a segment override prefix
13593         has no effect on the address calculation.  Therefore we pass
13594         zero instead of sorb here. */
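      /* For example, "leal %fs:4(%eax),%ebx" just computes %eax + 4;
         the %fs override is irrelevant because LEA performs no memory
         access. */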
13595      addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
13596      delta += alen;
13597      putIReg(sz, gregOfRM(modrm), mkexpr(addr));
13598      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
13599                            nameIReg(sz,gregOfRM(modrm)));
13600      break;
13601
13602   case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
13603      delta = dis_mov_Sw_Ew(sorb, sz, delta);
13604      break;
13605
13606   case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
13607      delta = dis_mov_Ew_Sw(sorb, delta);
13608      break;
13609
13610   case 0xA0: /* MOV Ob,AL */
13611      sz = 1;
13612      /* Fall through ... */
13613   case 0xA1: /* MOV Ov,eAX */
13614      d32 = getUDisp32(delta); delta += 4;
13615      ty = szToITy(sz);
13616      addr = newTemp(Ity_I32);
13617      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13618      putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
13619      DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
13620                                d32, nameIReg(sz,R_EAX));
13621      break;
13622
   case 0xA2: /* MOV AL,Ob */
13624      sz = 1;
13625      /* Fall through ... */
13626   case 0xA3: /* MOV eAX,Ov */
13627      d32 = getUDisp32(delta); delta += 4;
13628      ty = szToITy(sz);
13629      addr = newTemp(Ity_I32);
13630      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13631      storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
13632      DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
13633                                sorbTxt(sorb), d32);
13634      break;
13635
13636   case 0xB0: /* MOV imm,AL */
13637   case 0xB1: /* MOV imm,CL */
13638   case 0xB2: /* MOV imm,DL */
13639   case 0xB3: /* MOV imm,BL */
13640   case 0xB4: /* MOV imm,AH */
13641   case 0xB5: /* MOV imm,CH */
13642   case 0xB6: /* MOV imm,DH */
13643   case 0xB7: /* MOV imm,BH */
13644      d32 = getIByte(delta); delta += 1;
13645      putIReg(1, opc-0xB0, mkU8(d32));
13646      DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
13647      break;
13648
13649   case 0xB8: /* MOV imm,eAX */
13650   case 0xB9: /* MOV imm,eCX */
13651   case 0xBA: /* MOV imm,eDX */
13652   case 0xBB: /* MOV imm,eBX */
13653   case 0xBC: /* MOV imm,eSP */
13654   case 0xBD: /* MOV imm,eBP */
13655   case 0xBE: /* MOV imm,eSI */
13656   case 0xBF: /* MOV imm,eDI */
13657      d32 = getUDisp(sz,delta); delta += sz;
13658      putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
13659      DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
13660      break;
13661
13662   case 0xC6: /* C6 /0 = MOV Ib,Eb */
13663      sz = 1;
13664      goto maybe_do_Mov_I_E;
13665   case 0xC7: /* C7 /0 = MOV Iv,Ev */
13666      goto maybe_do_Mov_I_E;
13667
13668   maybe_do_Mov_I_E:
13669      modrm = getIByte(delta);
13670      if (gregOfRM(modrm) == 0) {
13671         if (epartIsReg(modrm)) {
13672            delta++; /* mod/rm byte */
13673            d32 = getUDisp(sz,delta); delta += sz;
13674            putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
13675            DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
13676                                     nameIReg(sz,eregOfRM(modrm)));
13677         } else {
13678            addr = disAMode ( &alen, sorb, delta, dis_buf );
13679            delta += alen;
13680            d32 = getUDisp(sz,delta); delta += sz;
13681            storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
13682            DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
13683         }
13684         break;
13685      }
13686      goto decode_failure;
13687
13688   /* ------------------------ opl imm, A ----------------- */
13689
13690   case 0x04: /* ADD Ib, AL */
13691      delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
13692      break;
13693   case 0x05: /* ADD Iv, eAX */
13694      delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
13695      break;
13696
13697   case 0x0C: /* OR Ib, AL */
13698      delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
13699      break;
13700   case 0x0D: /* OR Iv, eAX */
13701      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
13702      break;
13703
13704   case 0x14: /* ADC Ib, AL */
13705      delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
13706      break;
13707   case 0x15: /* ADC Iv, eAX */
13708      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
13709      break;
13710
13711   case 0x1C: /* SBB Ib, AL */
13712      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
13713      break;
13714   case 0x1D: /* SBB Iv, eAX */
13715      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
13716      break;
13717
13718   case 0x24: /* AND Ib, AL */
13719      delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
13720      break;
13721   case 0x25: /* AND Iv, eAX */
13722      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
13723      break;
13724
13725   case 0x2C: /* SUB Ib, AL */
13726      delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
13727      break;
13728   case 0x2D: /* SUB Iv, eAX */
13729      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
13730      break;
13731
13732   case 0x34: /* XOR Ib, AL */
13733      delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
13734      break;
13735   case 0x35: /* XOR Iv, eAX */
13736      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
13737      break;
13738
13739   case 0x3C: /* CMP Ib, AL */
13740      delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
13741      break;
13742   case 0x3D: /* CMP Iv, eAX */
13743      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
13744      break;
13745
13746   case 0xA8: /* TEST Ib, AL */
13747      delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
13748      break;
13749   case 0xA9: /* TEST Iv, eAX */
13750      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
13751      break;
13752
13753   /* ------------------------ opl Ev, Gv ----------------- */
13754
13755   case 0x02: /* ADD Eb,Gb */
13756      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
13757      break;
13758   case 0x03: /* ADD Ev,Gv */
13759      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
13760      break;
13761
13762   case 0x0A: /* OR Eb,Gb */
13763      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
13764      break;
13765   case 0x0B: /* OR Ev,Gv */
13766      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
13767      break;
13768
13769   case 0x12: /* ADC Eb,Gb */
13770      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
13771      break;
13772   case 0x13: /* ADC Ev,Gv */
13773      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
13774      break;
13775
13776   case 0x1A: /* SBB Eb,Gb */
13777      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
13778      break;
13779   case 0x1B: /* SBB Ev,Gv */
13780      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
13781      break;
13782
13783   case 0x22: /* AND Eb,Gb */
13784      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
13785      break;
13786   case 0x23: /* AND Ev,Gv */
13787      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
13788      break;
13789
13790   case 0x2A: /* SUB Eb,Gb */
13791      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
13792      break;
13793   case 0x2B: /* SUB Ev,Gv */
13794      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
13795      break;
13796
13797   case 0x32: /* XOR Eb,Gb */
13798      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
13799      break;
13800   case 0x33: /* XOR Ev,Gv */
13801      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
13802      break;
13803
13804   case 0x3A: /* CMP Eb,Gb */
13805      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
13806      break;
13807   case 0x3B: /* CMP Ev,Gv */
13808      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
13809      break;
13810
13811   case 0x84: /* TEST Eb,Gb */
13812      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
13813      break;
13814   case 0x85: /* TEST Ev,Gv */
13815      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
13816      break;
13817
13818   /* ------------------------ opl Gv, Ev ----------------- */
13819
13820   case 0x00: /* ADD Gb,Eb */
13821      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13822                            Iop_Add8, True, 1, delta, "add" );
13823      break;
13824   case 0x01: /* ADD Gv,Ev */
13825      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13826                            Iop_Add8, True, sz, delta, "add" );
13827      break;
13828
13829   case 0x08: /* OR Gb,Eb */
13830      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13831                            Iop_Or8, True, 1, delta, "or" );
13832      break;
13833   case 0x09: /* OR Gv,Ev */
13834      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13835                            Iop_Or8, True, sz, delta, "or" );
13836      break;
13837
13838   case 0x10: /* ADC Gb,Eb */
13839      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13840                            Iop_Add8, True, 1, delta, "adc" );
13841      break;
13842   case 0x11: /* ADC Gv,Ev */
13843      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13844                            Iop_Add8, True, sz, delta, "adc" );
13845      break;
13846
13847   case 0x18: /* SBB Gb,Eb */
13848      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13849                            Iop_Sub8, True, 1, delta, "sbb" );
13850      break;
13851   case 0x19: /* SBB Gv,Ev */
13852      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13853                            Iop_Sub8, True, sz, delta, "sbb" );
13854      break;
13855
13856   case 0x20: /* AND Gb,Eb */
13857      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13858                            Iop_And8, True, 1, delta, "and" );
13859      break;
13860   case 0x21: /* AND Gv,Ev */
13861      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13862                            Iop_And8, True, sz, delta, "and" );
13863      break;
13864
13865   case 0x28: /* SUB Gb,Eb */
13866      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13867                            Iop_Sub8, True, 1, delta, "sub" );
13868      break;
13869   case 0x29: /* SUB Gv,Ev */
13870      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13871                            Iop_Sub8, True, sz, delta, "sub" );
13872      break;
13873
13874   case 0x30: /* XOR Gb,Eb */
13875      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13876                            Iop_Xor8, True, 1, delta, "xor" );
13877      break;
13878   case 0x31: /* XOR Gv,Ev */
13879      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13880                            Iop_Xor8, True, sz, delta, "xor" );
13881      break;
13882
13883   case 0x38: /* CMP Gb,Eb */
13884      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13885                            Iop_Sub8, False, 1, delta, "cmp" );
13886      break;
13887   case 0x39: /* CMP Gv,Ev */
13888      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13889                            Iop_Sub8, False, sz, delta, "cmp" );
13890      break;
13891
13892   /* ------------------------ POP ------------------------ */
13893
13894   case 0x58: /* POP eAX */
13895   case 0x59: /* POP eCX */
13896   case 0x5A: /* POP eDX */
13897   case 0x5B: /* POP eBX */
13898   case 0x5D: /* POP eBP */
13899   case 0x5E: /* POP eSI */
13900   case 0x5F: /* POP eDI */
13901   case 0x5C: /* POP eSP */
13902      vassert(sz == 2 || sz == 4);
13903      t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
13904      assign(t2, getIReg(4, R_ESP));
13905      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
13906      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
13907      putIReg(sz, opc-0x58, mkexpr(t1));
13908      DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
13909      break;
13910
13911   case 0x9D: /* POPF */
13912      vassert(sz == 2 || sz == 4);
13913      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
13914      assign(t2, getIReg(4, R_ESP));
13915      assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
13916      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
13917
13918      /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
         value in t1. */
13920      set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
13921                                 ((Addr32)guest_EIP_bbstart)+delta );
13922
13923      DIP("popf%c\n", nameISize(sz));
13924      break;
13925
13926   case 0x61: /* POPA */
13927      /* This is almost certainly wrong for sz==2.  So ... */
13928      if (sz != 4) goto decode_failure;
13929
13930      /* t5 is the old %ESP value. */
13931      t5 = newTemp(Ity_I32);
13932      assign( t5, getIReg(4, R_ESP) );
13933
13934      /* Reload all the registers, except %esp. */
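      /* Frame layout, relative to the pre-POPA %ESP (t5): +28 EAX,
         +24 ECX, +20 EDX, +16 EBX, +12 saved ESP (ignored), +8 EBP,
         +4 ESI, +0 EDI.  Afterwards %ESP is advanced past all eight
         slots. */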
13935      putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
13936      putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
13937      putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
13938      putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
13939      /* ignore saved %ESP */
13940      putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
13941      putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
13942      putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
13943
13944      /* and move %ESP back up */
13945      putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
13946
13947      DIP("popa%c\n", nameISize(sz));
13948      break;
13949
13950   case 0x8F: /* POPL/POPW m32 */
13951     { Int    len;
13952       UChar  rm = getIByte(delta);
13953
       /* make sure this really is the memory form of POP (8F /0) */
13955       if (epartIsReg(rm) || gregOfRM(rm) != 0)
13956          goto decode_failure;
       /* and that the operand size is supported */
13958       if (sz != 4 && sz != 2)
13959          goto decode_failure;
13960       ty = szToITy(sz);
13961
13962       t1 = newTemp(Ity_I32); /* stack address */
13963       t3 = newTemp(ty); /* data */
13964       /* set t1 to ESP: t1 = ESP */
13965       assign( t1, getIReg(4, R_ESP) );
13966       /* load M[ESP] to virtual register t3: t3 = M[t1] */
13967       assign( t3, loadLE(ty, mkexpr(t1)) );
13968
13969       /* increase ESP; must be done before the STORE.  Intel manual says:
13970            If the ESP register is used as a base register for addressing
13971            a destination operand in memory, the POP instruction computes
13972            the effective address of the operand after it increments the
13973            ESP register.
13974       */
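       /* Concretely: for "popl 0(%esp)" with ESP initially E, the
          value is loaded from [E], ESP becomes E+4, and the store then
          goes to [E+4] -- which is why ESP is updated before disAMode
          is called below. */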
13975       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
13976
13977       /* resolve MODR/M */
13978       addr = disAMode ( &len, sorb, delta, dis_buf);
13979       storeLE( mkexpr(addr), mkexpr(t3) );
13980
13981       DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
13982
13983       delta += len;
13984       break;
13985     }
13986
13987   case 0x1F: /* POP %DS */
13988      dis_pop_segreg( R_DS, sz ); break;
13989   case 0x07: /* POP %ES */
13990      dis_pop_segreg( R_ES, sz ); break;
13991   case 0x17: /* POP %SS */
13992      dis_pop_segreg( R_SS, sz ); break;
13993
13994   /* ------------------------ PUSH ----------------------- */
13995
13996   case 0x50: /* PUSH eAX */
13997   case 0x51: /* PUSH eCX */
13998   case 0x52: /* PUSH eDX */
13999   case 0x53: /* PUSH eBX */
14000   case 0x55: /* PUSH eBP */
14001   case 0x56: /* PUSH eSI */
14002   case 0x57: /* PUSH eDI */
14003   case 0x54: /* PUSH eSP */
14004      /* This is the Right Way, in that the value to be pushed is
14005         established before %esp is changed, so that pushl %esp
14006         correctly pushes the old value. */
14007      vassert(sz == 2 || sz == 4);
14008      ty = sz==2 ? Ity_I16 : Ity_I32;
14009      t1 = newTemp(ty); t2 = newTemp(Ity_I32);
14010      assign(t1, getIReg(sz, opc-0x50));
14011      assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
14012      putIReg(4, R_ESP, mkexpr(t2) );
14013      storeLE(mkexpr(t2),mkexpr(t1));
14014      DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
14015      break;
14016
14017
14018   case 0x68: /* PUSH Iv */
14019      d32 = getUDisp(sz,delta); delta += sz;
14020      goto do_push_I;
14021   case 0x6A: /* PUSH Ib, sign-extended to sz */
14022      d32 = getSDisp8(delta); delta += 1;
14023      goto do_push_I;
14024   do_push_I:
14025      ty = szToITy(sz);
14026      t1 = newTemp(Ity_I32); t2 = newTemp(ty);
14027      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
14028      putIReg(4, R_ESP, mkexpr(t1) );
14029      /* stop mkU16 asserting if d32 is a negative 16-bit number
14030         (bug #132813) */
14031      if (ty == Ity_I16)
14032         d32 &= 0xFFFF;
14033      storeLE( mkexpr(t1), mkU(ty,d32) );
14034      DIP("push%c $0x%x\n", nameISize(sz), d32);
14035      break;
14036
14037   case 0x9C: /* PUSHF */ {
14038      vassert(sz == 2 || sz == 4);
14039
14040      t1 = newTemp(Ity_I32);
14041      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
14042      putIReg(4, R_ESP, mkexpr(t1) );
14043
14044      /* Calculate OSZACP, and patch in fixed fields as per
14045         Intel docs.
14046         - bit 1 is always 1
14047         - bit 9 is Interrupt Enable (should always be 1 in user mode?)
14048      */
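      /* So the value finally stored is, in outline:
            eflags_OSZACP | (1<<1) | (1<<9)       -- t2
          | (DFLAG  & (1<<10))                    -- t3: DF
          | ((IDFLAG << 21) & (1<<21))            -- t4: ID
          | ((ACFLAG << 18) & (1<<18))            -- t5: AC
      */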
14049      t2 = newTemp(Ity_I32);
14050      assign( t2, binop(Iop_Or32,
14051                        mk_x86g_calculate_eflags_all(),
14052                        mkU32( (1<<1)|(1<<9) ) ));
14053
      /* Patch in the D flag.  The guest DFLAG word is either 1 (D=0)
         or all-ones (D=1), so bit 10 of it is exactly the value needed
         for %EFLAGS bit 10. */
14056      t3 = newTemp(Ity_I32);
14057      assign( t3, binop(Iop_Or32,
14058                        mkexpr(t2),
14059                        binop(Iop_And32,
14060                              IRExpr_Get(OFFB_DFLAG,Ity_I32),
14061                              mkU32(1<<10)))
14062            );
14063
14064      /* And patch in the ID flag. */
14065      t4 = newTemp(Ity_I32);
14066      assign( t4, binop(Iop_Or32,
14067                        mkexpr(t3),
14068                        binop(Iop_And32,
14069                              binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
14070                                               mkU8(21)),
14071                              mkU32(1<<21)))
14072            );
14073
14074      /* And patch in the AC flag. */
14075      t5 = newTemp(Ity_I32);
14076      assign( t5, binop(Iop_Or32,
14077                        mkexpr(t4),
14078                        binop(Iop_And32,
14079                              binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
14080                                               mkU8(18)),
14081                              mkU32(1<<18)))
14082            );
14083
14084      /* if sz==2, the stored value needs to be narrowed. */
14085      if (sz == 2)
14086        storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
14087      else
14088        storeLE( mkexpr(t1), mkexpr(t5) );
14089
14090      DIP("pushf%c\n", nameISize(sz));
14091      break;
14092   }
14093
14094   case 0x60: /* PUSHA */
14095      /* This is almost certainly wrong for sz==2.  So ... */
14096      if (sz != 4) goto decode_failure;
14097
14098      /* This is the Right Way, in that the value to be pushed is
14099         established before %esp is changed, so that pusha
14100         correctly pushes the old %esp value.  New value of %esp is
14101         pushed at start. */
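      /* Resulting frame, relative to the new %ESP (t5): +28 EAX,
         +24 ECX, +20 EDX, +16 EBX, +12 original ESP, +8 EBP, +4 ESI,
         +0 EDI. */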
14102      /* t0 is the %ESP value we're going to push. */
14103      t0 = newTemp(Ity_I32);
14104      assign( t0, getIReg(4, R_ESP) );
14105
14106      /* t5 will be the new %ESP value. */
14107      t5 = newTemp(Ity_I32);
14108      assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
14109
14110      /* Update guest state before prodding memory. */
14111      putIReg(4, R_ESP, mkexpr(t5));
14112
14113      /* Dump all the registers. */
14114      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
14115      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
14116      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
14117      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
14118      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
14119      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
14120      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
14121      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
14122
14123      DIP("pusha%c\n", nameISize(sz));
14124      break;
14125
14126   case 0x0E: /* PUSH %CS */
14127      dis_push_segreg( R_CS, sz ); break;
14128   case 0x1E: /* PUSH %DS */
14129      dis_push_segreg( R_DS, sz ); break;
14130   case 0x06: /* PUSH %ES */
14131      dis_push_segreg( R_ES, sz ); break;
14132   case 0x16: /* PUSH %SS */
14133      dis_push_segreg( R_SS, sz ); break;
14134
14135   /* ------------------------ SCAS et al ----------------- */
14136
14137   case 0xA4: /* MOVS, no REP prefix */
14138   case 0xA5:
14139      if (sorb != 0)
14140         goto decode_failure; /* else dis_string_op asserts */
14141      dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
14142      break;
14143
   case 0xA6: /* CMPS, no REP prefix */
   case 0xA7:
14146      if (sorb != 0)
14147         goto decode_failure; /* else dis_string_op asserts */
14148      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
14149      break;
14150
14151   case 0xAA: /* STOS, no REP prefix */
14152   case 0xAB:
14153      if (sorb != 0)
14154         goto decode_failure; /* else dis_string_op asserts */
14155      dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
14156      break;
14157
14158   case 0xAC: /* LODS, no REP prefix */
14159   case 0xAD:
14160      if (sorb != 0)
14161         goto decode_failure; /* else dis_string_op asserts */
14162      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
14163      break;
14164
14165   case 0xAE: /* SCAS, no REP prefix */
14166   case 0xAF:
14167      if (sorb != 0)
14168         goto decode_failure; /* else dis_string_op asserts */
14169      dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
14170      break;
14171
14172
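   /* The guest DFLAG word holds +1 when D=0 and all-ones (-1) when
      D=1, as the two stores below show; the string-op code elsewhere
      in this file then uses it as a signed step when advancing
      %ESI/%EDI. */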
14173   case 0xFC: /* CLD */
14174      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
14175      DIP("cld\n");
14176      break;
14177
14178   case 0xFD: /* STD */
14179      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
14180      DIP("std\n");
14181      break;
14182
14183   case 0xF8: /* CLC */
14184   case 0xF9: /* STC */
14185   case 0xF5: /* CMC */
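      /* All three are handled the same way: materialise the full
         eflags value, adjust just the C bit (clear, set or flip it),
         and write the result back through the OP_COPY thunk, leaving
         every other flag unchanged. */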
14186      t0 = newTemp(Ity_I32);
14187      t1 = newTemp(Ity_I32);
14188      assign( t0, mk_x86g_calculate_eflags_all() );
14189      switch (opc) {
14190         case 0xF8:
14191            assign( t1, binop(Iop_And32, mkexpr(t0),
14192                                         mkU32(~X86G_CC_MASK_C)));
14193            DIP("clc\n");
14194            break;
14195         case 0xF9:
14196            assign( t1, binop(Iop_Or32, mkexpr(t0),
14197                                        mkU32(X86G_CC_MASK_C)));
14198            DIP("stc\n");
14199            break;
14200         case 0xF5:
14201            assign( t1, binop(Iop_Xor32, mkexpr(t0),
14202                                         mkU32(X86G_CC_MASK_C)));
14203            DIP("cmc\n");
14204            break;
14205         default:
14206            vpanic("disInstr(x86)(clc/stc/cmc)");
14207      }
14208      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
14209      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
14210      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
14211      /* Set NDEP even though it isn't used.  This makes redundant-PUT
14212         elimination of previous stores to this field work better. */
14213      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
14214      break;
14215
14216   case 0xD6: /* SALC */
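      /* Undocumented "set AL from carry": AL becomes 0xFF if CF is set
         and 0x00 otherwise.  The carry bit is moved to bit 31 and then
         arithmetically shifted back down, replicating it across the
         whole word. */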
14217      t0 = newTemp(Ity_I32);
14218      t1 = newTemp(Ity_I32);
14219      assign( t0,  binop(Iop_And32,
14220                         mk_x86g_calculate_eflags_c(),
14221                         mkU32(1)) );
14222      assign( t1, binop(Iop_Sar32,
14223                        binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
14224                        mkU8(31)) );
14225      putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
14226      DIP("salc\n");
14227      break;
14228
14229   /* REPNE prefix insn */
14230   case 0xF2: {
14231      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
14232      if (sorb != 0) goto decode_failure;
14233      abyte = getIByte(delta); delta++;
14234
14235      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
14236
14237      switch (abyte) {
14238      /* According to the Intel manual, "repne movs" should never occur, but
14239       * in practice it has happened, so allow for it here... */
14240      case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
14241      case 0xA5:
14242         dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
14243                             guest_EIP_bbstart+delta, "repne movs" );
14244         break;
14245
14246      case 0xA6: sz = 1;   /* REPNE CMP<sz> */
14247      case 0xA7:
14248         dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
14249                             guest_EIP_bbstart+delta, "repne cmps" );
14250         break;
14251
14252      case 0xAA: sz = 1;   /* REPNE STOS<sz> */
14253      case 0xAB:
14254         dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
14255                             guest_EIP_bbstart+delta, "repne stos" );
14256         break;
14257
14258      case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
14259      case 0xAF:
14260         dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
14261                             guest_EIP_bbstart+delta, "repne scas" );
14262         break;
14263
14264      default:
14265         goto decode_failure;
14266      }
14267      break;
14268   }
14269
14270   /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
14271      for the rest, it means REP) */
14272   case 0xF3: {
14273      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
14274      abyte = getIByte(delta); delta++;
14275
14276      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
14277
14278      if (sorb != 0 && abyte != 0x0F) goto decode_failure;
14279
14280      switch (abyte) {
14281      case 0x0F:
14282         switch (getIByte(delta)) {
14283         /* On older CPUs, TZCNT behaves the same as BSF.  */
14284         case 0xBC: /* REP BSF Gv,Ev */
14285            delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
14286            break;
14287         /* On older CPUs, LZCNT behaves the same as BSR.  */
14288         case 0xBD: /* REP BSR Gv,Ev */
14289            delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
14290            break;
14291         default:
14292            goto decode_failure;
14293         }
14294         break;
14295
14296      case 0xA4: sz = 1;   /* REP MOVS<sz> */
14297      case 0xA5:
14298         dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
14299                             guest_EIP_bbstart+delta, "rep movs" );
14300         break;
14301
14302      case 0xA6: sz = 1;   /* REPE CMP<sz> */
14303      case 0xA7:
14304         dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
14305                             guest_EIP_bbstart+delta, "repe cmps" );
14306         break;
14307
14308      case 0xAA: sz = 1;   /* REP STOS<sz> */
14309      case 0xAB:
14310         dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
14311                             guest_EIP_bbstart+delta, "rep stos" );
14312         break;
14313
14314      case 0xAC: sz = 1;   /* REP LODS<sz> */
14315      case 0xAD:
14316         dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
14317                             guest_EIP_bbstart+delta, "rep lods" );
14318         break;
14319
14320      case 0xAE: sz = 1;   /* REPE SCAS<sz> */
14321      case 0xAF:
14322         dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
14323                             guest_EIP_bbstart+delta, "repe scas" );
14324         break;
14325
14326      case 0x90:           /* REP NOP (PAUSE) */
14327         /* a hint to the P4 re spin-wait loop */
14328         DIP("rep nop (P4 pause)\n");
14329         /* "observe" the hint.  The Vex client needs to be careful not
14330            to cause very long delays as a result, though. */
14331         jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
14332         vassert(dres.whatNext == Dis_StopHere);
14333         break;
14334
14335      case 0xC3:           /* REP RET -- same as normal ret? */
14336         dis_ret(&dres, 0);
14337         DIP("rep ret\n");
14338         break;
14339
14340      default:
14341         goto decode_failure;
14342      }
14343      break;
14344   }
14345
14346   /* ------------------------ XCHG ----------------------- */
14347
14348   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
14349      prefix; hence it must be translated with an IRCAS (at least, the
14350      memory variant). */
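   /* For the memory form below, the freshly loaded old value is used
      as the expected value for casLE, so the CAS only fails if another
      thread changed the location in the meantime; in that case
      execution is restarted at this instruction (note the
      guest_EIP_curr_instr argument to casLE) and the exchange is
      retried. */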
14351   case 0x86: /* XCHG Gb,Eb */
14352      sz = 1;
14353      /* Fall through ... */
14354   case 0x87: /* XCHG Gv,Ev */
14355      modrm = getIByte(delta);
14356      ty = szToITy(sz);
14357      t1 = newTemp(ty); t2 = newTemp(ty);
14358      if (epartIsReg(modrm)) {
14359         assign(t1, getIReg(sz, eregOfRM(modrm)));
14360         assign(t2, getIReg(sz, gregOfRM(modrm)));
14361         putIReg(sz, gregOfRM(modrm), mkexpr(t1));
14362         putIReg(sz, eregOfRM(modrm), mkexpr(t2));
14363         delta++;
14364         DIP("xchg%c %s, %s\n",
14365             nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
14366                            nameIReg(sz,eregOfRM(modrm)));
14367      } else {
14368         *expect_CAS = True;
14369         addr = disAMode ( &alen, sorb, delta, dis_buf );
14370         assign( t1, loadLE(ty,mkexpr(addr)) );
14371         assign( t2, getIReg(sz,gregOfRM(modrm)) );
14372         casLE( mkexpr(addr),
14373                mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
14374         putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
14375         delta += alen;
14376         DIP("xchg%c %s, %s\n", nameISize(sz),
14377                                nameIReg(sz,gregOfRM(modrm)), dis_buf);
14378      }
14379      break;
14380
14381   case 0x90: /* XCHG eAX,eAX */
14382      DIP("nop\n");
14383      break;
14384   case 0x91: /* XCHG eAX,eCX */
14385   case 0x92: /* XCHG eAX,eDX */
14386   case 0x93: /* XCHG eAX,eBX */
14387   case 0x94: /* XCHG eAX,eSP */
14388   case 0x95: /* XCHG eAX,eBP */
14389   case 0x96: /* XCHG eAX,eSI */
14390   case 0x97: /* XCHG eAX,eDI */
14391      codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
14392      break;
14393
14394   /* ------------------------ XLAT ----------------------- */
14395
14396   case 0xD7: /* XLAT */
      if (sz != 4) goto decode_failure; /* sz == 2 (with 0x66) is
                                           architecturally valid but
                                           isn't handled here */
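      /* XLAT is a one-byte table lookup: AL = *(UChar*)(EBX +
         zero-extend(AL)), honouring any segment override on the
         EBX-based address -- which is exactly what the expression
         below computes. */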
14398      putIReg(
14399         1,
14400         R_EAX/*AL*/,
14401         loadLE(Ity_I8,
14402                handleSegOverride(
14403                   sorb,
14404                   binop(Iop_Add32,
14405                         getIReg(4, R_EBX),
14406                         unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
14407
14408      DIP("xlat%c [ebx]\n", nameISize(sz));
14409      break;
14410
14411   /* ------------------------ IN / OUT ----------------------- */
14412
14413   case 0xE4: /* IN imm8, AL */
14414      sz = 1;
14415      t1 = newTemp(Ity_I32);
14416      abyte = getIByte(delta); delta++;
14417      assign(t1, mkU32( abyte & 0xFF ));
14418      DIP("in%c $%d,%s\n", nameISize(sz), abyte, nameIReg(sz,R_EAX));
14419      goto do_IN;
14420   case 0xE5: /* IN imm8, eAX */
14421      vassert(sz == 2 || sz == 4);
14422      t1 = newTemp(Ity_I32);
14423      abyte = getIByte(delta); delta++;
14424      assign(t1, mkU32( abyte & 0xFF ));
14425      DIP("in%c $%d,%s\n", nameISize(sz), abyte, nameIReg(sz,R_EAX));
14426      goto do_IN;
14427   case 0xEC: /* IN %DX, AL */
14428      sz = 1;
14429      t1 = newTemp(Ity_I32);
14430      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14431      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14432                                         nameIReg(sz,R_EAX));
14433      goto do_IN;
14434   case 0xED: /* IN %DX, eAX */
14435      vassert(sz == 2 || sz == 4);
14436      t1 = newTemp(Ity_I32);
14437      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14438      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14439                                         nameIReg(sz,R_EAX));
14440      goto do_IN;
14441   do_IN: {
14442      /* At this point, sz indicates the width, and t1 is a 32-bit
         value giving the port number. */
14444      IRDirty* d;
14445      vassert(sz == 1 || sz == 2 || sz == 4);
14446      ty = szToITy(sz);
14447      t2 = newTemp(Ity_I32);
14448      d = unsafeIRDirty_1_N(
14449             t2,
14450             0/*regparms*/,
14451             "x86g_dirtyhelper_IN",
14452             &x86g_dirtyhelper_IN,
14453             mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
14454          );
14455      /* do the call, dumping the result in t2. */
14456      stmt( IRStmt_Dirty(d) );
14457      putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
14458      break;
14459   }
14460
14461   case 0xE6: /* OUT AL, imm8 */
14462      sz = 1;
14463      t1 = newTemp(Ity_I32);
14464      abyte = getIByte(delta); delta++;
14465      assign( t1, mkU32( abyte & 0xFF ) );
14466      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), abyte);
14467      goto do_OUT;
14468   case 0xE7: /* OUT eAX, imm8 */
14469      vassert(sz == 2 || sz == 4);
14470      t1 = newTemp(Ity_I32);
14471      abyte = getIByte(delta); delta++;
14472      assign( t1, mkU32( abyte & 0xFF ) );
14473      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), abyte);
14474      goto do_OUT;
14475   case 0xEE: /* OUT AL, %DX */
14476      sz = 1;
14477      t1 = newTemp(Ity_I32);
14478      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14479      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14480                                          nameIReg(2,R_EDX));
14481      goto do_OUT;
14482   case 0xEF: /* OUT eAX, %DX */
14483      vassert(sz == 2 || sz == 4);
14484      t1 = newTemp(Ity_I32);
14485      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14486      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14487                                          nameIReg(2,R_EDX));
14488      goto do_OUT;
14489   do_OUT: {
14490      /* At this point, sz indicates the width, and t1 is a 32-bit
         value giving the port number. */
14492      IRDirty* d;
14493      vassert(sz == 1 || sz == 2 || sz == 4);
14494      ty = szToITy(sz);
14495      d = unsafeIRDirty_0_N(
14496             0/*regparms*/,
14497             "x86g_dirtyhelper_OUT",
14498             &x86g_dirtyhelper_OUT,
14499             mkIRExprVec_3( mkexpr(t1),
14500                            widenUto32( getIReg(sz, R_EAX) ),
14501                            mkU32(sz) )
14502          );
14503      stmt( IRStmt_Dirty(d) );
14504      break;
14505   }
14506
14507   /* ------------------------ (Grp1 extensions) ---------- */
14508
14509   case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
14510                 case 0x80, but only in 32-bit mode. */
14511      /* fallthru */
14512   case 0x80: /* Grp1 Ib,Eb */
14513      modrm = getIByte(delta);
14514      am_sz = lengthAMode(delta);
14515      sz    = 1;
14516      d_sz  = 1;
14517      d32   = getUChar(delta + am_sz);
14518      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14519      break;
14520
14521   case 0x81: /* Grp1 Iv,Ev */
14522      modrm = getIByte(delta);
14523      am_sz = lengthAMode(delta);
14524      d_sz  = sz;
14525      d32   = getUDisp(d_sz, delta + am_sz);
14526      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14527      break;
14528
14529   case 0x83: /* Grp1 Ib,Ev */
14530      modrm = getIByte(delta);
14531      am_sz = lengthAMode(delta);
14532      d_sz  = 1;
14533      d32   = getSDisp8(delta + am_sz);
14534      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14535      break;
14536
14537   /* ------------------------ (Grp2 extensions) ---------- */
14538
14539   case 0xC0: { /* Grp2 Ib,Eb */
14540      Bool decode_OK = True;
14541      modrm = getIByte(delta);
14542      am_sz = lengthAMode(delta);
14543      d_sz  = 1;
14544      d32   = getUChar(delta + am_sz);
14545      sz    = 1;
14546      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14547                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14548      if (!decode_OK)
14549         goto decode_failure;
14550      break;
14551   }
14552   case 0xC1: { /* Grp2 Ib,Ev */
14553      Bool decode_OK = True;
14554      modrm = getIByte(delta);
14555      am_sz = lengthAMode(delta);
14556      d_sz  = 1;
14557      d32   = getUChar(delta + am_sz);
14558      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14559                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14560      if (!decode_OK)
14561         goto decode_failure;
14562      break;
14563   }
14564   case 0xD0: { /* Grp2 1,Eb */
14565      Bool decode_OK = True;
14566      modrm = getIByte(delta);
14567      am_sz = lengthAMode(delta);
14568      d_sz  = 0;
14569      d32   = 1;
14570      sz    = 1;
14571      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14572                         mkU8(d32), NULL, &decode_OK );
14573      if (!decode_OK)
14574         goto decode_failure;
14575      break;
14576   }
14577   case 0xD1: { /* Grp2 1,Ev */
14578      Bool decode_OK = True;
14579      modrm = getUChar(delta);
14580      am_sz = lengthAMode(delta);
14581      d_sz  = 0;
14582      d32   = 1;
14583      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14584                         mkU8(d32), NULL, &decode_OK );
14585      if (!decode_OK)
14586         goto decode_failure;
14587      break;
14588   }
14589   case 0xD2: { /* Grp2 CL,Eb */
14590      Bool decode_OK = True;
14591      modrm = getUChar(delta);
14592      am_sz = lengthAMode(delta);
14593      d_sz  = 0;
14594      sz    = 1;
14595      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14596                         getIReg(1,R_ECX), "%cl", &decode_OK );
14597      if (!decode_OK)
14598         goto decode_failure;
14599      break;
14600   }
14601   case 0xD3: { /* Grp2 CL,Ev */
14602      Bool decode_OK = True;
14603      modrm = getIByte(delta);
14604      am_sz = lengthAMode(delta);
14605      d_sz  = 0;
14606      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14607                         getIReg(1,R_ECX), "%cl", &decode_OK );
14608      if (!decode_OK)
14609         goto decode_failure;
14610      break;
14611   }
14612
14613   /* ------------------------ (Grp3 extensions) ---------- */
14614
14615   case 0xF6: { /* Grp3 Eb */
14616      Bool decode_OK = True;
14617      delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
14618      if (!decode_OK)
14619         goto decode_failure;
14620      break;
14621   }
14622   case 0xF7: { /* Grp3 Ev */
14623      Bool decode_OK = True;
14624      delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
14625      if (!decode_OK)
14626         goto decode_failure;
14627      break;
14628   }
14629
14630   /* ------------------------ (Grp4 extensions) ---------- */
14631
14632   case 0xFE: { /* Grp4 Eb */
14633      Bool decode_OK = True;
14634      delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
14635      if (!decode_OK)
14636         goto decode_failure;
14637      break;
14638   }
14639
14640   /* ------------------------ (Grp5 extensions) ---------- */
14641
14642   case 0xFF: { /* Grp5 Ev */
14643      Bool decode_OK = True;
14644      delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
14645      if (!decode_OK)
14646         goto decode_failure;
14647      break;
14648   }
14649
14650   /* ------------------------ Escapes to 2-byte opcodes -- */
14651
14652   case 0x0F: {
14653      opc = getIByte(delta); delta++;
14654      switch (opc) {
14655
14656      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
14657
14658      case 0xBA: { /* Grp8 Ib,Ev */
14659         Bool decode_OK = False;
14660         modrm = getUChar(delta);
14661         am_sz = lengthAMode(delta);
14662         d32   = getSDisp8(delta + am_sz);
14663         delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
14664                                am_sz, sz, d32, &decode_OK );
14665         if (!decode_OK)
14666            goto decode_failure;
14667         break;
14668      }
14669
14670      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
14671
14672      case 0xBC: /* BSF Gv,Ev */
14673         delta = dis_bs_E_G ( sorb, sz, delta, True );
14674         break;
14675      case 0xBD: /* BSR Gv,Ev */
14676         delta = dis_bs_E_G ( sorb, sz, delta, False );
14677         break;
14678
14679      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
14680
14681      case 0xC8: /* BSWAP %eax */
14682      case 0xC9:
14683      case 0xCA:
14684      case 0xCB:
14685      case 0xCC:
14686      case 0xCD:
14687      case 0xCE:
14688      case 0xCF: /* BSWAP %edi */
14689         /* AFAICS from the Intel docs, this only exists at size 4. */
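         /* e.g. BSWAP turns 0x12345678 into 0x78563412. */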
14690         if (sz != 4) goto decode_failure;
14691
14692         t1 = newTemp(Ity_I32);
14693         assign( t1, getIReg(4, opc-0xC8) );
14694         t2 = math_BSWAP(t1, Ity_I32);
14695
14696         putIReg(4, opc-0xC8, mkexpr(t2));
14697         DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
14698         break;
14699
14700      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
14701
14702      case 0xA3: /* BT Gv,Ev */
14703         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
14704         break;
14705      case 0xB3: /* BTR Gv,Ev */
14706         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
14707         break;
14708      case 0xAB: /* BTS Gv,Ev */
14709         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
14710         break;
14711      case 0xBB: /* BTC Gv,Ev */
14712         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
14713         break;
14714
14715      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
14716
14717      case 0x40:
14718      case 0x41:
14719      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
14720      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
14721      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
14722      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
14723      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
14724      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
14725      case 0x48: /* CMOVSb (cmov negative) */
      case 0x49: /* CMOVNSb (cmov not negative) */
14727      case 0x4A: /* CMOVP (cmov parity even) */
14728      case 0x4B: /* CMOVNP (cmov parity odd) */
14729      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
14730      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
14731      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
14732      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
14733         delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
14734         break;
14735
14736      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
14737
14738      case 0xB0: /* CMPXCHG Gb,Eb */
14739         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
14740         break;
14741      case 0xB1: /* CMPXCHG Gv,Ev */
14742         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
14743         break;
14744
      case 0xC7: { /* CMPXCHG8B m64 (0F C7 /1) */
14746         IRTemp expdHi    = newTemp(Ity_I32);
14747         IRTemp expdLo    = newTemp(Ity_I32);
14748         IRTemp dataHi    = newTemp(Ity_I32);
14749         IRTemp dataLo    = newTemp(Ity_I32);
14750         IRTemp oldHi     = newTemp(Ity_I32);
14751         IRTemp oldLo     = newTemp(Ity_I32);
14752         IRTemp flags_old = newTemp(Ity_I32);
14753         IRTemp flags_new = newTemp(Ity_I32);
14754         IRTemp success   = newTemp(Ity_I1);
14755
14756         /* Translate this using a DCAS, even if there is no LOCK
14757            prefix.  Life is too short to bother with generating two
14758            different translations for the with/without-LOCK-prefix
14759            cases. */
14760         *expect_CAS = True;
14761
         /* Decode, and generate address. */
14763         if (sz != 4) goto decode_failure;
14764         modrm = getIByte(delta);
14765         if (epartIsReg(modrm)) goto decode_failure;
14766         if (gregOfRM(modrm) != 1) goto decode_failure;
14767         addr = disAMode ( &alen, sorb, delta, dis_buf );
14768         delta += alen;
14769
14770         /* Get the expected and new values. */
14771         assign( expdHi, getIReg(4,R_EDX) );
14772         assign( expdLo, getIReg(4,R_EAX) );
14773         assign( dataHi, getIReg(4,R_ECX) );
14774         assign( dataLo, getIReg(4,R_EBX) );
14775
14776         /* Do the DCAS */
14777         stmt( IRStmt_CAS(
14778                  mkIRCAS( oldHi, oldLo,
14779                           Iend_LE, mkexpr(addr),
14780                           mkexpr(expdHi), mkexpr(expdLo),
14781                           mkexpr(dataHi), mkexpr(dataLo)
14782               )));
14783
14784         /* success when oldHi:oldLo == expdHi:expdLo */
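         /* (that is, ((oldHi ^ expdHi) | (oldLo ^ expdLo)) == 0, which
            avoids needing a 64-bit comparison) */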
14785         assign( success,
14786                 binop(Iop_CasCmpEQ32,
14787                       binop(Iop_Or32,
14788                             binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
14789                             binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
14790                       ),
14791                       mkU32(0)
14792                 ));
14793
14794         /* If the DCAS is successful, that is to say oldHi:oldLo ==
14795            expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
14796            which is where they came from originally.  Both the actual
14797            contents of these two regs, and any shadow values, are
14798            unchanged.  If the DCAS fails then we're putting into
14799            EDX:EAX the value seen in memory. */
14800         putIReg(4, R_EDX,
14801                    IRExpr_ITE( mkexpr(success),
14802                                mkexpr(expdHi), mkexpr(oldHi)
14803                ));
14804         putIReg(4, R_EAX,
14805                    IRExpr_ITE( mkexpr(success),
14806                                mkexpr(expdLo), mkexpr(oldLo)
14807                ));
14808
14809         /* Copy the success bit into the Z flag and leave the others
14810            unchanged */
14811         assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
14812         assign(
14813            flags_new,
14814            binop(Iop_Or32,
14815                  binop(Iop_And32, mkexpr(flags_old),
14816                                   mkU32(~X86G_CC_MASK_Z)),
14817                  binop(Iop_Shl32,
14818                        binop(Iop_And32,
14819                              unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
14820                        mkU8(X86G_CC_SHIFT_Z)) ));
14821
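         /* With CC_OP set to X86G_CC_OP_COPY, CC_DEP1 holds the literal
            eflags value; CC_DEP2 is ignored. */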
14822         stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
14823         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
14824         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
14825         /* Set NDEP even though it isn't used.  This makes
14826            redundant-PUT elimination of previous stores to this field
14827            work better. */
14828         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
14829
14830         /* Sheesh.  Aren't you glad it was me and not you that had to
14831            write and validate all this grunge? */
14832
14833         DIP("cmpxchg8b %s\n", dis_buf);
14834         break;
14835      }
14836
14837      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
14838
14839      case 0xA2: { /* CPUID */
14840         /* Uses dirty helper:
14841               void x86g_dirtyhelper_CPUID_{sse0,mmxext,sse1,sse2,sse3} ( VexGuestX86State* )
14842            declared to mod eax and ecx, wr ebx and edx
14843         */
14844         IRDirty* d     = NULL;
14845         void*    fAddr = NULL;
14846         const HChar* fName = NULL;
14847         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3) {
14848            fName = "x86g_dirtyhelper_CPUID_sse3";
14849            fAddr = &x86g_dirtyhelper_CPUID_sse3;
14850         }
14851         else
14852         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
14853            fName = "x86g_dirtyhelper_CPUID_sse2";
14854            fAddr = &x86g_dirtyhelper_CPUID_sse2;
14855         }
14856         else
14857         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
14858            fName = "x86g_dirtyhelper_CPUID_sse1";
14859            fAddr = &x86g_dirtyhelper_CPUID_sse1;
14860         }
14861         else
14862         if (archinfo->hwcaps & VEX_HWCAPS_X86_MMXEXT) {
14863            fName = "x86g_dirtyhelper_CPUID_mmxext";
14864            fAddr = &x86g_dirtyhelper_CPUID_mmxext;
14865         }
14866         else
14867         if (archinfo->hwcaps == 0/*no SSE*/) {
14868            fName = "x86g_dirtyhelper_CPUID_sse0";
14869            fAddr = &x86g_dirtyhelper_CPUID_sse0;
14870         } else
14871            vpanic("disInstr(x86)(cpuid)");
14872
14873         vassert(fName); vassert(fAddr);
14874         d = unsafeIRDirty_0_N ( 0/*regparms*/,
14875                                 fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
14876         /* declare guest state effects */
14877         d->nFxState = 4;
14878         vex_bzero(&d->fxState, sizeof(d->fxState));
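         /* EAX and ECX are Ifx_Modify because CPUID reads the leaf and
            sub-leaf indexes from them; EBX and EDX are outputs only. */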
14879         d->fxState[0].fx     = Ifx_Modify;
14880         d->fxState[0].offset = OFFB_EAX;
14881         d->fxState[0].size   = 4;
14882         d->fxState[1].fx     = Ifx_Write;
14883         d->fxState[1].offset = OFFB_EBX;
14884         d->fxState[1].size   = 4;
14885         d->fxState[2].fx     = Ifx_Modify;
14886         d->fxState[2].offset = OFFB_ECX;
14887         d->fxState[2].size   = 4;
14888         d->fxState[3].fx     = Ifx_Write;
14889         d->fxState[3].offset = OFFB_EDX;
14890         d->fxState[3].size   = 4;
14891         /* execute the dirty call, side-effecting guest state */
14892         stmt( IRStmt_Dirty(d) );
14893         /* CPUID is a serialising insn.  So, just in case someone is
14894            using it as a memory fence ... */
14895         stmt( IRStmt_MBE(Imbe_Fence) );
14896         DIP("cpuid\n");
14897         break;
14898      }
14899
14900//--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
14901//--             goto decode_failure;
14902//--
14903//--          t1 = newTemp(cb);
14904//--          t2 = newTemp(cb);
14905//--          t3 = newTemp(cb);
14906//--          t4 = newTemp(cb);
14907//--          uInstr0(cb, CALLM_S, 0);
14908//--
14909//--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
14910//--          uInstr1(cb, PUSH,  4, TempReg, t1);
14911//--
14912//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
14913//--          uLiteral(cb, 0);
14914//--          uInstr1(cb, PUSH,  4, TempReg, t2);
14915//--
14916//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
14917//--          uLiteral(cb, 0);
14918//--          uInstr1(cb, PUSH,  4, TempReg, t3);
14919//--
14920//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
14921//--          uLiteral(cb, 0);
14922//--          uInstr1(cb, PUSH,  4, TempReg, t4);
14923//--
14924//--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
14925//--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
14926//--
14927//--          uInstr1(cb, POP,   4, TempReg, t4);
14928//--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
14929//--
14930//--          uInstr1(cb, POP,   4, TempReg, t3);
14931//--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
14932//--
14933//--          uInstr1(cb, POP,   4, TempReg, t2);
14934//--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
14935//--
14936//--          uInstr1(cb, POP,   4, TempReg, t1);
14937//--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
14938//--
14939//--          uInstr0(cb, CALLM_E, 0);
14940//--          DIP("cpuid\n");
14941//--          break;
14942//--
14943      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
14944
14945      case 0xB6: /* MOVZXb Eb,Gv */
14946         if (sz != 2 && sz != 4)
14947            goto decode_failure;
14948         delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
14949         break;
14950
14951      case 0xB7: /* MOVZXw Ew,Gv */
14952         if (sz != 4)
14953            goto decode_failure;
14954         delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
14955         break;
14956
14957      case 0xBE: /* MOVSXb Eb,Gv */
14958         if (sz != 2 && sz != 4)
14959            goto decode_failure;
14960         delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
14961         break;
14962
14963      case 0xBF: /* MOVSXw Ew,Gv */
14964         if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
14965            goto decode_failure;
14966         delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
14967         break;
14968
14969//--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
14970//--
14971//--       case 0xC3: /* MOVNTI Gv,Ev */
14972//--          vg_assert(sz == 4);
14973//--          modrm = getUChar(eip);
14974//--          vg_assert(!epartIsReg(modrm));
14975//--          t1 = newTemp(cb);
14976//--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
14977//--          pair = disAMode ( cb, sorb, eip, dis_buf );
14978//--          t2 = LOW24(pair);
14979//--          eip += HI8(pair);
14980//--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
14981//--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
14982//--          break;
14983
14984      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
14985
14986      case 0xAF: /* IMUL Ev, Gv */
14987         delta = dis_mul_E_G ( sorb, sz, delta );
14988         break;
14989
14990      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
14991
14992      case 0x1F:
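         /* 0F 1F: long NOP.  Only the memory forms are handled here; the
            amode is decoded purely to establish the insn's length. */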
14993         modrm = getUChar(delta);
14994         if (epartIsReg(modrm)) goto decode_failure;
14995         addr = disAMode ( &alen, sorb, delta, dis_buf );
14996         delta += alen;
14997         DIP("nop%c %s\n", nameISize(sz), dis_buf);
14998         break;
14999
15000      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
15001      case 0x80:
15002      case 0x81:
15003      case 0x82: /* JBb/JNAEb (jump below) */
15004      case 0x83: /* JNBb/JAEb (jump not below) */
15005      case 0x84: /* JZb/JEb (jump zero) */
15006      case 0x85: /* JNZb/JNEb (jump not zero) */
15007      case 0x86: /* JBEb/JNAb (jump below or equal) */
15008      case 0x87: /* JNBEb/JAb (jump not below or equal) */
15009      case 0x88: /* JSb (jump negative) */
15010      case 0x89: /* JNSb (jump not negative) */
15011      case 0x8A: /* JP (jump parity even) */
15012      case 0x8B: /* JNP/JPO (jump parity odd) */
15013      case 0x8C: /* JLb/JNGEb (jump less) */
15014      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
15015      case 0x8E: /* JLEb/JNGb (jump less or equal) */
15016      case 0x8F: /* JGb/JNLEb (jump greater) */
15017       { Int    jmpDelta;
15018         const HChar* comment  = "";
15019         jmpDelta = (Int)getUDisp32(delta);
15020         d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
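         /* d32 is now the absolute target: the address of the next insn
            (guest_EIP_bbstart + delta + 4) plus the signed displacement. */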
15021         delta += 4;
15022         if (resteerCisOk
15023             && vex_control.guest_chase_cond
15024             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
15025             && jmpDelta < 0
15026             && resteerOkFn( callback_opaque, (Addr32)d32) ) {
15027            /* Speculation: assume this backward branch is taken.  So
15028               we need to emit a side-exit to the insn following this
15029               one, on the negation of the condition, and continue at
15030               the branch target address (d32).  If we wind up back at
15031               the first instruction of the trace, just stop; it's
15032               better to let the IR loop unroller handle that case.*/
15033            stmt( IRStmt_Exit(
15034                     mk_x86g_calculate_condition((X86Condcode)
15035                                                 (1 ^ (opc - 0x80))),
15036                     Ijk_Boring,
15037                     IRConst_U32(guest_EIP_bbstart+delta),
15038                     OFFB_EIP ) );
15039            dres.whatNext   = Dis_ResteerC;
15040            dres.continueAt = (Addr32)d32;
15041            comment = "(assumed taken)";
15042         }
15043         else
15044         if (resteerCisOk
15045             && vex_control.guest_chase_cond
15046             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
15047             && jmpDelta >= 0
15048             && resteerOkFn( callback_opaque,
15049                             (Addr32)(guest_EIP_bbstart+delta)) ) {
15050            /* Speculation: assume this forward branch is not taken.
15051               So we need to emit a side-exit to d32 (the dest) and
15052               continue disassembling at the insn immediately
15053               following this one. */
15054            stmt( IRStmt_Exit(
15055                     mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
15056                     Ijk_Boring,
15057                     IRConst_U32(d32),
15058                     OFFB_EIP ) );
15059            dres.whatNext   = Dis_ResteerC;
15060            dres.continueAt = guest_EIP_bbstart + delta;
15061            comment = "(assumed not taken)";
15062         }
15063         else {
15064            /* Conservative default translation - end the block at
15065               this point. */
15066            jcc_01( &dres, (X86Condcode)(opc - 0x80),
15067                    (Addr32)(guest_EIP_bbstart+delta), d32);
15068            vassert(dres.whatNext == Dis_StopHere);
15069         }
15070         DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
15071         break;
15072       }
15073
15074      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
15075      case 0x31: { /* RDTSC */
15076         IRTemp   val  = newTemp(Ity_I64);
15077         IRExpr** args = mkIRExprVec_0();
15078         IRDirty* d    = unsafeIRDirty_1_N (
15079                            val,
15080                            0/*regparms*/,
15081                            "x86g_dirtyhelper_RDTSC",
15082                            &x86g_dirtyhelper_RDTSC,
15083                            args
15084                         );
15085         /* execute the dirty call, dumping the result in val. */
15086         stmt( IRStmt_Dirty(d) );
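         /* The helper returns the 64-bit timestamp counter; split it into
            EDX (high half) and EAX (low half) as the insn requires. */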
15087         putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
15088         putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
15089         DIP("rdtsc\n");
15090         break;
15091      }
15092
15093      /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
15094
15095      case 0xA1: /* POP %FS */
15096         dis_pop_segreg( R_FS, sz ); break;
15097      case 0xA9: /* POP %GS */
15098         dis_pop_segreg( R_GS, sz ); break;
15099
15100      case 0xA0: /* PUSH %FS */
15101         dis_push_segreg( R_FS, sz ); break;
15102      case 0xA8: /* PUSH %GS */
15103         dis_push_segreg( R_GS, sz ); break;
15104
15105      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
15106      case 0x90:
15107      case 0x91:
15108      case 0x92: /* set-Bb/set-NAEb (set if below) */
15109      case 0x93: /* set-NBb/set-AEb (set if not below) */
15110      case 0x94: /* set-Zb/set-Eb (set if zero) */
15111      case 0x95: /* set-NZb/set-NEb (set if not zero) */
15112      case 0x96: /* set-BEb/set-NAb (set if below or equal) */
15113      case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
15114      case 0x98: /* set-Sb (set if negative) */
15115      case 0x99: /* set-NSb (set if not negative) */
15116      case 0x9A: /* set-P (set if parity even) */
15117      case 0x9B: /* set-NP (set if parity odd) */
15118      case 0x9C: /* set-Lb/set-NGEb (set if less) */
15119      case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
15120      case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
15121      case 0x9F: /* set-Gb/set-NLEb (set if greater) */
15122         t1 = newTemp(Ity_I8);
15123         assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
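         /* t1 is the condition's outcome (0 or 1) widened to a byte. */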
15124         modrm = getIByte(delta);
15125         if (epartIsReg(modrm)) {
15126            delta++;
15127            putIReg(1, eregOfRM(modrm), mkexpr(t1));
15128            DIP("set%s %s\n", name_X86Condcode(opc-0x90),
15129                              nameIReg(1,eregOfRM(modrm)));
15130         } else {
15131           addr = disAMode ( &alen, sorb, delta, dis_buf );
15132           delta += alen;
15133           storeLE( mkexpr(addr), mkexpr(t1) );
15134           DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
15135         }
15136         break;
15137
15138      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
15139
15140      case 0xA4: /* SHLDv imm8,Gv,Ev */
15141         modrm = getIByte(delta);
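         /* The imm8 shift count follows the amode; d32 is used here as the
            offset of that byte, not as a displacement.  SHRD imm8 below
            uses the same scheme. */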
15142         d32   = delta + lengthAMode(delta);
15143         vex_sprintf(dis_buf, "$%d", getIByte(d32));
15144         delta = dis_SHLRD_Gv_Ev (
15145                  sorb, delta, modrm, sz,
15146                  mkU8(getIByte(d32)), True, /* literal */
15147                  dis_buf, True );
15148         break;
15149      case 0xA5: /* SHLDv %cl,Gv,Ev */
15150         modrm = getIByte(delta);
15151         delta = dis_SHLRD_Gv_Ev (
15152                    sorb, delta, modrm, sz,
15153                    getIReg(1,R_ECX), False, /* not literal */
15154                    "%cl", True );
15155         break;
15156
15157      case 0xAC: /* SHRDv imm8,Gv,Ev */
15158         modrm = getIByte(delta);
15159         d32   = delta + lengthAMode(delta);
15160         vex_sprintf(dis_buf, "$%d", getIByte(d32));
15161         delta = dis_SHLRD_Gv_Ev (
15162                    sorb, delta, modrm, sz,
15163                    mkU8(getIByte(d32)), True, /* literal */
15164                    dis_buf, False );
15165         break;
15166      case 0xAD: /* SHRDv %cl,Gv,Ev */
15167         modrm = getIByte(delta);
15168         delta = dis_SHLRD_Gv_Ev (
15169                    sorb, delta, modrm, sz,
15170                    getIReg(1,R_ECX), False, /* not literal */
15171                    "%cl", False );
15172         break;
15173
15174      /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
15175
15176      case 0x34:
15177         /* Simple implementation needing a long explanation.
15178
15179            sysenter is a kind of syscall entry.  The key thing here
15180            is that the return address is not known -- that is
15181            something that is beyond Vex's knowledge.  So this IR
15182            forces a return to the scheduler, which can do what it
15183            likes to simulate the sysenter, but it MUST set this
15184            thread's guest_EIP field with the continuation address
15185            before resuming execution.  If that doesn't happen, the
15186            thread will jump to address zero, which is probably
15187            fatal.
15188         */
15189
15190         /* Note where we are, so we can back up the guest to this
15191            point if the syscall needs to be restarted. */
15192         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
15193                           mkU32(guest_EIP_curr_instr) ) );
15194         jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
15195         vassert(dres.whatNext == Dis_StopHere);
15196         DIP("sysenter\n");
15197         break;
15198
15199      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
15200
15201      case 0xC0: { /* XADD Gb,Eb */
15202         Bool decodeOK;
15203         delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
15204         if (!decodeOK) goto decode_failure;
15205         break;
15206      }
15207      case 0xC1: { /* XADD Gv,Ev */
15208         Bool decodeOK;
15209         delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
15210         if (!decodeOK) goto decode_failure;
15211         break;
15212      }
15213
15214      /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
15215
15216      case 0x71:
15217      case 0x72:
15218      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
15219
15220      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
15221      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
15222      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
15223      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
15224
15225      case 0xFC:
15226      case 0xFD:
15227      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
15228
15229      case 0xEC:
15230      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
15231
15232      case 0xDC:
15233      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15234
15235      case 0xF8:
15236      case 0xF9:
15237      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
15238
15239      case 0xE8:
15240      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
15241
15242      case 0xD8:
15243      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15244
15245      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
15246      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
15247
15248      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
15249
15250      case 0x74:
15251      case 0x75:
15252      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
15253
15254      case 0x64:
15255      case 0x65:
15256      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
15257
15258      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
15259      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
15260      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
15261
15262      case 0x68:
15263      case 0x69:
15264      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
15265
15266      case 0x60:
15267      case 0x61:
15268      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
15269
15270      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
15271      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
15272      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
15273      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
15274
15275      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
15276      case 0xF2:
15277      case 0xF3:
15278
15279      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
15280      case 0xD2:
15281      case 0xD3:
15282
15283      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
15284      case 0xE2:
15285      {
15286         Int  delta0    = delta-1;
15287         Bool decode_OK = False;
15288
15289         /* If sz==2 this is SSE, and we assume the SSE decoder has
15290            already spotted those cases by now. */
15291         if (sz != 4)
15292            goto decode_failure;
15293
15294         delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
15295         if (!decode_OK) {
15296            delta = delta0;
15297            goto decode_failure;
15298         }
15299         break;
15300      }
15301
15302      case 0x0E: /* FEMMS */
15303      case 0x77: /* EMMS */
15304         if (sz != 4)
15305            goto decode_failure;
15306         do_EMMS_preamble();
15307         DIP("{f}emms\n");
15308         break;
15309
15310      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
15311      case 0x01: /* 0F 01 /0 -- SGDT */
15312                 /* 0F 01 /1 -- SIDT */
15313      {
15314          /* This is really revolting, but ... since each processor
15315             (core) only has one IDT and one GDT, just let the guest
15316             see it (pass-through semantics).  I can't see any way to
15317             construct a faked-up value, so don't bother to try. */
15318         modrm = getUChar(delta);
15319         addr = disAMode ( &alen, sorb, delta, dis_buf );
15320         delta += alen;
15321         if (epartIsReg(modrm)) goto decode_failure;
15322         if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
15323            goto decode_failure;
15324         switch (gregOfRM(modrm)) {
15325            case 0: DIP("sgdt %s\n", dis_buf); break;
15326            case 1: DIP("sidt %s\n", dis_buf); break;
15327            default: vassert(0); /*NOTREACHED*/
15328         }
15329
15330         IRDirty* d = unsafeIRDirty_0_N (
15331                          0/*regparms*/,
15332                          "x86g_dirtyhelper_SxDT",
15333                          &x86g_dirtyhelper_SxDT,
15334                          mkIRExprVec_2( mkexpr(addr),
15335                                         mkU32(gregOfRM(modrm)) )
15336                      );
15337         /* declare we're writing memory */
15338         d->mFx   = Ifx_Write;
15339         d->mAddr = mkexpr(addr);
15340         d->mSize = 6;
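         /* (6 bytes: the 16-bit limit followed by the 32-bit linear base
            of the descriptor table.) */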
15341         stmt( IRStmt_Dirty(d) );
15342         break;
15343      }
15344
15345      case 0x05: /* AMD's syscall */
15346         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
15347                           mkU32(guest_EIP_curr_instr) ) );
15348         jmp_lit(&dres, Ijk_Sys_syscall, ((Addr32)guest_EIP_bbstart)+delta);
15349         vassert(dres.whatNext == Dis_StopHere);
15350         DIP("syscall\n");
15351         break;
15352
15353      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
15354
15355      default:
15356         goto decode_failure;
15357   } /* switch (opc) for the 2-byte opcodes */
15358   goto decode_success;
15359   } /* case 0x0F: of primary opcode */
15360
15361   /* ------------------------ ??? ------------------------ */
15362
15363  default:
15364  decode_failure:
15365   /* All decode failures end up here. */
15366   if (sigill_diag) {
15367      vex_printf("vex x86->IR: unhandled instruction bytes: "
15368                 "0x%x 0x%x 0x%x 0x%x\n",
15369                 getIByte(delta_start+0),
15370                 getIByte(delta_start+1),
15371                 getIByte(delta_start+2),
15372                 getIByte(delta_start+3));
15373   }
15374
15375   /* Tell the dispatcher that this insn cannot be decoded, and so has
15376      not been executed, and (is currently) the next to be executed.
15377      EIP should be up-to-date since it is made so at the start of each
15378      insn, but nevertheless be paranoid and update it again right
15379      now. */
15380   stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
15381   jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
15382   vassert(dres.whatNext == Dis_StopHere);
15383   dres.len = 0;
15384   /* We also need to say that a CAS is not expected now, regardless
15385      of what it might have been set to at the start of the function,
15386      since the IR that we've emitted just above (to synthesise a
15387      SIGILL) does not involve any CAS, and presumably no other IR has
15388      been emitted for this (non-decoded) insn. */
15389   *expect_CAS = False;
15390   return dres;
15391
15392   } /* switch (opc) for the main (primary) opcode switch. */
15393
15394  decode_success:
15395   /* All decode successes end up here. */
15396   switch (dres.whatNext) {
15397      case Dis_Continue:
15398         stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
15399         break;
15400      case Dis_ResteerU:
15401      case Dis_ResteerC:
15402         stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
15403         break;
15404      case Dis_StopHere:
15405         break;
15406      default:
15407         vassert(0);
15408   }
15409
15410   DIP("\n");
15411   dres.len = delta - delta_start;
15412   return dres;
15413}
15414
15415#undef DIP
15416#undef DIS
15417
15418
15419/*------------------------------------------------------------*/
15420/*--- Top-level fn                                         ---*/
15421/*------------------------------------------------------------*/
15422
15423/* Disassemble a single instruction into IR.  The instruction
15424   is located in host memory at &guest_code[delta]. */
15425
15426DisResult disInstr_X86 ( IRSB*        irsb_IN,
15427                         Bool         (*resteerOkFn) ( void*, Addr ),
15428                         Bool         resteerCisOk,
15429                         void*        callback_opaque,
15430                         const UChar* guest_code_IN,
15431                         Long         delta,
15432                         Addr         guest_IP,
15433                         VexArch      guest_arch,
15434                         const VexArchInfo* archinfo,
15435                         const VexAbiInfo*  abiinfo,
15436                         VexEndness   host_endness_IN,
15437                         Bool         sigill_diag_IN )
15438{
15439   Int       i, x1, x2;
15440   Bool      expect_CAS, has_CAS;
15441   DisResult dres;
15442
15443   /* Set globals (see top of this file) */
15444   vassert(guest_arch == VexArchX86);
15445   guest_code           = guest_code_IN;
15446   irsb                 = irsb_IN;
15447   host_endness         = host_endness_IN;
15448   guest_EIP_curr_instr = (Addr32)guest_IP;
15449   guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
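   /* delta is the offset of this insn from the start of the superblock,
      so guest_IP - delta is the superblock's start address. */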
15450
15451   x1 = irsb_IN->stmts_used;
15452   expect_CAS = False;
15453   dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
15454                             resteerCisOk,
15455                             callback_opaque,
15456                             delta, archinfo, abiinfo, sigill_diag_IN );
15457   x2 = irsb_IN->stmts_used;
15458   vassert(x2 >= x1);
15459
15460   /* See comment at the top of disInstr_X86_WRK for meaning of
15461      expect_CAS.  Here, we (sanity-)check for the presence/absence of
15462      IRCAS as directed by the returned expect_CAS value. */
15463   has_CAS = False;
15464   for (i = x1; i < x2; i++) {
15465      if (irsb_IN->stmts[i]->tag == Ist_CAS)
15466         has_CAS = True;
15467   }
15468
15469   if (expect_CAS != has_CAS) {
15470      /* inconsistency detected.  re-disassemble the instruction so as
15471         to generate a useful error message; then assert. */
15472      vex_traceflags |= VEX_TRACE_FE;
15473      dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
15474                                resteerCisOk,
15475                                callback_opaque,
15476                                delta, archinfo, abiinfo, sigill_diag_IN );
15477      for (i = x1; i < x2; i++) {
15478         vex_printf("\t\t");
15479         ppIRStmt(irsb_IN->stmts[i]);
15480         vex_printf("\n");
15481      }
15482      /* Failure of this assertion is serious and denotes a bug in
15483         disInstr. */
15484      vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
15485   }
15486
15487   return dres;
15488}
15489
15490
15491/*--------------------------------------------------------------------*/
15492/*--- end                                         guest_x86_toIR.c ---*/
15493/*--------------------------------------------------------------------*/
15494