1
2/*--------------------------------------------------------------------*/
3/*--- begin                                       guest_x86_toIR.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2013 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36/* Translates x86 code to IR. */
37
38/* TODO:
39
40   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
41   to ensure a 32-bit value is being written.
42
43   FUCOMI(P): what happens to A and S flags?  Currently are forced
44      to zero.
45
46   x87 FP Limitations:
47
48   * all arithmetic done at 64 bits
49
50   * no FP exceptions, except for handling stack over/underflow
51
52   * FP rounding mode observed only for float->int conversions
53     and int->float conversions which could lose accuracy, and
54     for float-to-float rounding.  For all other operations,
55     round-to-nearest is used, regardless.
56
57   * some of the FCOM cases could do with testing -- not convinced
58     that the args are the right way round.
59
60   * FSAVE does not re-initialise the FPU; it should do
61
62   * FINIT not only initialises the FPU environment, it also
63     zeroes all the FP registers.  It should leave the registers
64     unchanged.
65
66   SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
67   per Intel docs this bit has no meaning anyway.  Since PUSHF is the
68   only way to observe eflags[1], a proper fix would be to make that
69   bit be set by PUSHF.
70
71   The state of %eflags.AC (alignment check, bit 18) is recorded by
72   the simulation (viz, if you set it with popf then a pushf produces
73   the value you set it to), but it is otherwise ignored.  In
74   particular, setting it to 1 does NOT cause alignment checking to
75   happen.  Programs that set it to 1 and then rely on the resulting
76   SIGBUSs to inform them of misaligned accesses will not work.
77
78   Implementation of sysenter is necessarily partial.  sysenter is a
79   kind of system call entry.  When doing a sysenter, the return
80   address is not known -- that is something that is beyond Vex's
81   knowledge.  So the generated IR forces a return to the scheduler,
   which can do what it likes to simulate the sysenter, but it MUST
83   set this thread's guest_EIP field with the continuation address
84   before resuming execution.  If that doesn't happen, the thread will
85   jump to address zero, which is probably fatal.
86
87   This module uses global variables and so is not MT-safe (if that
88   should ever become relevant).
89
90   The delta values are 32-bit ints, not 64-bit ints.  That means
91   this module may not work right if run on a 64-bit host.  That should
92   be fixed properly, really -- if anyone ever wants to use Vex to
93   translate x86 code for execution on a 64-bit host.
94
95   casLE (implementation of lock-prefixed insns) and rep-prefixed
96   insns: the side-exit back to the start of the insn is done with
97   Ijk_Boring.  This is quite wrong, it should be done with
98   Ijk_NoRedir, since otherwise the side exit, which is intended to
99   restart the instruction for whatever reason, could go somewhere
100   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
101   no-redir jumps performance critical, at least for rep-prefixed
102   instructions, since all iterations thereof would involve such a
103   jump.  It's not such a big deal with casLE since the side exit is
104   only taken if the CAS fails, that is, the location is contended,
105   which is relatively unlikely.
106
107   XXXX: Nov 2009: handling of SWP on ARM suffers from the same
108   problem.
109
110   Note also, the test for CAS success vs failure is done using
111   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
112   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
113   shouldn't definedness-check these comparisons.  See
114   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
115   background/rationale.
116*/
117
118/* Performance holes:
119
120   - fcom ; fstsw %ax ; sahf
121     sahf does not update the O flag (sigh) and so O needs to
122     be computed.  This is done expensively; it would be better
123     to have a calculate_eflags_o helper.
124
125   - emwarns; some FP codes can generate huge numbers of these
126     if the fpucw is changed in an inner loop.  It would be
127     better for the guest state to have an emwarn-enable reg
128     which can be set zero or nonzero.  If it is zero, emwarns
129     are not flagged, and instead control just flows all the
130     way through bbs as usual.
131*/
132
133/* "Special" instructions.
134
135   This instruction decoder can decode three special instructions
136   which mean nothing natively (are no-ops as far as regs/mem are
137   concerned) but have meaning for supporting Valgrind.  A special
138   instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
139   C1C713 (in the standard interpretation, that means: roll $3, %edi;
140   roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
141   one of the following 3 are allowed (standard interpretation in
142   parentheses):
143
144      87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
145      87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
146      87D2 (xchgl %edx,%edx)   call-noredir *%EAX
147      87FF (xchgl %edi,%edi)   IR injection
148
149   Any other bytes following the 12-byte preamble are illegal and
150   constitute a failure in instruction decoding.  This all assumes
151   that the preamble will never occur except in specific code
152   fragments designed for Valgrind to catch.
153
154   No prefixes may precede a "Special" instruction.
155*/
156
157/* LOCK prefixed instructions.  These are translated using IR-level
158   CAS statements (IRCAS) and are believed to preserve atomicity, even
159   from the point of view of some other process racing against a
160   simulated one (presumably they communicate via a shared memory
161   segment).
162
163   Handlers which are aware of LOCK prefixes are:
164      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
165      dis_cmpxchg_G_E  (cmpxchg)
166      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
167      dis_Grp3         (not, neg)
168      dis_Grp4         (inc, dec)
169      dis_Grp5         (inc, dec)
170      dis_Grp8_Imm     (bts, btc, btr)
171      dis_bt_G_E       (bts, btc, btr)
172      dis_xadd_G_E     (xadd)
173*/
174
175
176#include "libvex_basictypes.h"
177#include "libvex_ir.h"
178#include "libvex.h"
179#include "libvex_guest_x86.h"
180
181#include "main_util.h"
182#include "main_globals.h"
183#include "guest_generic_bb_to_IR.h"
184#include "guest_generic_x87.h"
185#include "guest_x86_defs.h"
186
187
188/*------------------------------------------------------------*/
189/*--- Globals                                              ---*/
190/*------------------------------------------------------------*/
191
192/* These are set at the start of the translation of an insn, right
193   down in disInstr_X86, so that we don't have to pass them around
194   endlessly.  They are all constant during the translation of any
195   given insn. */
196
197/* We need to know this to do sub-register accesses correctly. */
198static Bool host_is_bigendian;
199
200/* Pointer to the guest code area (points to start of BB, not to the
201   insn being processed). */
202static UChar* guest_code;
203
204/* The guest address corresponding to guest_code[0]. */
205static Addr32 guest_EIP_bbstart;
206
207/* The guest address for the instruction currently being
208   translated. */
209static Addr32 guest_EIP_curr_instr;
210
211/* The IRSB* into which we're generating code. */
212static IRSB* irsb;
213
214
215/*------------------------------------------------------------*/
216/*--- Debugging output                                     ---*/
217/*------------------------------------------------------------*/
218
219#define DIP(format, args...)           \
220   if (vex_traceflags & VEX_TRACE_FE)  \
221      vex_printf(format, ## args)
222
223#define DIS(buf, format, args...)      \
224   if (vex_traceflags & VEX_TRACE_FE)  \
225      vex_sprintf(buf, format, ## args)
226
227
228/*------------------------------------------------------------*/
229/*--- Offsets of various parts of the x86 guest state.     ---*/
230/*------------------------------------------------------------*/
231
232#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
233#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
234#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
235#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
236#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
237#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
238#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
239#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
240
241#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
242
243#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
244#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
245#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
246#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
247
248#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
249#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
250#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
251#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
252#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
253#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
254#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
255#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
256
257#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
258#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
259#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
260#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
261#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
262#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
263#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
264#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
265
266#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
267#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
268#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
269#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
270#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
271#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
272#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
273#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
274#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
275
276#define OFFB_EMNOTE    offsetof(VexGuestX86State,guest_EMNOTE)
277
278#define OFFB_CMSTART   offsetof(VexGuestX86State,guest_CMSTART)
279#define OFFB_CMLEN     offsetof(VexGuestX86State,guest_CMLEN)
280#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
281
282#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
283
284
285/*------------------------------------------------------------*/
286/*--- Helper bits and pieces for deconstructing the        ---*/
287/*--- x86 insn stream.                                     ---*/
288/*------------------------------------------------------------*/
289
290/* This is the Intel register encoding -- integer regs. */
291#define R_EAX 0
292#define R_ECX 1
293#define R_EDX 2
294#define R_EBX 3
295#define R_ESP 4
296#define R_EBP 5
297#define R_ESI 6
298#define R_EDI 7
299
300#define R_AL (0+R_EAX)
301#define R_AH (4+R_EAX)
302
303/* This is the Intel register encoding -- segment regs. */
304#define R_ES 0
305#define R_CS 1
306#define R_SS 2
307#define R_DS 3
308#define R_FS 4
309#define R_GS 5
310
311
/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   /* 'irsb' is the file-global IRSB currently being filled in. */
   addStmtToIRSB( irsb, st );
}
317
318/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   /* Reject implausible/garbage IRType values before allocating. */
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}
324
325/* Various simple conversions */
326
327static UInt extend_s_8to32( UInt x )
328{
329   return (UInt)((((Int)x) << 24) >> 24);
330}
331
332static UInt extend_s_16to32 ( UInt x )
333{
334   return (UInt)((((Int)x) << 16) >> 16);
335}
336
337/* Fetch a byte from the guest insn stream. */
static UChar getIByte ( Int delta )
{
   /* 'delta' is a byte offset from the start of the guest code for
      this BB, i.e. from guest_code[0]. */
   return guest_code[delta];
}
342
343/* Extract the reg field from a modRM byte. */
344static Int gregOfRM ( UChar mod_reg_rm )
345{
346   return (Int)( (mod_reg_rm >> 3) & 7 );
347}
348
349/* Figure out whether the mod and rm parts of a modRM byte refer to a
350   register or memory.  If so, the byte will have the form 11XXXYYY,
351   where YYY is the register number. */
352static Bool epartIsReg ( UChar mod_reg_rm )
353{
354   return toBool(0xC0 == (mod_reg_rm & 0xC0));
355}
356
357/* ... and extract the register number ... */
358static Int eregOfRM ( UChar mod_reg_rm )
359{
360   return (Int)(mod_reg_rm & 0x7);
361}
362
363/* Get a 8/16/32-bit unsigned value out of the insn stream. */
364
365static UChar getUChar ( Int delta )
366{
367   UChar v = guest_code[delta+0];
368   return toUChar(v);
369}
370
371static UInt getUDisp16 ( Int delta )
372{
373   UInt v = guest_code[delta+1]; v <<= 8;
374   v |= guest_code[delta+0];
375   return v & 0xFFFF;
376}
377
378static UInt getUDisp32 ( Int delta )
379{
380   UInt v = guest_code[delta+3]; v <<= 8;
381   v |= guest_code[delta+2]; v <<= 8;
382   v |= guest_code[delta+1]; v <<= 8;
383   v |= guest_code[delta+0];
384   return v;
385}
386
387static UInt getUDisp ( Int size, Int delta )
388{
389   switch (size) {
390      case 4: return getUDisp32(delta);
391      case 2: return getUDisp16(delta);
392      case 1: return (UInt)getUChar(delta);
393      default: vpanic("getUDisp(x86)");
394   }
395   return 0; /*notreached*/
396}
397
398
399/* Get a byte value out of the insn stream and sign-extend to 32
400   bits. */
static UInt getSDisp8 ( Int delta )
{
   /* The byte is an 8-bit signed displacement, hence the sign
      extension. */
   return extend_s_8to32( (UInt) (guest_code[delta]) );
}
405
406static UInt getSDisp16 ( Int delta0 )
407{
408   UChar* eip = (UChar*)(&guest_code[delta0]);
409   UInt d = *eip++;
410   d |= ((*eip++) << 8);
411   return extend_s_16to32(d);
412}
413
414static UInt getSDisp ( Int size, Int delta )
415{
416   switch (size) {
417      case 4: return getUDisp32(delta);
418      case 2: return getSDisp16(delta);
419      case 1: return getSDisp8(delta);
420      default: vpanic("getSDisp(x86)");
421  }
422  return 0; /*notreached*/
423}
424
425
426/*------------------------------------------------------------*/
427/*--- Helpers for constructing IR.                         ---*/
428/*------------------------------------------------------------*/
429
430/* Create a 1/2/4 byte read of an x86 integer registers.  For 16/8 bit
431   register references, we need to take the host endianness into
432   account.  Supplied value is 0 .. 7 and in the Intel instruction
433   encoding. */
434
435static IRType szToITy ( Int n )
436{
437   switch (n) {
438      case 1: return Ity_I8;
439      case 2: return Ity_I16;
440      case 4: return Ity_I32;
441      default: vpanic("szToITy(x86)");
442   }
443}
444
/* On a little-endian host, less significant bits of the guest
   registers are at lower addresses.  Therefore, a reference to a
   register low half has the same guest state offset as a reference
   to the full register.
*/
static Int integerGuestRegOffset ( Int sz, UInt archreg )
{
   vassert(archreg < 8);

   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);

   /* Full-width (4), low-half (2) and low-byte (1, regs 0..3, i.e.
      %al/%cl/%dl/%bl) references all use the offset of the full
      register, since on a little-endian host the least significant
      bytes sit at the lowest addresses. */
   if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
      switch (archreg) {
         case R_EAX: return OFFB_EAX;
         case R_EBX: return OFFB_EBX;
         case R_ECX: return OFFB_ECX;
         case R_EDX: return OFFB_EDX;
         case R_ESI: return OFFB_ESI;
         case R_EDI: return OFFB_EDI;
         case R_ESP: return OFFB_ESP;
         case R_EBP: return OFFB_EBP;
         default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
      }
   }

   /* Remaining case: byte references to encodings 4..7, which in the
      Intel scheme denote the high-byte registers %ah/%ch/%dh/%bh --
      one byte above the base of %eax/%ecx/%edx/%ebx respectively. */
   vassert(archreg >= 4 && archreg < 8 && sz == 1);
   switch (archreg-4) {
      case R_EAX: return 1+ OFFB_EAX;
      case R_EBX: return 1+ OFFB_EBX;
      case R_ECX: return 1+ OFFB_ECX;
      case R_EDX: return 1+ OFFB_EDX;
      default: vpanic("integerGuestRegOffset(x86,le)(1h)");
   }

   /* NOTREACHED */
   vpanic("integerGuestRegOffset(x86,le)");
}
483
484static Int segmentGuestRegOffset ( UInt sreg )
485{
486   switch (sreg) {
487      case R_ES: return OFFB_ES;
488      case R_CS: return OFFB_CS;
489      case R_SS: return OFFB_SS;
490      case R_DS: return OFFB_DS;
491      case R_FS: return OFFB_FS;
492      case R_GS: return OFFB_GS;
493      default: vpanic("segmentGuestRegOffset(x86)");
494   }
495}
496
497static Int xmmGuestRegOffset ( UInt xmmreg )
498{
499   switch (xmmreg) {
500      case 0: return OFFB_XMM0;
501      case 1: return OFFB_XMM1;
502      case 2: return OFFB_XMM2;
503      case 3: return OFFB_XMM3;
504      case 4: return OFFB_XMM4;
505      case 5: return OFFB_XMM5;
506      case 6: return OFFB_XMM6;
507      case 7: return OFFB_XMM7;
508      default: vpanic("xmmGuestRegOffset");
509   }
510}
511
512/* Lanes of vector registers are always numbered from zero being the
513   least significant lane (rightmost in the register).  */
514
/* Guest state offset of the 16-bit lane 'laneno' (0 = least
   significant) of xmm register 'xmmreg'. */
static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}

/* Ditto, for 32-bit lanes (0 .. 3). */
static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}

/* Ditto, for 64-bit lanes (0 or 1). */
static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}
538
static IRExpr* getIReg ( Int sz, UInt archreg )
{
   /* Read a 1/2/4-byte slice of integer register 'archreg' (Intel
      encoding, 0 .. 7). */
   vassert(sz == 1 || sz == 2 || sz == 4);
   vassert(archreg < 8);
   return IRExpr_Get( integerGuestRegOffset(sz,archreg),
                      szToITy(sz) );
}
546
547/* Ditto, but write to a reg instead. */
548static void putIReg ( Int sz, UInt archreg, IRExpr* e )
549{
550   IRType ty = typeOfIRExpr(irsb->tyenv, e);
551   switch (sz) {
552      case 1: vassert(ty == Ity_I8); break;
553      case 2: vassert(ty == Ity_I16); break;
554      case 4: vassert(ty == Ity_I32); break;
555      default: vpanic("putIReg(x86)");
556   }
557   vassert(archreg < 8);
558   stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
559}
560
/* Read segment register 'sreg' (Intel encoding) as a 16-bit value. */
static IRExpr* getSReg ( UInt sreg )
{
   return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
}

/* Write the 16-bit expression 'e' to segment register 'sreg'. */
static void putSReg ( UInt sreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
}

/* Read xmm register 'xmmreg' as a 128-bit vector value. */
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}
576
/* Read 64-bit lane 'laneno' of xmm register 'xmmreg' as an I64. */
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

/* Ditto, but viewed as a 64-bit float (F64). */
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

/* Read 32-bit lane 'laneno' of xmm register 'xmmreg' as an I32. */
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

/* Ditto, but viewed as a 32-bit float (F32). */
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}
596
/* Write the 128-bit vector expression 'e' to xmm register 'xmmreg'.
   Each put* below asserts that 'e' has the exact IR type of the
   destination slice. */
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

/* Write an I64 to 64-bit lane 'laneno' of 'xmmreg'. */
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* Write an F64 to 64-bit lane 'laneno' of 'xmmreg'. */
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* Write an F32 to 32-bit lane 'laneno' of 'xmmreg'. */
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

/* Write an I32 to 32-bit lane 'laneno' of 'xmmreg'. */
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

/* Write an I16 to 16-bit lane 'laneno' of 'xmmreg'. */
static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
}
632
/* Shorthand: assign expression 'e' to temporary 'dst'. */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

/* Generate a little-endian store of 'data' at address 'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}
642
/* Shorthand constructors for IR expression nodes. */

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

/* Read temporary 'tmp' as an expression. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
662
/* Make an 8-bit constant; 'i' must fit in 8 bits. */
static IRExpr* mkU8 ( UInt i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

/* Make a 16-bit constant; 'i' must fit in 16 bits. */
static IRExpr* mkU16 ( UInt i )
{
   vassert(i < 65536);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

/* Make a 32-bit constant. */
static IRExpr* mkU32 ( UInt i )
{
   return IRExpr_Const(IRConst_U32(i));
}

/* Make a 64-bit constant. */
static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}
684
685static IRExpr* mkU ( IRType ty, UInt i )
686{
687   if (ty == Ity_I8)  return mkU8(i);
688   if (ty == Ity_I16) return mkU16(i);
689   if (ty == Ity_I32) return mkU32(i);
690   /* If this panics, it usually means you passed a size (1,2,4)
691      value as the IRType, rather than a real IRType. */
692   vpanic("mkU(x86)");
693}
694
/* Make a 128-bit vector constant from a 16-bit mask. */
static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}

/* Generate a little-endian load of type 'ty' from address 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
704
static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   /* Widen the 8-bit op 'op8' to the 8/16/32-bit variant selected by
      'ty'.  The "adj + op8" trick assumes the size variants of each
      op are numbered consecutively in the IROp enumeration. */
   Int adj;
   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   /* Only ops known to follow that consecutive-numbering layout are
      accepted. */
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_ExpCmpNE8
           || op8 == Iop_Not8);
   adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   return adj + op8;
}
720
721static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
722{
723   if (szSmall == 1 && szBig == 4) {
724      return signd ? Iop_8Sto32 : Iop_8Uto32;
725   }
726   if (szSmall == 1 && szBig == 2) {
727      return signd ? Iop_8Sto16 : Iop_8Uto16;
728   }
729   if (szSmall == 2 && szBig == 4) {
730      return signd ? Iop_16Sto32 : Iop_16Uto32;
731   }
732   vpanic("mkWidenOp(x86,guest)");
733}
734
static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   /* Logical AND of two Ity_I1 values: widen both to 32 bits,
      And32 them, and narrow back down to 1 bit. */
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_32to1,
               binop(Iop_And32,
                     unop(Iop_1Uto32,x),
                     unop(Iop_1Uto32,y)));
}
744
745/* Generate a compare-and-swap operation, operating on memory at
746   'addr'.  The expected value is 'expVal' and the new value is
747   'newVal'.  If the operation fails, then transfer control (with a
748   no-redir jump (XXX no -- see comment at top of this file)) to
749   'restart_point', which is presumably the address of the guest
750   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr32 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must have the same 8/16/32-bit integer
      type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   /* Single-element CAS: compare memory at 'addr' against expTmp and,
      if equal, store newVal.  The old memory value lands in oldTmp
      either way. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If the CAS failed (old value != expected), side-exit back to
      'restart_point' to retry the guest instruction.  CasCmpNE
      rather than CmpNE is used so Memcheck does not definedness-check
      this comparison (see comment at the top of this file). */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U32( restart_point ),
            OFFB_EIP
         ));
}
773
774
775/*------------------------------------------------------------*/
776/*--- Helpers for %eflags.                                 ---*/
777/*------------------------------------------------------------*/
778
779/* -------------- Evaluating the flags-thunk. -------------- */
780
781/* Build IR to calculate all the eflags from stored
782   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
783   Ity_I32. */
static IRExpr* mk_x86g_calculate_eflags_all ( void )
{
   /* Pass the four thunk fields to the helper, in this fixed order. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I32,
           0/*regparm*/,
           "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2.  (Bit n of mcx_mask refers to
      args[n]: bit 0 = CC_OP, bit 3 = CC_NDEP.) */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
803
804/* Build IR to calculate some particular condition from stored
805   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
806   Ity_Bit. */
static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
{
   /* args[0] is the condition code; args[1..4] are the four thunk
      fields. */
   IRExpr** args
      = mkIRExprVec_5( mkU32(cond),
                       IRExpr_Get(OFFB_CC_OP,  Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I32,
           0/*regparm*/,
           "x86g_calculate_condition", &x86g_calculate_condition,
           args
        );
   /* Exclude the requested condition, OP and NDEP from definedness
      checking.  We're only interested in DEP1 and DEP2.  (Bit n of
      mcx_mask refers to args[n]: bit 0 = cond, bit 1 = CC_OP,
      bit 4 = CC_NDEP.) */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   /* The helper returns I32; narrow to the 1-bit condition result. */
   return unop(Iop_32to1, call);
}
827
828/* Build IR to calculate just the carry flag from stored
829   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
static IRExpr* mk_x86g_calculate_eflags_c ( void )
{
   /* Pass the four thunk fields to the helper, in this fixed order. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I32,
           3/*regparm*/,
           "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2.  (Bit n of mcx_mask refers to
      args[n]: bit 0 = CC_OP, bit 3 = CC_NDEP.) */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
849
850
851/* -------------- Building the flags-thunk. -------------- */
852
853/* The machinery in this section builds the flag-thunk following a
854   flag-setting operation.  Hence the various setFlags_* functions.
855*/
856
/* True iff op8 is the 8-bit add or subtract op. */
static Bool isAddSub ( IROp op8 )
{
   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
}

/* True iff op8 is one of the 8-bit bitwise-logical ops. */
static Bool isLogic ( IROp op8 )
{
   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
}
866
867/* U-widen 8/16/32 bit int expr to 32. */
868static IRExpr* widenUto32 ( IRExpr* e )
869{
870   switch (typeOfIRExpr(irsb->tyenv,e)) {
871      case Ity_I32: return e;
872      case Ity_I16: return unop(Iop_16Uto32,e);
873      case Ity_I8:  return unop(Iop_8Uto32,e);
874      default: vpanic("widenUto32");
875   }
876}
877
878/* S-widen 8/16/32 bit int expr to 32. */
879static IRExpr* widenSto32 ( IRExpr* e )
880{
881   switch (typeOfIRExpr(irsb->tyenv,e)) {
882      case Ity_I32: return e;
883      case Ity_I16: return unop(Iop_16Sto32,e);
884      case Ity_I8:  return unop(Iop_8Sto32,e);
885      default: vpanic("widenSto32");
886   }
887}
888
889/* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
890   of these combinations make sense. */
891static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
892{
893   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
894   if (src_ty == dst_ty)
895      return e;
896   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
897      return unop(Iop_32to16, e);
898   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
899      return unop(Iop_32to8, e);
900
901   vex_printf("\nsrc, dst tys are: ");
902   ppIRType(src_ty);
903   vex_printf(", ");
904   ppIRType(dst_ty);
905   vex_printf("\n");
906   vpanic("narrowTo(x86)");
907}
908
909
910/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
911   auto-sized up to the real op. */
912
static
void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
{
   /* 0/1/2 selects the byte/word/long variant of the CC_OP code;
      assumes the three size variants are numbered consecutively. */
   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);

   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);

   /* Only add and sub record both operands in the thunk. */
   switch (op8) {
      case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
      case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
      default:       ppIROp(op8);
                     vpanic("setFlags_DEP1_DEP2(x86)");
   }
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
}
933
934
935/* Set the OP and DEP1 fields only, and write zero to DEP2. */
936
937static
938void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
939{
940   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
941
942   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
943
944   switch (op8) {
945      case Iop_Or8:
946      case Iop_And8:
947      case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
948      default:       ppIROp(op8);
949                     vpanic("setFlags_DEP1(x86)");
950   }
951   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
952   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
953   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
954   /* Set NDEP even though it isn't used.  This makes redundant-PUT
955      elimination of previous stores to this field work better. */
956   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
957}
958
959
/* For shift operations, we put in the result and the undershifted
   result.  Except if the shift amount is zero, the thunk is left
   unchanged.  'guard' is an I8-typed temp holding the shift amount;
   every thunk field is written conditionally on it being nonzero. */

static void setFlags_DEP1_DEP2_shift ( IROp    op32,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* Size offset, reversed relative to setFlags_DEP1_DEP2: here it is
      2 for B, 1 for W, 0 for L, because below it is SUBTRACTED from
      the ...L (32-bit) thunk-op constant rather than added to the
      ...B one. */
   Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);

   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op32) {
      case Iop_Shr32:
      case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
      case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
      default:        ppIROp(op32);
                      vpanic("setFlags_DEP1_DEP2_shift(x86)");
   }

   /* guard :: Ity_I8.  We need to convert it to I1. */
   IRTemp guardB = newTemp(Ity_I1);
   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each field is updated only when the shift amount is nonzero;
      otherwise its previous value is rewritten (the ITE reads the
      current guest-state value back). */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU32(ccOp),
                                 IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto32(mkexpr(res)),
                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto32(mkexpr(resUS)),
                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU32(0),
                                 IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
}
1009
1010
/* For the inc/dec case, we store in DEP1 the result value and in NDEP
   the former value of the carry flag, which unfortunately we have to
   compute.  (x86 INC/DEC do not modify the carry flag, so its prior
   value must be threaded through the thunk.) */

static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;

   /* Widen the B-sized op number to the W/L variant as needed. */
   ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
}
1029
1030
1031/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1032   two arguments. */
1033
1034static
1035void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
1036{
1037   switch (ty) {
1038      case Ity_I8:
1039         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
1040         break;
1041      case Ity_I16:
1042         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
1043         break;
1044      case Ity_I32:
1045         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
1046         break;
1047      default:
1048         vpanic("setFlags_MUL(x86)");
1049   }
1050   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
1051   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
1052   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1053      elimination of previous stores to this field work better. */
1054   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
1055}
1056
1057
1058/* -------------- Condition codes. -------------- */
1059
1060/* Condition codes, using the Intel encoding.  */
1061
/* Return the customary mnemonic suffix ("o", "nz", "le", ...) for the
   given condition code, for use in disassembly printing.  Panics on
   any value outside the known set. */
static const HChar* name_X86Condcode ( X86Condcode cond )
{
   switch (cond) {
      case X86CondO:      return "o";
      case X86CondNO:     return "no";
      case X86CondB:      return "b";
      case X86CondNB:     return "nb";
      case X86CondZ:      return "z";
      case X86CondNZ:     return "nz";
      case X86CondBE:     return "be";
      case X86CondNBE:    return "nbe";
      case X86CondS:      return "s";
      case X86CondNS:     return "ns";
      case X86CondP:      return "p";
      case X86CondNP:     return "np";
      case X86CondL:      return "l";
      case X86CondNL:     return "nl";
      case X86CondLE:     return "le";
      case X86CondNLE:    return "nle";
      case X86CondAlways: return "ALWAYS";
      default: vpanic("name_X86Condcode");
   }
}
1085
1086static
1087X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
1088                                      Bool*        needInvert )
1089{
1090   vassert(cond >= X86CondO && cond <= X86CondNLE);
1091   if (cond & 1) {
1092      *needInvert = True;
1093      return cond-1;
1094   } else {
1095      *needInvert = False;
1096      return cond;
1097   }
1098}
1099
1100
1101/* -------------- Helpers for ADD/SUB with carry. -------------- */
1102
1103/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1104   appropriately.
1105
1106   Optionally, generate a store for the 'tres' value.  This can either
1107   be a normal store, or it can be a cas-with-possible-failure style
1108   store:
1109
1110   if taddr is IRTemp_INVALID, then no store is generated.
1111
1112   if taddr is not IRTemp_INVALID, then a store (using taddr as
1113   the address) is generated:
1114
1115     if texpVal is IRTemp_INVALID then a normal store is
1116     generated, and restart_point must be zero (it is irrelevant).
1117
1118     if texpVal is not IRTemp_INVALID then a cas-style store is
1119     generated.  texpVal is the expected value, restart_point
1120     is the restart point if the store fails, and texpVal must
1121     have the same type as tres.
1122*/
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I32);
   IRTemp  oldcn = newTemp(ty);    /* carry narrowed to operand size */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   vassert(sz == 1 || sz == 2 || sz == 4);
   thunkOp = sz==4 ? X86G_CC_OP_ADCL
                   : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And32,
                        mk_x86g_calculate_eflags_c(),
                        mkU32(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old_carry */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk encoding for ADC: DEP1 = first arg, DEP2 = second arg XOR
      old carry, NDEP = old carry.  NOTE(review): presumably the
      eflags helper undoes the XOR using NDEP to recover ta2 — confirm
      against the flag-calculation helpers. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
1171
1172
1173/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
1174   appropriately.  As with helper_ADC, possibly generate a store of
1175   the result -- see comments on helper_ADC for details.
1176*/
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I32);
   IRTemp  oldcn = newTemp(ty);    /* carry narrowed to operand size */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   vassert(sz == 1 || sz == 2 || sz == 4);
   thunkOp = sz==4 ? X86G_CC_OP_SBBL
                   : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And32,
                       mk_x86g_calculate_eflags_c(),
                       mkU32(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old_carry */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk encoding for SBB mirrors helper_ADC: DEP1 = first arg,
      DEP2 = second arg XOR old carry, NDEP = old carry. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
1225
1226
1227/* -------------- Helpers for disassembly printing. -------------- */
1228
1229static const HChar* nameGrp1 ( Int opc_aux )
1230{
1231   static const HChar* grp1_names[8]
1232     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
1233   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
1234   return grp1_names[opc_aux];
1235}
1236
1237static const HChar* nameGrp2 ( Int opc_aux )
1238{
1239   static const HChar* grp2_names[8]
1240     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
1241   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
1242   return grp2_names[opc_aux];
1243}
1244
1245static const HChar* nameGrp4 ( Int opc_aux )
1246{
1247   static const HChar* grp4_names[8]
1248     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
1249   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
1250   return grp4_names[opc_aux];
1251}
1252
1253static const HChar* nameGrp5 ( Int opc_aux )
1254{
1255   static const HChar* grp5_names[8]
1256     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
1257   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
1258   return grp5_names[opc_aux];
1259}
1260
1261static const HChar* nameGrp8 ( Int opc_aux )
1262{
1263   static const HChar* grp8_names[8]
1264     = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
1265   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
1266   return grp8_names[opc_aux];
1267}
1268
1269static const HChar* nameIReg ( Int size, Int reg )
1270{
1271   static const HChar* ireg32_names[8]
1272     = { "%eax", "%ecx", "%edx", "%ebx",
1273         "%esp", "%ebp", "%esi", "%edi" };
1274   static const HChar* ireg16_names[8]
1275     = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
1276   static const HChar* ireg8_names[8]
1277     = { "%al", "%cl", "%dl", "%bl",
1278         "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
1279   if (reg < 0 || reg > 7) goto bad;
1280   switch (size) {
1281      case 4: return ireg32_names[reg];
1282      case 2: return ireg16_names[reg];
1283      case 1: return ireg8_names[reg];
1284   }
1285  bad:
1286   vpanic("nameIReg(X86)");
1287   return NULL; /*notreached*/
1288}
1289
1290static const HChar* nameSReg ( UInt sreg )
1291{
1292   switch (sreg) {
1293      case R_ES: return "%es";
1294      case R_CS: return "%cs";
1295      case R_SS: return "%ss";
1296      case R_DS: return "%ds";
1297      case R_FS: return "%fs";
1298      case R_GS: return "%gs";
1299      default: vpanic("nameSReg(x86)");
1300   }
1301}
1302
1303static const HChar* nameMMXReg ( Int mmxreg )
1304{
1305   static const HChar* mmx_names[8]
1306     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
1307   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
1308   return mmx_names[mmxreg];
1309}
1310
1311static const HChar* nameXMMReg ( Int xmmreg )
1312{
1313   static const HChar* xmm_names[8]
1314     = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
1315         "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
1316   if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
1317   return xmm_names[xmmreg];
1318}
1319
1320static const HChar* nameMMXGran ( Int gran )
1321{
1322   switch (gran) {
1323      case 0: return "b";
1324      case 1: return "w";
1325      case 2: return "d";
1326      case 3: return "q";
1327      default: vpanic("nameMMXGran(x86,guest)");
1328   }
1329}
1330
1331static HChar nameISize ( Int size )
1332{
1333   switch (size) {
1334      case 4: return 'l';
1335      case 2: return 'w';
1336      case 1: return 'b';
1337      default: vpanic("nameISize(x86)");
1338   }
1339}
1340
1341
1342/*------------------------------------------------------------*/
1343/*--- JMP helpers                                          ---*/
1344/*------------------------------------------------------------*/
1345
1346static void jmp_lit( /*MOD*/DisResult* dres,
1347                     IRJumpKind kind, Addr32 d32 )
1348{
1349   vassert(dres->whatNext    == Dis_Continue);
1350   vassert(dres->len         == 0);
1351   vassert(dres->continueAt  == 0);
1352   vassert(dres->jk_StopHere == Ijk_INVALID);
1353   dres->whatNext    = Dis_StopHere;
1354   dres->jk_StopHere = kind;
1355   stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
1356}
1357
1358static void jmp_treg( /*MOD*/DisResult* dres,
1359                      IRJumpKind kind, IRTemp t )
1360{
1361   vassert(dres->whatNext    == Dis_Continue);
1362   vassert(dres->len         == 0);
1363   vassert(dres->continueAt  == 0);
1364   vassert(dres->jk_StopHere == Ijk_INVALID);
1365   dres->whatNext    = Dis_StopHere;
1366   dres->jk_StopHere = kind;
1367   stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
1368}
1369
1370static
1371void jcc_01( /*MOD*/DisResult* dres,
1372             X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
1373{
1374   Bool        invert;
1375   X86Condcode condPos;
1376   vassert(dres->whatNext    == Dis_Continue);
1377   vassert(dres->len         == 0);
1378   vassert(dres->continueAt  == 0);
1379   vassert(dres->jk_StopHere == Ijk_INVALID);
1380   dres->whatNext    = Dis_StopHere;
1381   dres->jk_StopHere = Ijk_Boring;
1382   condPos = positiveIse_X86Condcode ( cond, &invert );
1383   if (invert) {
1384      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1385                         Ijk_Boring,
1386                         IRConst_U32(d32_false),
1387                         OFFB_EIP ) );
1388      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
1389   } else {
1390      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1391                         Ijk_Boring,
1392                         IRConst_U32(d32_true),
1393                         OFFB_EIP ) );
1394      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
1395   }
1396}
1397
1398
1399/*------------------------------------------------------------*/
1400/*--- Disassembling addressing modes                       ---*/
1401/*------------------------------------------------------------*/
1402
1403static
1404const HChar* sorbTxt ( UChar sorb )
1405{
1406   switch (sorb) {
1407      case 0:    return ""; /* no override */
1408      case 0x3E: return "%ds";
1409      case 0x26: return "%es:";
1410      case 0x64: return "%fs:";
1411      case 0x65: return "%gs:";
1412      default: vpanic("sorbTxt(x86,guest)");
1413   }
1414}
1415
1416
/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   linear address by adding any required segment override as indicated
   by sorb.  If translation fails (the helper reports an error), the
   generated IR exits the block with Ijk_MapFail at the current
   instruction. */
static
IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
{
   Int    sreg;
   IRType hWordTy;
   IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;

   if (sorb == 0)
      /* the common case - no override */
      return virtual;

   /* Map the prefix byte to the segment register it selects. */
   switch (sorb) {
      case 0x3E: sreg = R_DS; break;
      case 0x26: sreg = R_ES; break;
      case 0x64: sreg = R_FS; break;
      case 0x65: sreg = R_GS; break;
      default: vpanic("handleSegOverride(x86,guest)");
   }

   /* Host word type, since LDT/GDT base pointers are host pointers
      stored in the guest state. */
   hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;

   seg_selector = newTemp(Ity_I32);
   ldt_ptr      = newTemp(hWordTy);
   gdt_ptr      = newTemp(hWordTy);
   r64          = newTemp(Ity_I64);

   assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));

   /*
   Call this to do the translation and limit checks:
   ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
                                 UInt seg_selector, UInt virtual_addr )
   */
   assign(
      r64,
      mkIRExprCCall(
         Ity_I64,
         0/*regparms*/,
         "x86g_use_seg_selector",
         &x86g_use_seg_selector,
         mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
                        mkexpr(seg_selector), virtual)
      )
   );

   /* If the high 32 of the result are non-zero, there was a
      failure in address translation.  In which case, make a
      quick exit.
   */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
         Ijk_MapFail,
         IRConst_U32( guest_EIP_curr_instr ),
         OFFB_EIP
      )
   );

   /* otherwise, here's the translated result. */
   return unop(Iop_64to32, mkexpr(r64));
}
1483
1484
1485/* Generate IR to calculate an address indicated by a ModRM and
1486   following SIB bytes.  The expression, and the number of bytes in
1487   the address mode, are returned.  Note that this fn should not be
1488   called if the R/M part of the address denotes a register instead of
1489   memory.  If print_codegen is true, text of the addressing mode is
1490   placed in buf.
1491
1492   The computed address is stored in a new tempreg, and the
1493   identity of the tempreg is returned.  */
1494
1495static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
1496{
1497   IRTemp tmp = newTemp(Ity_I32);
1498   assign( tmp, addr32 );
1499   return tmp;
1500}
1501
/* Decode the ModRM (and optional SIB/displacement) bytes starting at
   'delta', generate IR computing the effective address (with any
   segment override 'sorb' applied), and return a fresh I32 temp
   holding it.  *len receives the number of amode bytes consumed, and
   a printable rendering is placed in 'buf'. */
static
IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
{
   UChar mod_reg_rm = getIByte(delta);
   delta++;

   buf[0] = (UChar)0;

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                            /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* (%eax) .. (%edi), not including (%esp) or (%ebp).
         --> GET %reg, t
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = mod_reg_rm;
           DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleSegOverride(sorb, getIReg(4,rm)));
         }

      /* d8(%eax) ... d8(%edi), not including d8(%esp)
         --> GET %reg, t ; ADDL d8, t
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           UInt  d  = getSDisp8(delta);  /* sign-extended 8-bit disp */
           DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
           *len = 2;
           return disAMode_copy2tmp(
                  handleSegOverride(sorb,
                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
         }

      /* d32(%eax) ... d32(%edi), not including d32(%esp)
         --> GET %reg, t ; ADDL d32, t
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           UInt  d  = getUDisp32(delta);
           DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleSegOverride(sorb,
                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
         }

      /* a register, %eax .. %edi.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(x86): not an addr!");

      /* a 32-bit literal address
         --> MOV d32, tmp
      */
      case 0x05:
         { UInt d = getUDisp32(delta);
           *len = 5;
           DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
           return disAMode_copy2tmp(
                     handleSegOverride(sorb, mkU32(d)));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %esp cannot act as an index value.
               If index_r indicates %esp, zero is used for the index.
            -- when mod is zero and base indicates EBP, base is instead
               a 32-bit literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %ESP && %base == %EBP
               = d32 following SIB byte
               | %index == %ESP && %base != %EBP
               = %base
               | %index != %ESP && %base == %EBP
               = d32 following SIB byte + (%index << scale)
               | %index != %ESP && %base != %ESP
               = %base + (%index << scale)

            What happens to the souls of CPU architects who dream up such
            horrendous schemes, do you suppose?
         */
         UChar sib     = getIByte(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         delta++;

         /* base + (index << scale) */
         if (index_r != R_ESP && base_r != R_EBP) {
            DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
            *len = 2;
            return
               disAMode_copy2tmp(
               handleSegOverride(sorb,
                  binop(Iop_Add32,
                        getIReg(4,base_r),
                        binop(Iop_Shl32, getIReg(4,index_r),
                              mkU8(scale)))));
         }

         /* (index << scale) + d32, no base register */
         if (index_r != R_ESP && base_r == R_EBP) {
            UInt d = getUDisp32(delta);
            DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
                      nameIReg(4,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleSegOverride(sorb,
                  binop(Iop_Add32,
                        binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
                        mkU32(d))));
         }

         /* base only, no index */
         if (index_r == R_ESP && base_r != R_EBP) {
            DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleSegOverride(sorb, getIReg(4,base_r)));
         }

         /* d32 only, no base, no index */
         if (index_r == R_ESP && base_r == R_EBP) {
            UInt d = getUDisp32(delta);
            DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleSegOverride(sorb, mkU32(d)));
         }
         /*NOTREACHED*/
         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %esp cannot act as an index value.
            If index_r indicates %esp, zero is used for the index.
         Denoted value is:
            | %index == %ESP
            = d8 + %base
            | %index != %ESP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getIByte(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         UInt  d       = getSDisp8(delta+1);  /* sign-extended */

         if (index_r == R_ESP) {
            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
                                   (Int)d, nameIReg(4,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleSegOverride(sorb,
                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
         } else {
            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
            *len = 3;
            return
                disAMode_copy2tmp(
                handleSegOverride(sorb,
                  binop(Iop_Add32,
                        binop(Iop_Add32,
                              getIReg(4,base_r),
                              binop(Iop_Shl32,
                                    getIReg(4,index_r), mkU8(scale))),
                        mkU32(d))));
         }
         /*NOTREACHED*/
         vassert(0);
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %esp cannot act as an index value.
            If index_r indicates %esp, zero is used for the index.
         Denoted value is:
            | %index == %ESP
            = d32 + %base
            | %index != %ESP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getIByte(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         UInt d        = getUDisp32(delta+1);

         if (index_r == R_ESP) {
            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
                                   (Int)d, nameIReg(4,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleSegOverride(sorb,
                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
         } else {
            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
            *len = 6;
            return
                disAMode_copy2tmp(
                handleSegOverride(sorb,
                  binop(Iop_Add32,
                        binop(Iop_Add32,
                              getIReg(4,base_r),
                              binop(Iop_Shl32,
                                    getIReg(4,index_r), mkU8(scale))),
                        mkU32(d))));
         }
         /*NOTREACHED*/
         vassert(0);
      }

      default:
         vpanic("disAMode(x86)");
         return 0; /*notreached*/
   }
}
1732
1733
1734/* Figure out the number of (insn-stream) bytes constituting the amode
1735   beginning at delta.  Is useful for getting hold of literals beyond
1736   the end of the amode before it has been disassembled.  */
1737
1738static UInt lengthAMode ( Int delta )
1739{
1740   UChar mod_reg_rm = getIByte(delta); delta++;
1741
1742   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1743      jump table seems a bit excessive.
1744   */
1745   mod_reg_rm &= 0xC7;               /* is now XX000YYY */
1746   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1747                                     /* is now XX0XXYYY */
1748   mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
1749   switch (mod_reg_rm) {
1750
1751      /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
1752      case 0x00: case 0x01: case 0x02: case 0x03:
1753      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1754         return 1;
1755
1756      /* d8(%eax) ... d8(%edi), not including d8(%esp). */
1757      case 0x08: case 0x09: case 0x0A: case 0x0B:
1758      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1759         return 2;
1760
1761      /* d32(%eax) ... d32(%edi), not including d32(%esp). */
1762      case 0x10: case 0x11: case 0x12: case 0x13:
1763      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1764         return 5;
1765
1766      /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
1767      case 0x18: case 0x19: case 0x1A: case 0x1B:
1768      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1769         return 1;
1770
1771      /* a 32-bit literal address. */
1772      case 0x05: return 5;
1773
1774      /* SIB, no displacement.  */
1775      case 0x04: {
1776         UChar sib    = getIByte(delta);
1777         UChar base_r = toUChar(sib & 7);
1778         if (base_r == R_EBP) return 6; else return 2;
1779      }
1780      /* SIB, with 8-bit displacement.  */
1781      case 0x0C: return 3;
1782
1783      /* SIB, with 32-bit displacement.  */
1784      case 0x14: return 6;
1785
1786      default:
1787         vpanic("lengthAMode");
1788         return 0; /*notreached*/
1789   }
1790}
1791
1792/*------------------------------------------------------------*/
1793/*--- Disassembling common idioms                          ---*/
1794/*------------------------------------------------------------*/
1795
/* Handle binary integer instructions of the form
      op E, G  meaning
      op reg-or-mem, reg
   Is passed a ptr to the modRM byte, the actual operation, and the
   data size.  Returns the address advanced completely over this
   instruction.

   E(src) is reg-or-mem
   G(dst) is reg.

   If E is reg, -->    GET %G,  tmp
                       OP %E,   tmp
                       PUT tmp, %G

   If E is mem and OP is not reversible,
                -->    (getAddr E) -> tmpa
                       LD (tmpa), tmpa
                       GET %G, tmp2
                       OP tmpa, tmp2
                       PUT tmp2, %G

   If E is mem and OP is reversible
                -->    (getAddr E) -> tmpa
                       LD (tmpa), tmpa
                       OP %G, tmpa
                       PUT tmpa, %G
*/
static
UInt dis_op2_E_G ( UChar       sorb,         /* segment-override prefix byte */
                   Bool        addSubCarry,  /* True: op is ADC or SBB */
                   IROp        op8,          /* 8-bit form of the operation */
                   Bool        keep,         /* False: discard result (CMP) */
                   Int         size,         /* operand size: 1, 2 or 4 */
                   Int         delta0,       /* code offset of the modRM byte */
                   const HChar* t_x86opc )   /* opcode name, for tracing only */
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);        /* the result */
   IRTemp  src  = newTemp(ty);        /* the E (reg-or-mem) operand */
   IRTemp  dst0 = newTemp(ty);        /* the old value of G */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && gregOfRM(rm) == eregOfRM(rm)) {
         /* Zero the register first; the dst0/src GETs below then read
            the freshly-written zero, breaking the dependency. */
         putIReg(size, gregOfRM(rm), mkU(ty,0));
      }
      assign( dst0, getIReg(size,gregOfRM(rm)) );
      assign( src,  getIReg(size,eregOfRM(rm)) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes the result and sets the flags thunk. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIReg(size, gregOfRM(rm), mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIReg(size, gregOfRM(rm), mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIReg(size, gregOfRM(rm), mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
                          nameIReg(size,eregOfRM(rm)),
                          nameIReg(size,gregOfRM(rm)));
      return 1+delta0;
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, sorb, delta0, dis_buf);
      assign( dst0, getIReg(size,gregOfRM(rm)) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIReg(size, gregOfRM(rm), mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIReg(size, gregOfRM(rm), mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIReg(size, gregOfRM(rm), mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
                          dis_buf,nameIReg(size,gregOfRM(rm)));
      return len+delta0;
   }
}
1913
1914
1915
/* Handle binary integer instructions of the form
      op G, E  meaning
      op reg, reg-or-mem
   Is passed a ptr to the modRM byte, the actual operation, and the
   data size.  Returns the address advanced completely over this
   instruction.

   G(src) is reg.
   E(dst) is reg-or-mem

   If E is reg, -->    GET %E,  tmp
                       OP %G,   tmp
                       PUT tmp, %E

   If E is mem, -->    (getAddr E) -> tmpa
                       LD (tmpa), tmpv
                       OP %G, tmpv
                       ST tmpv, (tmpa)
*/
static
UInt dis_op2_G_E ( UChar       sorb,         /* segment-override prefix byte */
                   Bool        locked,       /* True: insn has a LOCK prefix */
                   Bool        addSubCarry,  /* True: op is ADC or SBB */
                   IROp        op8,          /* 8-bit form of the operation */
                   Bool        keep,         /* False: discard result (CMP) */
                   Int         size,         /* operand size: 1, 2 or 4 */
                   Int         delta0,       /* code offset of the modRM byte */
                   const HChar* t_x86opc )   /* opcode name, for tracing only */
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);        /* the result */
   IRTemp  src  = newTemp(ty);        /* the G (register) operand */
   IRTemp  dst0 = newTemp(ty);        /* the old value of E */
   UChar   rm   = getIByte(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg.*/
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && gregOfRM(rm) == eregOfRM(rm)) {
         /* Zero the register first; the GETs below then read the
            freshly-written zero, breaking the dependency. */
         putIReg(size, eregOfRM(rm), mkU(ty,0));
      }
      assign(dst0, getIReg(size,eregOfRM(rm)));
      assign(src,  getIReg(size,gregOfRM(rm)));

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes the result and sets the flags thunk. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIReg(size, eregOfRM(rm), mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIReg(size, eregOfRM(rm), mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIReg(size, eregOfRM(rm), mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
                          nameIReg(size,gregOfRM(rm)),
                          nameIReg(size,eregOfRM(rm)));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, sorb, delta0, dis_buf);
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIReg(size,gregOfRM(rm)));

      if (addSubCarry && op8 == Iop_Add8) {
         if (locked) {
            /* cas-style store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (locked) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (locked) {
               if (0) vex_printf("locked case\n" );
               /* LOCKed: store via compare-and-swap so a concurrent
                  modification of the location causes a restart. */
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
                          nameIReg(size,gregOfRM(rm)), dis_buf);
      return len+delta0;
   }
}
2049
2050
2051/* Handle move instructions of the form
2052      mov E, G  meaning
2053      mov reg-or-mem, reg
2054   Is passed the a ptr to the modRM byte, and the data size.  Returns
2055   the address advanced completely over this instruction.
2056
2057   E(src) is reg-or-mem
2058   G(dst) is reg.
2059
2060   If E is reg, -->    GET %E,  tmpv
2061                       PUT tmpv, %G
2062
2063   If E is mem  -->    (getAddr E) -> tmpa
2064                       LD (tmpa), tmpb
2065                       PUT tmpb, %G
2066*/
2067static
2068UInt dis_mov_E_G ( UChar       sorb,
2069                   Int         size,
2070                   Int         delta0 )
2071{
2072   Int len;
2073   UChar rm = getIByte(delta0);
2074   HChar dis_buf[50];
2075
2076   if (epartIsReg(rm)) {
2077      putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
2078      DIP("mov%c %s,%s\n", nameISize(size),
2079                           nameIReg(size,eregOfRM(rm)),
2080                           nameIReg(size,gregOfRM(rm)));
2081      return 1+delta0;
2082   }
2083
2084   /* E refers to memory */
2085   {
2086      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
2087      putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
2088      DIP("mov%c %s,%s\n", nameISize(size),
2089                           dis_buf,nameIReg(size,gregOfRM(rm)));
2090      return delta0+len;
2091   }
2092}
2093
2094
2095/* Handle move instructions of the form
2096      mov G, E  meaning
2097      mov reg, reg-or-mem
2098   Is passed the a ptr to the modRM byte, and the data size.  Returns
2099   the address advanced completely over this instruction.
2100
2101   G(src) is reg.
2102   E(dst) is reg-or-mem
2103
2104   If E is reg, -->    GET %G,  tmp
2105                       PUT tmp, %E
2106
2107   If E is mem, -->    (getAddr E) -> tmpa
2108                       GET %G, tmpv
2109                       ST tmpv, (tmpa)
2110*/
2111static
2112UInt dis_mov_G_E ( UChar       sorb,
2113                   Int         size,
2114                   Int         delta0 )
2115{
2116   Int len;
2117   UChar rm = getIByte(delta0);
2118   HChar dis_buf[50];
2119
2120   if (epartIsReg(rm)) {
2121      putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
2122      DIP("mov%c %s,%s\n", nameISize(size),
2123                           nameIReg(size,gregOfRM(rm)),
2124                           nameIReg(size,eregOfRM(rm)));
2125      return 1+delta0;
2126   }
2127
2128   /* E refers to memory */
2129   {
2130      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
2131      storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
2132      DIP("mov%c %s,%s\n", nameISize(size),
2133                           nameIReg(size,gregOfRM(rm)), dis_buf);
2134      return len+delta0;
2135   }
2136}
2137
2138
2139/* op $immediate, AL/AX/EAX. */
2140static
2141UInt dis_op_imm_A ( Int    size,
2142                    Bool   carrying,
2143                    IROp   op8,
2144                    Bool   keep,
2145                    Int    delta,
2146                    const HChar* t_x86opc )
2147{
2148   IRType ty   = szToITy(size);
2149   IRTemp dst0 = newTemp(ty);
2150   IRTemp src  = newTemp(ty);
2151   IRTemp dst1 = newTemp(ty);
2152   UInt lit    = getUDisp(size,delta);
2153   assign(dst0, getIReg(size,R_EAX));
2154   assign(src,  mkU(ty,lit));
2155
2156   if (isAddSub(op8) && !carrying) {
2157      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2158      setFlags_DEP1_DEP2(op8, dst0, src, ty);
2159   }
2160   else
2161   if (isLogic(op8)) {
2162      vassert(!carrying);
2163      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2164      setFlags_DEP1(op8, dst1, ty);
2165   }
2166   else
2167   if (op8 == Iop_Add8 && carrying) {
2168      helper_ADC( size, dst1, dst0, src,
2169                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2170   }
2171   else
2172   if (op8 == Iop_Sub8 && carrying) {
2173      helper_SBB( size, dst1, dst0, src,
2174                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2175   }
2176   else
2177      vpanic("dis_op_imm_A(x86,guest)");
2178
2179   if (keep)
2180      putIReg(size, R_EAX, mkexpr(dst1));
2181
2182   DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
2183                           lit, nameIReg(size,R_EAX));
2184   return delta+size;
2185}
2186
2187
2188/* Sign- and Zero-extending moves. */
2189static
2190UInt dis_movx_E_G ( UChar      sorb,
2191                    Int delta, Int szs, Int szd, Bool sign_extend )
2192{
2193   UChar rm = getIByte(delta);
2194   if (epartIsReg(rm)) {
2195      if (szd == szs) {
2196         // mutant case.  See #250799
2197         putIReg(szd, gregOfRM(rm),
2198                           getIReg(szs,eregOfRM(rm)));
2199      } else {
2200         // normal case
2201         putIReg(szd, gregOfRM(rm),
2202                      unop(mkWidenOp(szs,szd,sign_extend),
2203                           getIReg(szs,eregOfRM(rm))));
2204      }
2205      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2206                               nameISize(szs), nameISize(szd),
2207                               nameIReg(szs,eregOfRM(rm)),
2208                               nameIReg(szd,gregOfRM(rm)));
2209      return 1+delta;
2210   }
2211
2212   /* E refers to memory */
2213   {
2214      Int    len;
2215      HChar  dis_buf[50];
2216      IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
2217      if (szd == szs) {
2218         // mutant case.  See #250799
2219         putIReg(szd, gregOfRM(rm),
2220                           loadLE(szToITy(szs),mkexpr(addr)));
2221      } else {
2222         // normal case
2223         putIReg(szd, gregOfRM(rm),
2224                      unop(mkWidenOp(szs,szd,sign_extend),
2225                           loadLE(szToITy(szs),mkexpr(addr))));
2226      }
2227      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2228                               nameISize(szs), nameISize(szd),
2229                               dis_buf, nameIReg(szd,gregOfRM(rm)));
2230      return len+delta;
2231   }
2232}
2233
2234
/* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
   16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* The DivMod ops yield a 64-bit value: quotient in the low half,
      remainder in the high half. */
   IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
   IRTemp src64 = newTemp(Ity_I64);   /* the dividend, as 64 bits */
   IRTemp dst64 = newTemp(Ity_I64);   /* remainder:quotient pair */
   switch (sz) {
      case 4:
         /* EDX:EAX / t(32); EAX := quotient, EDX := remainder. */
         assign( src64, binop(Iop_32HLto64,
                              getIReg(4,R_EDX), getIReg(4,R_EAX)) );
         assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
         putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
         putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* DX:AX / t(16); widen the 32-bit dividend to 64 so the same
            64/32 DivMod op can be used.  AX := quotient, DX := rem. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIReg(2,R_EDX), getIReg(2,R_EAX))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* AX / t(8); AL := quotient, AH := remainder.  Both operands
            are widened up to the 64/32 DivMod's operand sizes. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
                           unop(Iop_64to32,mkexpr(dst64)))) );
         putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
                           unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default: vpanic("codegen_div(x86)");
   }
}
2279
2280
/* Group 1 extended opcodes: op $imm, E, where the op is selected by
   the reg field of the modRM byte (0=ADD 1=OR 2=ADC 3=SBB 4=AND
   5=SUB 6=XOR 7=CMP).  Returns the updated delta. */
static
UInt dis_Grp1 ( UChar sorb, Bool locked,
                Int delta, UChar modrm,
                Int am_sz, Int d_sz, Int sz, UInt d32 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);   /* the result */
   IRTemp  src  = newTemp(ty);   /* the (masked) immediate */
   IRTemp  dst0 = newTemp(ty);   /* the old value of E */
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);

   /* Map the reg field to an operation; ADC/SBB stay special-cased
      below since they need the carry-aware helpers.  Note 7 (CMP)
      uses Sub8 but the result is discarded further down. */
   switch (gregOfRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1: unhandled case");
   }

   if (epartIsReg(modrm)) {
      /* E is a register. */
      vassert(am_sz == 1);

      assign(dst0, getIReg(sz,eregOfRM(modrm)));
      assign(src,  mkU(ty,d32 & mask));

      if (gregOfRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregOfRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* All ops except CMP (7) write the result back. */
      if (gregOfRM(modrm) < 7)
         putIReg(sz, eregOfRM(modrm), mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
                              nameIReg(sz,eregOfRM(modrm)));
   } else {
      /* E is in memory. */
      addr = disAMode ( &len, sorb, delta, dis_buf);

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d32 & mask));

      if (gregOfRM(modrm) == 2 /* ADC */) {
         if (locked) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregOfRM(modrm) == 3 /* SBB */) {
         if (locked) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* Store back for all ops except CMP (7); LOCKed stores go
            via compare-and-swap so a concurrent change restarts. */
         if (gregOfRM(modrm) < 7) {
            if (locked) {
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_EIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
                              d32, dis_buf);
   }
   return delta;
}
2383
2384
/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   expression.  The reg field of the modRM byte selects the operation:
   0=ROL 1=ROR 2=RCL 3=RCR 4=SHL 5=SHR 6=SHL(alias) 7=SAR. */

static
UInt dis_Grp2 ( UChar sorb,
                Int delta, UChar modrm,
                Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                const HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);   /* the value to be shifted/rotated */
   IRTemp dst1  = newTemp(ty);   /* the result */
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIReg(sz, eregOfRM(modrm)));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, sorb, delta, dis_buf);
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the operation.  No breaks needed in these switches:
      all listed cases share the single statement. */
   isShift = False;
   switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(x86)");
   }

   if (isRotateC) {
      /* call a helper; these insns are so ridiculous they do not
         deserve better */
      Bool     left = toBool(gregOfRM(modrm) == 2);
      IRTemp   r64  = newTemp(Ity_I64);
      IRExpr** args
         = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
                          widenUto32(shift_expr),   /* rotate amount */
                          widenUto32(mk_x86g_calculate_eflags_all()),
                          mkU32(sz) );
      assign( r64, mkIRExprCCall(
                      Ity_I64,
                      0/*regparm*/,
                      left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
                      left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
                      args
                   )
            );
      /* new eflags in hi half r64; new value in lo half r64 */
      assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
      /* Set NDEP even though it isn't used.  This makes redundant-PUT
         elimination of previous stores to this field work better. */
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   }

   if (isShift) {

      IRTemp pre32     = newTemp(Ity_I32);  /* value widened to 32 bits */
      IRTemp res32     = newTemp(Ity_I32);  /* fully-shifted result */
      IRTemp res32ss   = newTemp(Ity_I32);  /* shifted one place less,
                                               for the flags thunk */
      IRTemp shift_amt = newTemp(Ity_I8);
      IROp   op32;

      switch (gregOfRM(modrm)) {
         case 4: op32 = Iop_Shl32; break;
         case 5: op32 = Iop_Shr32; break;
         case 6: op32 = Iop_Shl32; break;  /* undocumented SHL alias */
         case 7: op32 = Iop_Sar32; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 32 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the Intel semantics requires that 8/16-bit
         shifts give defined results for shift values all the way up
         to 31, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 32
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 31, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1. */

      /* shift_amt = shift_expr & 31, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );

      /* suitably widen the value to be shifted to 32 bits. */
      assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
                                     : widenUto32(mkexpr(dst0)) );

      /* res32 = pre32 `shift` shift_amt */
      assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );

      /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
      assign( res32ss,
              binop(op32,
                    mkexpr(pre32),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(31))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res32)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp selects the size variant of the ROL/ROR thunk op. */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
      Bool   left      = toBool(gregOfRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt32 = newTemp(Ity_I8);  /* note: I8 despite the name */
      IRTemp oldFlags  = newTemp(Ity_I32);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));

      if (ty == Ity_I32)
         assign(rot_amt, mkexpr(rot_amt32));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += X86G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += X86G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_x86g_calculate_eflags_all());

      /* rot_amt32 :: Ity_I8.  We need to convert it to I1. */
      IRTemp rot_amt32b = newTemp(Ity_I1);
      assign(rot_amt32b, binop(Iop_CmpNE8, mkexpr(rot_amt32), mkU8(0)) );

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before.
         Each field is conditionally updated: if the rotate count is
         zero, the existing thunk value is kept unchanged. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    mkU32(ccOp),
                                    IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    widenUto32(mkexpr(dst1)),
                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    mkU32(0),
                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    mkexpr(oldFlags),
                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
2623
2624
2625/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
2626static
2627UInt dis_Grp8_Imm ( UChar sorb,
2628                    Bool locked,
2629                    Int delta, UChar modrm,
2630                    Int am_sz, Int sz, UInt src_val,
2631                    Bool* decode_OK )
2632{
2633   /* src_val denotes a d8.
2634      And delta on entry points at the modrm byte. */
2635
2636   IRType ty     = szToITy(sz);
2637   IRTemp t2     = newTemp(Ity_I32);
2638   IRTemp t2m    = newTemp(Ity_I32);
2639   IRTemp t_addr = IRTemp_INVALID;
2640   HChar  dis_buf[50];
2641   UInt   mask;
2642
2643   /* we're optimists :-) */
2644   *decode_OK = True;
2645
2646   /* Limit src_val -- the bit offset -- to something within a word.
2647      The Intel docs say that literal offsets larger than a word are
2648      masked in this way. */
2649   switch (sz) {
2650      case 2:  src_val &= 15; break;
2651      case 4:  src_val &= 31; break;
2652      default: *decode_OK = False; return delta;
2653   }
2654
2655   /* Invent a mask suitable for the operation. */
2656   switch (gregOfRM(modrm)) {
2657      case 4: /* BT */  mask = 0;               break;
2658      case 5: /* BTS */ mask = 1 << src_val;    break;
2659      case 6: /* BTR */ mask = ~(1 << src_val); break;
2660      case 7: /* BTC */ mask = 1 << src_val;    break;
2661         /* If this needs to be extended, probably simplest to make a
2662            new function to handle the other cases (0 .. 3).  The
2663            Intel docs do however not indicate any use for 0 .. 3, so
2664            we don't expect this to happen. */
2665      default: *decode_OK = False; return delta;
2666   }
2667
2668   /* Fetch the value to be tested and modified into t2, which is
2669      32-bits wide regardless of sz. */
2670   if (epartIsReg(modrm)) {
2671      vassert(am_sz == 1);
2672      assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
2673      delta += (am_sz + 1);
2674      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2675                              src_val, nameIReg(sz,eregOfRM(modrm)));
2676   } else {
2677      Int len;
2678      t_addr = disAMode ( &len, sorb, delta, dis_buf);
2679      delta  += (len+1);
2680      assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
2681      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2682                              src_val, dis_buf);
2683   }
2684
2685   /* Compute the new value into t2m, if non-BT. */
2686   switch (gregOfRM(modrm)) {
2687      case 4: /* BT */
2688         break;
2689      case 5: /* BTS */
2690         assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
2691         break;
2692      case 6: /* BTR */
2693         assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
2694         break;
2695      case 7: /* BTC */
2696         assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
2697         break;
2698      default:
2699         /*NOTREACHED*/ /*the previous switch guards this*/
2700         vassert(0);
2701   }
2702
2703   /* Write the result back, if non-BT.  If the CAS fails then we
2704      side-exit from the trace at this point, and so the flag state is
2705      not affected.  This is of course as required. */
2706   if (gregOfRM(modrm) != 4 /* BT */) {
2707      if (epartIsReg(modrm)) {
2708         putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
2709      } else {
2710         if (locked) {
2711            casLE( mkexpr(t_addr),
2712                   narrowTo(ty, mkexpr(t2))/*expd*/,
2713                   narrowTo(ty, mkexpr(t2m))/*new*/,
2714                   guest_EIP_curr_instr );
2715         } else {
2716            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
2717         }
2718      }
2719   }
2720
2721   /* Copy relevant bit from t2 into the carry flag. */
2722   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
2723   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2724   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2725   stmt( IRStmt_Put(
2726            OFFB_CC_DEP1,
2727            binop(Iop_And32,
2728                  binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
2729                  mkU32(1))
2730       ));
2731   /* Set NDEP even though it isn't used.  This makes redundant-PUT
2732      elimination of previous stores to this field work better. */
2733   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2734
2735   return delta;
2736}
2737
2738
2739/* Signed/unsigned widening multiply.  Generate IR to multiply the
2740   value in EAX/AX/AL by the given IRTemp, and park the result in
2741   EDX:EAX/DX:AX/AX.
2742*/
static void codegen_mulL_A_D ( Int sz, Bool syned,
                               IRTemp tmp, const HChar* tmp_txt )
{
   IRType ty = szToITy(sz);
   IRTemp t1 = newTemp(ty);

   /* t1 = the implicit multiplicand: EAX, AX or AL. */
   assign( t1, getIReg(sz, R_EAX) );

   switch (ty) {
      case Ity_I32: {
         /* 32x32 -> 64: high half to EDX, low half to EAX. */
         IRTemp res64   = newTemp(Ity_I64);
         IRTemp resHi   = newTemp(Ity_I32);
         IRTemp resLo   = newTemp(Ity_I32);
         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
         /* NOTE(review): the B(yte)-suffixed base op is passed for all
            operand widths -- presumably setFlags_MUL adjusts it using
            the 'ty' argument; confirm against its definition. */
         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
         putIReg(4, R_EDX, mkexpr(resHi));
         putIReg(4, R_EAX, mkexpr(resLo));
         break;
      }
      case Ity_I16: {
         /* 16x16 -> 32: high half to DX, low half to AX. */
         IRTemp res32   = newTemp(Ity_I32);
         IRTemp resHi   = newTemp(Ity_I16);
         IRTemp resLo   = newTemp(Ity_I16);
         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
         putIReg(2, R_EDX, mkexpr(resHi));
         putIReg(2, R_EAX, mkexpr(resLo));
         break;
      }
      case Ity_I8: {
         /* 8x8 -> 16: the whole 16-bit product is written to AX, so
            resHi/resLo are assigned below but never actually used in
            this case. */
         IRTemp res16   = newTemp(Ity_I16);
         IRTemp resHi   = newTemp(Ity_I8);
         IRTemp resLo   = newTemp(Ity_I8);
         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
         putIReg(2, R_EAX, mkexpr(res16));
         break;
      }
      default:
         vpanic("codegen_mulL_A_D(x86)");
   }
   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
}
2798
2799
2800/* Group 3 extended opcodes. */
static
UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
{
   /* Opcodes F6/F7: TEST/NOT/NEG/MUL/IMUL/DIV/IDIV, selected by the
      reg field of the modRM byte.  On entry delta points at the
      modRM byte; returns the updated delta.  Sets *decode_OK to
      False for forms we refuse to decode. */
   UInt    d32;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;

   *decode_OK = True; /* may change this later */

   modrm = getIByte(delta);

   if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
      /* LOCK prefix only allowed with not and neg subopcodes */
      *decode_OK = False;
      return delta;
   }

   if (epartIsReg(modrm)) {
      switch (gregOfRM(modrm)) {
         case 0: { /* TEST */
            /* AND the immediate into a temp, for the flags only; no
               register is written. */
            delta++; d32 = getUDisp(sz, delta); delta += sz;
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIReg(sz,eregOfRM(modrm)),
                               mkU(ty,d32)));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
                                      nameIReg(sz, eregOfRM(modrm)));
            break;
         }
         case 1: /* UNDEFINED */
           /* The Intel docs imply this insn is undefined and binutils
              agrees.  Unfortunately Core 2 will run it (with who
              knows what result?)  sandpile.org reckons it's an alias
              for case 0.  We play safe. */
           *decode_OK = False;
           break;
         case 2: /* NOT */
            /* NOT does not affect the flags. */
            delta++;
            putIReg(sz, eregOfRM(modrm),
                        unop(mkSizedOp(ty,Iop_Not8),
                             getIReg(sz, eregOfRM(modrm))));
            DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
            break;
         case 3: /* NEG */
            /* NEG is 0 - src, with flags set as for SUB. */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIReg(sz,eregOfRM(modrm)));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIReg(sz,eregOfRM(modrm)));
            codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIReg(sz,eregOfRM(modrm)));
            codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIReg(sz, eregOfRM(modrm)) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIReg(sz, eregOfRM(modrm)) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
            break;
         default:
            /* This can't happen - gregOfRM should return 0 .. 7 only */
            vpanic("Grp3(x86)");
      }
   } else {
      /* Memory-operand case: fetch the operand once into t1 and let
         all subopcodes share it. */
      addr = disAMode ( &len, sorb, delta, dis_buf );
      t1   = newTemp(ty);   /* replaces the t1 made above; harmless */
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregOfRM(modrm)) {
         case 0: { /* TEST */
            d32 = getUDisp(sz, delta); delta += sz;
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1), mkU(ty,d32)));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
            break;
         }
         case 1: /* UNDEFINED */
           /* See comment above on R case */
           *decode_OK = False;
           break;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            /* With LOCK, write back via compare-and-swap; a failed
               CAS side-exits and the insn is retried. */
            if (locked) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_EIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
                               mkexpr(dst0), mkexpr(src)));
            if (locked) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_EIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /* This can't happen - gregOfRM should return 0 .. 7 only */
            vpanic("Grp3(x86)");
      }
   }
   return delta;
}
2958
2959
2960/* Group 4 extended opcodes. */
static
UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
{
   /* Opcode FE: 8-bit INC/DEC, selected by the reg field of the
      modRM byte.  On entry delta points at the modRM byte; returns
      the updated delta.  Sets *decode_OK to False for invalid
      forms. */
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);   /* the original value */
   IRTemp t2 = newTemp(ty);   /* the inc'd/dec'd value */

   *decode_OK = True;

   modrm = getIByte(delta);

   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
      /* LOCK prefix only allowed with inc and dec subopcodes */
      *decode_OK = False;
      return delta;
   }

   if (epartIsReg(modrm)) {
      assign(t1, getIReg(1, eregOfRM(modrm)));
      switch (gregOfRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIReg(1, eregOfRM(modrm), mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIReg(1, eregOfRM(modrm), mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
                      nameIReg(1, eregOfRM(modrm)));
   } else {
      IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregOfRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            /* With LOCK, write back via compare-and-swap; a failed
               CAS side-exits and the insn is retried. */
            if (locked) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_EIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (locked) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_EIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
   }
   return delta;
}
3034
3035
3036/* Group 5 extended opcodes. */
static
UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
                /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   /* Opcode FF: INC/DEC/CALL/JMP/PUSH, selected by the reg field of
      the modRM byte.  On entry delta points at the modRM byte;
      returns the updated delta.  The far-CALL/far-JMP forms (reg
      fields 3 and 5) are not handled here and fall into the default
      cases, which set *decode_OK to False. */
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp  t2 = IRTemp_INVALID;

   *decode_OK = True;

   modrm = getIByte(delta);

   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
      /* LOCK prefix only allowed with inc and dec subopcodes */
      *decode_OK = False;
      return delta;
   }

   if (epartIsReg(modrm)) {
      assign(t1, getIReg(sz,eregOfRM(modrm)));
      switch (gregOfRM(modrm)) {
         case 0: /* INC */
            vassert(sz == 2 || sz == 4);
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
            break;
         case 1: /* DEC */
            vassert(sz == 2 || sz == 4);
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
            break;
         case 2: /* call Ev */
            vassert(sz == 4);
            /* Push the return address -- the next insn, hence the +1
               for the modRM byte -- then jump to the target. */
            t2 = newTemp(Ity_I32);
            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
            putIReg(4, R_ESP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
            jmp_treg(dres, Ijk_Call, t1);
            vassert(dres->whatNext == Dis_StopHere);
            break;
         case 4: /* jmp Ev */
            vassert(sz == 4);
            jmp_treg(dres, Ijk_Boring, t1);
            vassert(dres->whatNext == Dis_StopHere);
            break;
         case 6: /* PUSH Ev */
            vassert(sz == 4 || sz == 2);
            t2 = newTemp(Ity_I32);
            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
            putIReg(4, R_ESP, mkexpr(t2) );
            storeLE( mkexpr(t2), mkexpr(t1) );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
                       nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   } else {
      addr = disAMode ( &len, sorb, delta, dis_buf );
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregOfRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            /* With LOCK, write back via compare-and-swap; a failed
               CAS side-exits and the insn is retried. */
            if (locked) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (locked) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            vassert(sz == 4);
            /* Return address lies past modRM/SIB/displacement, hence
               the +len. */
            t2 = newTemp(Ity_I32);
            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
            putIReg(4, R_ESP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
            jmp_treg(dres, Ijk_Call, t1);
            vassert(dres->whatNext == Dis_StopHere);
            break;
         case 4: /* JMP Ev */
            vassert(sz == 4);
            jmp_treg(dres, Ijk_Boring, t1);
            vassert(dres->whatNext == Dis_StopHere);
            break;
         case 6: /* PUSH Ev */
            vassert(sz == 4 || sz == 2);
            t2 = newTemp(Ity_I32);
            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
            putIReg(4, R_ESP, mkexpr(t2) );
            storeLE( mkexpr(t2), mkexpr(t1) );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
                       nameISize(sz), dis_buf);
   }
   return delta;
}
3165
3166
3167/*------------------------------------------------------------*/
3168/*--- Disassembling string ops (including REP prefixes)    ---*/
3169/*------------------------------------------------------------*/
3170
3171/* Code shared by all the string ops */
3172static
3173void dis_string_op_increment(Int sz, Int t_inc)
3174{
3175   if (sz == 4 || sz == 2) {
3176      assign( t_inc,
3177              binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
3178                               mkU8(sz/2) ) );
3179   } else {
3180      assign( t_inc,
3181              IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
3182   }
3183}
3184
3185static
3186void dis_string_op( void (*dis_OP)( Int, IRTemp ),
3187                    Int sz, const HChar* name, UChar sorb )
3188{
3189   IRTemp t_inc = newTemp(Ity_I32);
3190   vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
3191   dis_string_op_increment(sz, t_inc);
3192   dis_OP( sz, t_inc );
3193   DIP("%s%c\n", name, nameISize(sz));
3194}
3195
3196static
3197void dis_MOVS ( Int sz, IRTemp t_inc )
3198{
3199   IRType ty = szToITy(sz);
3200   IRTemp td = newTemp(Ity_I32);   /* EDI */
3201   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3202
3203   assign( td, getIReg(4, R_EDI) );
3204   assign( ts, getIReg(4, R_ESI) );
3205
3206   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
3207
3208   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3209   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3210}
3211
3212static
3213void dis_LODS ( Int sz, IRTemp t_inc )
3214{
3215   IRType ty = szToITy(sz);
3216   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3217
3218   assign( ts, getIReg(4, R_ESI) );
3219
3220   putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
3221
3222   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3223}
3224
3225static
3226void dis_STOS ( Int sz, IRTemp t_inc )
3227{
3228   IRType ty = szToITy(sz);
3229   IRTemp ta = newTemp(ty);        /* EAX */
3230   IRTemp td = newTemp(Ity_I32);   /* EDI */
3231
3232   assign( ta, getIReg(sz, R_EAX) );
3233   assign( td, getIReg(4, R_EDI) );
3234
3235   storeLE( mkexpr(td), mkexpr(ta) );
3236
3237   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3238}
3239
3240static
3241void dis_CMPS ( Int sz, IRTemp t_inc )
3242{
3243   IRType ty  = szToITy(sz);
3244   IRTemp tdv = newTemp(ty);      /* (EDI) */
3245   IRTemp tsv = newTemp(ty);      /* (ESI) */
3246   IRTemp td  = newTemp(Ity_I32); /*  EDI  */
3247   IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
3248
3249   assign( td, getIReg(4, R_EDI) );
3250   assign( ts, getIReg(4, R_ESI) );
3251
3252   assign( tdv, loadLE(ty,mkexpr(td)) );
3253   assign( tsv, loadLE(ty,mkexpr(ts)) );
3254
3255   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
3256
3257   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3258   putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3259}
3260
3261static
3262void dis_SCAS ( Int sz, IRTemp t_inc )
3263{
3264   IRType ty  = szToITy(sz);
3265   IRTemp ta  = newTemp(ty);       /*  EAX  */
3266   IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
3267   IRTemp tdv = newTemp(ty);       /* (EDI) */
3268
3269   assign( ta, getIReg(sz, R_EAX) );
3270   assign( td, getIReg(4, R_EDI) );
3271
3272   assign( tdv, loadLE(ty,mkexpr(td)) );
3273   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
3274
3275   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3276}
3277
3278
3279/* Wrap the appropriate string op inside a REP/REPE/REPNE.
3280   We assume the insn is the last one in the basic block, and so emit a jump
3281   to the next insn, rather than just falling through. */
static
void dis_REP_op ( /*MOD*/DisResult* dres,
                  X86Condcode cond,
                  void (*dis_OP)(Int, IRTemp),
                  Int sz, Addr32 eip, Addr32 eip_next, const HChar* name )
{
   /* Emit one iteration of a REP/REPE/REPNE string op, with exits so
      the loop is re-entered (at eip) or left (to eip_next) as
      appropriate. */
   IRTemp t_inc = newTemp(Ity_I32);
   IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */

   assign( tc, getIReg(4,R_ECX) );

   /* If ECX == 0, do nothing at all: exit straight to the next
      insn. */
   stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
                      Ijk_Boring,
                      IRConst_U32(eip_next), OFFB_EIP ) );

   /* ECX-- */
   putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );

   /* One iteration of the underlying string op. */
   dis_string_op_increment(sz, t_inc);
   dis_OP (sz, t_inc);

   if (cond == X86CondAlways) {
      /* Plain REP: unconditionally jump back to this insn, so each
         iteration re-runs the ECX test above. */
      jmp_lit(dres, Ijk_Boring, eip);
      vassert(dres->whatNext == Dis_StopHere);
   } else {
      /* REPE/REPNE: loop back to this insn while 'cond' holds,
         otherwise fall through to the next insn. */
      stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
                         Ijk_Boring,
                         IRConst_U32(eip), OFFB_EIP ) );
      jmp_lit(dres, Ijk_Boring, eip_next);
      vassert(dres->whatNext == Dis_StopHere);
   }
   DIP("%s%c\n", name, nameISize(sz));
}
3314
3315
3316/*------------------------------------------------------------*/
3317/*--- Arithmetic, etc.                                     ---*/
3318/*------------------------------------------------------------*/
3319
3320/* IMUL E, G.  Supplied eip points to the modR/M byte. */
static
UInt dis_mul_E_G ( UChar       sorb,
                   Int         size,
                   Int         delta0 )
{
   /* Two-operand (non-widening) IMUL: G = G * E, keeping only the low
      'size' bytes of the product.  Returns the delta past the decoded
      operands. */
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getIByte(delta0);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);      /* E operand (reg or mem) */
   IRTemp tg = newTemp(ty);      /* G operand (always a reg) */
   IRTemp resLo = newTemp(ty);   /* low 'size' bytes of the product */

   assign( tg, getIReg(size, gregOfRM(rm)) );
   if (epartIsReg(rm)) {
      assign( te, getIReg(size, eregOfRM(rm)) );
   } else {
      IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
      assign( te, loadLE(ty,mkexpr(addr)) );
   }

   /* NOTE(review): the B(yte)-suffixed base op is passed regardless
      of size -- presumably setFlags_MUL adjusts it using 'ty';
      confirm against its definition. */
   setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );

   putIReg(size, gregOfRM(rm), mkexpr(resLo) );

   if (epartIsReg(rm)) {
      DIP("imul%c %s, %s\n", nameISize(size),
                             nameIReg(size,eregOfRM(rm)),
                             nameIReg(size,gregOfRM(rm)));
      return 1+delta0;
   } else {
      DIP("imul%c %s, %s\n", nameISize(size),
                             dis_buf, nameIReg(size,gregOfRM(rm)));
      return alen+delta0;
   }
}
3359
3360
3361/* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
static
UInt dis_imul_I_E_G ( UChar       sorb,
                      Int         size,
                      Int         delta,
                      Int         litsize )
{
   /* Three-operand IMUL: G = E * (sign-extended literal of width
      litsize), keeping the low 'size' bytes.  Returns the delta past
      the operands and the literal. */
   Int    d32, alen;
   HChar  dis_buf[50];
   UChar  rm = getIByte(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);      /* E operand (reg or mem) */
   IRTemp tl = newTemp(ty);      /* the literal */
   IRTemp resLo = newTemp(ty);   /* low 'size' bytes of the product */

   vassert(size == 1 || size == 2 || size == 4);

   if (epartIsReg(rm)) {
      assign(te, getIReg(size, eregOfRM(rm)));
      delta++;
   } else {
      IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   d32 = getSDisp(litsize,delta);
   delta += litsize;

   /* Truncate the sign-extended literal to the operand width. */
   if (size == 1) d32 &= 0xFF;
   if (size == 2) d32 &= 0xFFFF;

   assign(tl, mkU(ty,d32));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );

   putIReg(size, gregOfRM(rm), mkexpr(resLo));

   DIP("imul %d, %s, %s\n", d32,
       ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
       nameIReg(size,gregOfRM(rm)) );
   return delta;
}
3405
3406
3407/* Generate an IR sequence to do a count-leading-zeroes operation on
3408   the supplied IRTemp, and return a new IRTemp holding the result.
3409   'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
3410   argument is zero, return the number of bits in the word (the
3411   natural semantics). */
3412static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
3413{
3414   vassert(ty == Ity_I32 || ty == Ity_I16);
3415
3416   IRTemp src32 = newTemp(Ity_I32);
3417   assign(src32, widenUto32( mkexpr(src) ));
3418
3419   IRTemp src32x = newTemp(Ity_I32);
3420   assign(src32x,
3421          binop(Iop_Shl32, mkexpr(src32),
3422                           mkU8(32 - 8 * sizeofIRType(ty))));
3423
3424   // Clz32 has undefined semantics when its input is zero, so
3425   // special-case around that.
3426   IRTemp res32 = newTemp(Ity_I32);
3427   assign(res32,
3428          IRExpr_ITE(
3429             binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0)),
3430             mkU32(8 * sizeofIRType(ty)),
3431             unop(Iop_Clz32, mkexpr(src32x))
3432   ));
3433
3434   IRTemp res = newTemp(ty);
3435   assign(res, narrowTo(ty, mkexpr(res32)));
3436   return res;
3437}
3438
3439
3440/*------------------------------------------------------------*/
3441/*---                                                      ---*/
3442/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
3443/*---                                                      ---*/
3444/*------------------------------------------------------------*/
3445
3446/* --- Helper functions for dealing with the register stack. --- */
3447
3448/* --- Set the emulation-warning pseudo-register. --- */
3449
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   /* Set the emulation-note pseudo-register in the guest state. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
}
3455
3456/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
3457
static IRExpr* mkQNaN64 ( void )
{
  /* QNaN is 0 2047 1 0(51times)
     == 0b 11111111111b 1 0(51times)
     == 0x7FF8 0000 0000 0000
     i.e. the canonical double-precision quiet-NaN bit pattern,
     returned as an F64i constant (raw bits, not a host float).
   */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
3466
3467/* --------- Get/put the top-of-stack pointer. --------- */
3468
static IRExpr* get_ftop ( void )
{
   /* Read the x87 top-of-stack index from the guest state. */
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
3473
static void put_ftop ( IRExpr* e )
{
   /* Write the x87 top-of-stack index; e must be :: Ity_I32. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
3479
3480/* --------- Get/put the C3210 bits. --------- */
3481
3482static IRExpr* get_C3210 ( void )
3483{
3484   return IRExpr_Get( OFFB_FC3210, Ity_I32 );
3485}
3486
3487static void put_C3210 ( IRExpr* e )
3488{
3489   stmt( IRStmt_Put( OFFB_FC3210, e ) );
3490}
3491
3492/* --------- Get/put the FPU rounding mode. --------- */
3493static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
3494{
3495   return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
3496}
3497
3498static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
3499{
3500   stmt( IRStmt_Put( OFFB_FPROUND, e ) );
3501}
3502
3503
3504/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
3505/* Produces a value in 0 .. 3, which is encoded as per the type
3506   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
3507   per IRRoundingMode, we merely need to get it and mask it for
3508   safety.
3509*/
3510static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
3511{
3512   return binop( Iop_And32, get_fpround(), mkU32(3) );
3513}
3514
3515static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
3516{
3517   return mkU32(Irrm_NEAREST);
3518}
3519
3520
3521/* --------- Get/set FP register tag bytes. --------- */
3522
3523/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
3524
3525static void put_ST_TAG ( Int i, IRExpr* value )
3526{
3527   IRRegArray* descr;
3528   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
3529   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3530   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3531}
3532
3533/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
3534   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
3535
3536static IRExpr* get_ST_TAG ( Int i )
3537{
3538   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3539   return IRExpr_GetI( descr, get_ftop(), i );
3540}
3541
3542
3543/* --------- Get/set FP registers. --------- */
3544
3545/* Given i, and some expression e, emit 'ST(i) = e' and set the
3546   register's tag to indicate the register is full.  The previous
3547   state of the register is not checked. */
3548
3549static void put_ST_UNCHECKED ( Int i, IRExpr* value )
3550{
3551   IRRegArray* descr;
3552   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
3553   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3554   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3555   /* Mark the register as in-use. */
3556   put_ST_TAG(i, mkU8(1));
3557}
3558
3559/* Given i, and some expression e, emit
3560      ST(i) = is_full(i) ? NaN : e
3561   and set the tag accordingly.
3562*/
3563
3564static void put_ST ( Int i, IRExpr* value )
3565{
3566   put_ST_UNCHECKED(
3567      i,
3568      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
3569                  /* non-0 means full */
3570                  mkQNaN64(),
3571                  /* 0 means empty */
3572                  value
3573      )
3574   );
3575}
3576
3577
3578/* Given i, generate an expression yielding 'ST(i)'. */
3579
3580static IRExpr* get_ST_UNCHECKED ( Int i )
3581{
3582   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3583   return IRExpr_GetI( descr, get_ftop(), i );
3584}
3585
3586
3587/* Given i, generate an expression yielding
3588  is_full(i) ? ST(i) : NaN
3589*/
3590
3591static IRExpr* get_ST ( Int i )
3592{
3593   return
3594      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
3595                  /* non-0 means full */
3596                  get_ST_UNCHECKED(i),
3597                  /* 0 means empty */
3598                  mkQNaN64());
3599}
3600
3601
3602/* Given i, and some expression e, and a condition cond, generate IR
3603   which has the same effect as put_ST(i,e) when cond is true and has
3604   no effect when cond is false.  Given the lack of proper
3605   if-then-else in the IR, this is pretty tricky.
3606*/
3607
3608static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
3609{
3610   // new_tag = if cond then FULL else old_tag
3611   // new_val = if cond then (if old_tag==FULL then NaN else val)
3612   //                   else old_val
3613
3614   IRTemp old_tag = newTemp(Ity_I8);
3615   assign(old_tag, get_ST_TAG(i));
3616   IRTemp new_tag = newTemp(Ity_I8);
3617   assign(new_tag,
3618          IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
3619
3620   IRTemp old_val = newTemp(Ity_F64);
3621   assign(old_val, get_ST_UNCHECKED(i));
3622   IRTemp new_val = newTemp(Ity_F64);
3623   assign(new_val,
3624          IRExpr_ITE(mkexpr(cond),
3625                     IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
3626                                /* non-0 means full */
3627                                mkQNaN64(),
3628                                /* 0 means empty */
3629                                value),
3630                     mkexpr(old_val)));
3631
3632   put_ST_UNCHECKED(i, mkexpr(new_val));
3633   // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
3634   // now set it to new_tag instead.
3635   put_ST_TAG(i, mkexpr(new_tag));
3636}
3637
3638/* Adjust FTOP downwards by one register. */
3639
3640static void fp_push ( void )
3641{
3642   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
3643}
3644
3645/* Adjust FTOP downwards by one register when COND is 1:I1.  Else
3646   don't change it. */
3647
3648static void maybe_fp_push ( IRTemp cond )
3649{
3650   put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
3651}
3652
3653/* Adjust FTOP upwards by one register, and mark the vacated register
3654   as empty.  */
3655
3656static void fp_pop ( void )
3657{
3658   put_ST_TAG(0, mkU8(0));
3659   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
3660}
3661
3662/* Set the C2 bit of the FPU status register to e[0].  Assumes that
3663   e[31:1] == 0.
3664*/
3665static void set_C2 ( IRExpr* e )
3666{
3667   IRExpr* cleared = binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2));
3668   put_C3210( binop(Iop_Or32,
3669                    cleared,
3670                    binop(Iop_Shl32, e, mkU8(X86G_FC_SHIFT_C2))) );
3671}
3672
3673/* Generate code to check that abs(d64) < 2^63 and is finite.  This is
3674   used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
3675   test is simple, but the derivation of it is not so simple.
3676
3677   The exponent field for an IEEE754 double is 11 bits.  That means it
3678   can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
3679   the number is either a NaN or an Infinity and so is not finite.
3680   Furthermore, a finite value of exactly 2^63 is the smallest value
3681   that has exponent value 0x43E.  Hence, what we need to do is
3682   extract the exponent, ignoring the sign bit and mantissa, and check
3683   it is < 0x43E, or <= 0x43D.
3684
3685   To make this easily applicable to 32- and 64-bit targets, a
3686   roundabout approach is used.  First the number is converted to I64,
3687   then the top 32 bits are taken.  Shifting them right by 20 bits
3688   places the sign bit and exponent in the bottom 12 bits.  Anding
3689   with 0x7FF gets rid of the sign bit, leaving just the exponent
3690   available for comparison.
3691*/
3692static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
3693{
3694   IRTemp i64 = newTemp(Ity_I64);
3695   assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
3696   IRTemp exponent = newTemp(Ity_I32);
3697   assign(exponent,
3698          binop(Iop_And32,
3699                binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
3700                mkU32(0x7FF)));
3701   IRTemp in_range_and_finite = newTemp(Ity_I1);
3702   assign(in_range_and_finite,
3703          binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
3704   return in_range_and_finite;
3705}
3706
3707/* Invent a plausible-looking FPU status word value:
3708      ((ftop & 7) << 11) | (c3210 & 0x4700)
3709 */
3710static IRExpr* get_FPU_sw ( void )
3711{
3712   return
3713      unop(Iop_32to16,
3714           binop(Iop_Or32,
3715                 binop(Iop_Shl32,
3716                       binop(Iop_And32, get_ftop(), mkU32(7)),
3717                             mkU8(11)),
3718                       binop(Iop_And32, get_C3210(), mkU32(0x4700))
3719      ));
3720}
3721
3722
3723/* ------------------------------------------------------- */
3724/* Given all that stack-mangling junk, we can now go ahead
3725   and describe FP instructions.
3726*/
3727
3728/* ST(0) = ST(0) `op` mem64/32(addr)
3729   Need to check ST(0)'s tag on read, but not on write.
3730*/
3731static
3732void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
3733                         IROp op, Bool dbl )
3734{
3735   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3736   if (dbl) {
3737      put_ST_UNCHECKED(0,
3738         triop( op,
3739                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3740                get_ST(0),
3741                loadLE(Ity_F64,mkexpr(addr))
3742         ));
3743   } else {
3744      put_ST_UNCHECKED(0,
3745         triop( op,
3746                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3747                get_ST(0),
3748                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
3749         ));
3750   }
3751}
3752
3753
3754/* ST(0) = mem64/32(addr) `op` ST(0)
3755   Need to check ST(0)'s tag on read, but not on write.
3756*/
3757static
3758void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
3759                            IROp op, Bool dbl )
3760{
3761   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3762   if (dbl) {
3763      put_ST_UNCHECKED(0,
3764         triop( op,
3765                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3766                loadLE(Ity_F64,mkexpr(addr)),
3767                get_ST(0)
3768         ));
3769   } else {
3770      put_ST_UNCHECKED(0,
3771         triop( op,
3772                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3773                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
3774                get_ST(0)
3775         ));
3776   }
3777}
3778
3779
3780/* ST(dst) = ST(dst) `op` ST(src).
3781   Check dst and src tags when reading but not on write.
3782*/
3783static
3784void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3785                      Bool pop_after )
3786{
3787   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3788                                 (Int)st_src, (Int)st_dst );
3789   put_ST_UNCHECKED(
3790      st_dst,
3791      triop( op,
3792             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3793             get_ST(st_dst),
3794             get_ST(st_src) )
3795   );
3796   if (pop_after)
3797      fp_pop();
3798}
3799
3800/* ST(dst) = ST(src) `op` ST(dst).
3801   Check dst and src tags when reading but not on write.
3802*/
3803static
3804void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src,
3805                         UInt st_dst, Bool pop_after )
3806{
3807   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3808                                 (Int)st_src, (Int)st_dst );
3809   put_ST_UNCHECKED(
3810      st_dst,
3811      triop( op,
3812             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3813             get_ST(st_src),
3814             get_ST(st_dst) )
3815   );
3816   if (pop_after)
3817      fp_pop();
3818}
3819
3820/* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
3821static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
3822{
3823   DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
3824   /* This is a bit of a hack (and isn't really right).  It sets
3825      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
3826      documentation implies A and S are unchanged.
3827   */
3828   /* It's also fishy in that it is used both for COMIP and
3829      UCOMIP, and they aren't the same (although similar). */
3830   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
3831   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
3832   stmt( IRStmt_Put( OFFB_CC_DEP1,
3833                     binop( Iop_And32,
3834                            binop(Iop_CmpF64, get_ST(0), get_ST(i)),
3835                            mkU32(0x45)
3836       )));
3837   /* Set NDEP even though it isn't used.  This makes redundant-PUT
3838      elimination of previous stores to this field work better. */
3839   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
3840   if (pop_after)
3841      fp_pop();
3842}
3843
3844
3845static
3846UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
3847{
3848   Int    len;
3849   UInt   r_src, r_dst;
3850   HChar  dis_buf[50];
3851   IRTemp t1, t2;
3852
3853   /* On entry, delta points at the second byte of the insn (the modrm
3854      byte).*/
3855   UChar first_opcode = getIByte(delta-1);
3856   UChar modrm        = getIByte(delta+0);
3857
3858   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
3859
3860   if (first_opcode == 0xD8) {
3861      if (modrm < 0xC0) {
3862
3863         /* bits 5,4,3 are an opcode extension, and the modRM also
3864           specifies an address. */
3865         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3866         delta += len;
3867
3868         switch (gregOfRM(modrm)) {
3869
3870            case 0: /* FADD single-real */
3871               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
3872               break;
3873
3874            case 1: /* FMUL single-real */
3875               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
3876               break;
3877
3878            case 2: /* FCOM single-real */
3879               DIP("fcoms %s\n", dis_buf);
3880               /* This forces C1 to zero, which isn't right. */
3881               put_C3210(
3882                   binop( Iop_And32,
3883                          binop(Iop_Shl32,
3884                                binop(Iop_CmpF64,
3885                                      get_ST(0),
3886                                      unop(Iop_F32toF64,
3887                                           loadLE(Ity_F32,mkexpr(addr)))),
3888                                mkU8(8)),
3889                          mkU32(0x4500)
3890                   ));
3891               break;
3892
3893            case 3: /* FCOMP single-real */
3894               DIP("fcomps %s\n", dis_buf);
3895               /* This forces C1 to zero, which isn't right. */
3896               put_C3210(
3897                   binop( Iop_And32,
3898                          binop(Iop_Shl32,
3899                                binop(Iop_CmpF64,
3900                                      get_ST(0),
3901                                      unop(Iop_F32toF64,
3902                                           loadLE(Ity_F32,mkexpr(addr)))),
3903                                mkU8(8)),
3904                          mkU32(0x4500)
3905                   ));
3906               fp_pop();
3907               break;
3908
3909            case 4: /* FSUB single-real */
3910               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
3911               break;
3912
3913            case 5: /* FSUBR single-real */
3914               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
3915               break;
3916
3917            case 6: /* FDIV single-real */
3918               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
3919               break;
3920
3921            case 7: /* FDIVR single-real */
3922               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
3923               break;
3924
3925            default:
3926               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
3927               vex_printf("first_opcode == 0xD8\n");
3928               goto decode_fail;
3929         }
3930      } else {
3931         delta++;
3932         switch (modrm) {
3933
3934            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
3935               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
3936               break;
3937
3938            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
3939               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
3940               break;
3941
3942            /* Dunno if this is right */
3943            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
3944               r_dst = (UInt)modrm - 0xD0;
3945               DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
3946               /* This forces C1 to zero, which isn't right. */
3947               put_C3210(
3948                   binop( Iop_And32,
3949                          binop(Iop_Shl32,
3950                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3951                                mkU8(8)),
3952                          mkU32(0x4500)
3953                   ));
3954               break;
3955
3956            /* Dunno if this is right */
3957            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
3958               r_dst = (UInt)modrm - 0xD8;
3959               DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
3960               /* This forces C1 to zero, which isn't right. */
3961               put_C3210(
3962                   binop( Iop_And32,
3963                          binop(Iop_Shl32,
3964                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3965                                mkU8(8)),
3966                          mkU32(0x4500)
3967                   ));
3968               fp_pop();
3969               break;
3970
3971            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
3972               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
3973               break;
3974
3975            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
3976               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
3977               break;
3978
3979            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
3980               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
3981               break;
3982
3983            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
3984               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
3985               break;
3986
3987            default:
3988               goto decode_fail;
3989         }
3990      }
3991   }
3992
3993   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
3994   else
3995   if (first_opcode == 0xD9) {
3996      if (modrm < 0xC0) {
3997
3998         /* bits 5,4,3 are an opcode extension, and the modRM also
3999            specifies an address. */
4000         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4001         delta += len;
4002
4003         switch (gregOfRM(modrm)) {
4004
4005            case 0: /* FLD single-real */
4006               DIP("flds %s\n", dis_buf);
4007               fp_push();
4008               put_ST(0, unop(Iop_F32toF64,
4009                              loadLE(Ity_F32, mkexpr(addr))));
4010               break;
4011
4012            case 2: /* FST single-real */
4013               DIP("fsts %s\n", dis_buf);
4014               storeLE(mkexpr(addr),
4015                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
4016               break;
4017
4018            case 3: /* FSTP single-real */
4019               DIP("fstps %s\n", dis_buf);
4020               storeLE(mkexpr(addr),
4021                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
4022               fp_pop();
4023               break;
4024
4025            case 4: { /* FLDENV m28 */
4026               /* Uses dirty helper:
4027                     VexEmNote x86g_do_FLDENV ( VexGuestX86State*, HWord ) */
4028               IRTemp   ew = newTemp(Ity_I32);
4029               IRDirty* d  = unsafeIRDirty_0_N (
4030                                0/*regparms*/,
4031                                "x86g_dirtyhelper_FLDENV",
4032                                &x86g_dirtyhelper_FLDENV,
4033                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
4034                             );
4035               d->tmp   = ew;
4036               /* declare we're reading memory */
4037               d->mFx   = Ifx_Read;
4038               d->mAddr = mkexpr(addr);
4039               d->mSize = 28;
4040
4041               /* declare we're writing guest state */
4042               d->nFxState = 4;
4043               vex_bzero(&d->fxState, sizeof(d->fxState));
4044
4045               d->fxState[0].fx     = Ifx_Write;
4046               d->fxState[0].offset = OFFB_FTOP;
4047               d->fxState[0].size   = sizeof(UInt);
4048
4049               d->fxState[1].fx     = Ifx_Write;
4050               d->fxState[1].offset = OFFB_FPTAGS;
4051               d->fxState[1].size   = 8 * sizeof(UChar);
4052
4053               d->fxState[2].fx     = Ifx_Write;
4054               d->fxState[2].offset = OFFB_FPROUND;
4055               d->fxState[2].size   = sizeof(UInt);
4056
4057               d->fxState[3].fx     = Ifx_Write;
4058               d->fxState[3].offset = OFFB_FC3210;
4059               d->fxState[3].size   = sizeof(UInt);
4060
4061               stmt( IRStmt_Dirty(d) );
4062
4063               /* ew contains any emulation warning we may need to
4064                  issue.  If needed, side-exit to the next insn,
4065                  reporting the warning, so that Valgrind's dispatcher
4066                  sees the warning. */
4067               put_emwarn( mkexpr(ew) );
4068               stmt(
4069                  IRStmt_Exit(
4070                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4071                     Ijk_EmWarn,
4072                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4073                     OFFB_EIP
4074                  )
4075               );
4076
4077               DIP("fldenv %s\n", dis_buf);
4078               break;
4079            }
4080
4081            case 5: {/* FLDCW */
4082               /* The only thing we observe in the control word is the
4083                  rounding mode.  Therefore, pass the 16-bit value
4084                  (x87 native-format control word) to a clean helper,
4085                  getting back a 64-bit value, the lower half of which
4086                  is the FPROUND value to store, and the upper half of
4087                  which is the emulation-warning token which may be
4088                  generated.
4089               */
4090               /* ULong x86h_check_fldcw ( UInt ); */
4091               IRTemp t64 = newTemp(Ity_I64);
4092               IRTemp ew = newTemp(Ity_I32);
4093               DIP("fldcw %s\n", dis_buf);
4094               assign( t64, mkIRExprCCall(
4095                               Ity_I64, 0/*regparms*/,
4096                               "x86g_check_fldcw",
4097                               &x86g_check_fldcw,
4098                               mkIRExprVec_1(
4099                                  unop( Iop_16Uto32,
4100                                        loadLE(Ity_I16, mkexpr(addr)))
4101                               )
4102                            )
4103                     );
4104
4105               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
4106               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
4107               put_emwarn( mkexpr(ew) );
4108               /* Finally, if an emulation warning was reported,
4109                  side-exit to the next insn, reporting the warning,
4110                  so that Valgrind's dispatcher sees the warning. */
4111               stmt(
4112                  IRStmt_Exit(
4113                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4114                     Ijk_EmWarn,
4115                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4116                     OFFB_EIP
4117                  )
4118               );
4119               break;
4120            }
4121
4122            case 6: { /* FNSTENV m28 */
4123               /* Uses dirty helper:
4124                     void x86g_do_FSTENV ( VexGuestX86State*, HWord ) */
4125               IRDirty* d = unsafeIRDirty_0_N (
4126                               0/*regparms*/,
4127                               "x86g_dirtyhelper_FSTENV",
4128                               &x86g_dirtyhelper_FSTENV,
4129                               mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
4130                            );
4131               /* declare we're writing memory */
4132               d->mFx   = Ifx_Write;
4133               d->mAddr = mkexpr(addr);
4134               d->mSize = 28;
4135
4136               /* declare we're reading guest state */
4137               d->nFxState = 4;
4138               vex_bzero(&d->fxState, sizeof(d->fxState));
4139
4140               d->fxState[0].fx     = Ifx_Read;
4141               d->fxState[0].offset = OFFB_FTOP;
4142               d->fxState[0].size   = sizeof(UInt);
4143
4144               d->fxState[1].fx     = Ifx_Read;
4145               d->fxState[1].offset = OFFB_FPTAGS;
4146               d->fxState[1].size   = 8 * sizeof(UChar);
4147
4148               d->fxState[2].fx     = Ifx_Read;
4149               d->fxState[2].offset = OFFB_FPROUND;
4150               d->fxState[2].size   = sizeof(UInt);
4151
4152               d->fxState[3].fx     = Ifx_Read;
4153               d->fxState[3].offset = OFFB_FC3210;
4154               d->fxState[3].size   = sizeof(UInt);
4155
4156               stmt( IRStmt_Dirty(d) );
4157
4158               DIP("fnstenv %s\n", dis_buf);
4159               break;
4160            }
4161
4162            case 7: /* FNSTCW */
4163              /* Fake up a native x87 FPU control word.  The only
4164                 thing it depends on is FPROUND[1:0], so call a clean
4165                 helper to cook it up. */
4166               /* UInt x86h_create_fpucw ( UInt fpround ) */
4167               DIP("fnstcw %s\n", dis_buf);
4168               storeLE(
4169                  mkexpr(addr),
4170                  unop( Iop_32to16,
4171                        mkIRExprCCall(
4172                           Ity_I32, 0/*regp*/,
4173                           "x86g_create_fpucw", &x86g_create_fpucw,
4174                           mkIRExprVec_1( get_fpround() )
4175                        )
4176                  )
4177               );
4178               break;
4179
4180            default:
4181               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4182               vex_printf("first_opcode == 0xD9\n");
4183               goto decode_fail;
4184         }
4185
4186      } else {
4187         delta++;
4188         switch (modrm) {
4189
4190            case 0xC0 ... 0xC7: /* FLD %st(?) */
4191               r_src = (UInt)modrm - 0xC0;
4192               DIP("fld %%st(%d)\n", (Int)r_src);
4193               t1 = newTemp(Ity_F64);
4194               assign(t1, get_ST(r_src));
4195               fp_push();
4196               put_ST(0, mkexpr(t1));
4197               break;
4198
4199            case 0xC8 ... 0xCF: /* FXCH %st(?) */
4200               r_src = (UInt)modrm - 0xC8;
4201               DIP("fxch %%st(%d)\n", (Int)r_src);
4202               t1 = newTemp(Ity_F64);
4203               t2 = newTemp(Ity_F64);
4204               assign(t1, get_ST(0));
4205               assign(t2, get_ST(r_src));
4206               put_ST_UNCHECKED(0, mkexpr(t2));
4207               put_ST_UNCHECKED(r_src, mkexpr(t1));
4208               break;
4209
4210            case 0xE0: /* FCHS */
4211               DIP("fchs\n");
4212               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
4213               break;
4214
4215            case 0xE1: /* FABS */
4216               DIP("fabs\n");
4217               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
4218               break;
4219
4220            case 0xE4: /* FTST */
4221               DIP("ftst\n");
4222               /* This forces C1 to zero, which isn't right. */
4223               /* Well, in fact the Intel docs say (bizarrely): "C1 is
4224                  set to 0 if stack underflow occurred; otherwise, set
4225                  to 0" which is pretty nonsensical.  I guess it's a
4226                   typo. */
4227               put_C3210(
4228                   binop( Iop_And32,
4229                          binop(Iop_Shl32,
4230                                binop(Iop_CmpF64,
4231                                      get_ST(0),
4232                                      IRExpr_Const(IRConst_F64i(0x0ULL))),
4233                                mkU8(8)),
4234                          mkU32(0x4500)
4235                   ));
4236               break;
4237
4238            case 0xE5: { /* FXAM */
4239               /* This is an interesting one.  It examines %st(0),
4240                  regardless of whether the tag says it's empty or not.
4241                  Here, just pass both the tag (in our format) and the
4242                  value (as a double, actually a ULong) to a helper
4243                  function. */
4244               IRExpr** args
4245                  = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
4246                                   unop(Iop_ReinterpF64asI64,
4247                                        get_ST_UNCHECKED(0)) );
4248               put_C3210(mkIRExprCCall(
4249                            Ity_I32,
4250                            0/*regparm*/,
4251                            "x86g_calculate_FXAM", &x86g_calculate_FXAM,
4252                            args
4253                        ));
4254               DIP("fxam\n");
4255               break;
4256            }
4257
4258            case 0xE8: /* FLD1 */
4259               DIP("fld1\n");
4260               fp_push();
4261               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
4262               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
4263               break;
4264
4265            case 0xE9: /* FLDL2T */
4266               DIP("fldl2t\n");
4267               fp_push();
4268               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
4269               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
4270               break;
4271
4272            case 0xEA: /* FLDL2E */
4273               DIP("fldl2e\n");
4274               fp_push();
4275               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
4276               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
4277               break;
4278
4279            case 0xEB: /* FLDPI */
4280               DIP("fldpi\n");
4281               fp_push();
4282               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
4283               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
4284               break;
4285
4286            case 0xEC: /* FLDLG2 */
4287               DIP("fldlg2\n");
4288               fp_push();
4289               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
4290               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
4291               break;
4292
4293            case 0xED: /* FLDLN2 */
4294               DIP("fldln2\n");
4295               fp_push();
4296               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
4297               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
4298               break;
4299
4300            case 0xEE: /* FLDZ */
4301               DIP("fldz\n");
4302               fp_push();
4303               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
4304               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
4305               break;
4306
            case 0xF0: /* F2XM1 */
               /* ST(0) := 2^ST(0) - 1 */
               DIP("f2xm1\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_2xm1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               break;

            case 0xF1: /* FYL2X */
               /* ST(1) := ST(1) * log2(ST(0)), then pop the stack. */
               DIP("fyl2x\n");
               put_ST_UNCHECKED(1,
                  triop(Iop_Yl2xF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;
4324
            case 0xF2: { /* FPTAN */
               DIP("fptan\n");
               IRTemp argD = newTemp(Ity_F64);
               assign(argD, get_ST(0));
               /* argOK is true iff the argument is finite and within
                  the range the IR Tan op is prepared to handle. */
               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
               IRTemp resD = newTemp(Ity_F64);
               /* Out-of-range arguments are passed through unchanged
                  (real hardware leaves ST(0) alone and flags C2). */
               assign(resD,
                  IRExpr_ITE(
                     mkexpr(argOK),
                     binop(Iop_TanF64,
                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                           mkexpr(argD)),
                     mkexpr(argD))
               );
               put_ST_UNCHECKED(0, mkexpr(resD));
               /* Conditionally push 1.0 on the stack, if the arg is
                  in range */
               maybe_fp_push(argOK);
               maybe_put_ST(argOK, 0,
                            IRExpr_Const(IRConst_F64(1.0)));
               /* C2 = !argOK, i.e. C2 set signals "argument out of
                  range, result not computed". */
               set_C2( binop(Iop_Xor32,
                             unop(Iop_1Uto32, mkexpr(argOK)),
                             mkU32(1)) );
               break;
            }
4350
            case 0xF3: /* FPATAN */
               /* ST(1) := atan-style result of (ST(1), ST(0)), then
                  pop.  Both operands go to the IR op; exact quadrant
                  semantics live in Iop_AtanF64's implementation. */
               DIP("fpatan\n");
               put_ST_UNCHECKED(1,
                  triop(Iop_AtanF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;

            case 0xF4: { /* FXTRACT */
               /* Split ST(0) into exponent and significand.  Both parts
                  are computed by the same clean helper, selected by its
                  second argument (0 = significand, 1 = exponent).  The
                  value is shuttled F64 -> I64 -> F64 because the helper
                  operates on the raw 64-bit pattern. */
               IRTemp argF = newTemp(Ity_F64);
               IRTemp sigF = newTemp(Ity_F64);
               IRTemp expF = newTemp(Ity_F64);
               IRTemp argI = newTemp(Ity_I64);
               IRTemp sigI = newTemp(Ity_I64);
               IRTemp expI = newTemp(Ity_I64);
               DIP("fxtract\n");
               assign( argF, get_ST(0) );
               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
               assign( sigI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(0)/*sig*/ ))
               );
               assign( expI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(1)/*exp*/ ))
               );
               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
               /* exponent replaces the original ST(0) ... */
               put_ST_UNCHECKED(0, mkexpr(expF) );
               fp_push();
               /* ... and the significand is pushed on top of it. */
               put_ST(0, mkexpr(sigF) );
               break;
            }
4396
            case 0xF5: { /* FPREM1 -- IEEE compliant */
               IRTemp a1 = newTemp(Ity_F64);
               IRTemp a2 = newTemp(Ity_F64);
               DIP("fprem1\n");
               /* Do FPREM1 twice, once to get the remainder, and once
                  to get the C3210 flag values.  The operands are read
                  into temps first so both IR ops see exactly the same
                  values. */
               assign( a1, get_ST(0) );
               assign( a2, get_ST(1) );
               /* ST(0) := IEEE remainder of ST(0) w.r.t. ST(1). */
               put_ST_UNCHECKED(0,
                  triop(Iop_PRem1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)));
               /* Status-word C3/C2/C1/C0 bits for the same operation. */
               put_C3210(
                  triop(Iop_PRem1C3210F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)) );
               break;
            }
4417
4418            case 0xF7: /* FINCSTP */
4419               DIP("fprem\n");
4420               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
4421               break;
4422
            case 0xF8: { /* FPREM -- not IEEE compliant */
               IRTemp a1 = newTemp(Ity_F64);
               IRTemp a2 = newTemp(Ity_F64);
               DIP("fprem\n");
               /* Do FPREM twice, once to get the remainder, and once
                  to get the C3210 flag values.  Operands go through
                  temps so both IR ops see identical values. */
               assign( a1, get_ST(0) );
               assign( a2, get_ST(1) );
               /* ST(0) := truncation-style partial remainder. */
               put_ST_UNCHECKED(0,
                  triop(Iop_PRemF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)));
               /* Matching C3/C2/C1/C0 status bits. */
               put_C3210(
                  triop(Iop_PRemC3210F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)) );
               break;
            }

            case 0xF9: /* FYL2XP1 */
               /* ST(1) := ST(1) * log2(ST(0) + 1), then pop. */
               DIP("fyl2xp1\n");
               put_ST_UNCHECKED(1,
                  triop(Iop_Yl2xp1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;
4453
            case 0xFA: /* FSQRT */
               /* ST(0) := sqrt(ST(0)) */
               DIP("fsqrt\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_SqrtF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               break;

            case 0xFB: { /* FSINCOS */
               DIP("fsincos\n");
               IRTemp argD = newTemp(Ity_F64);
               assign(argD, get_ST(0));
               /* True iff the argument is finite and in the range the
                  trig IR ops handle. */
               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
               IRTemp resD = newTemp(Ity_F64);
               /* ST(0) gets sin(arg) when in range, else the argument
                  is left unchanged. */
               assign(resD,
                  IRExpr_ITE(
                     mkexpr(argOK),
                     binop(Iop_SinF64,
                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                           mkexpr(argD)),
                     mkexpr(argD))
               );
               put_ST_UNCHECKED(0, mkexpr(resD));
               /* Conditionally push the cos value on the stack, if
                  the arg is in range */
               maybe_fp_push(argOK);
               maybe_put_ST(argOK, 0,
                  binop(Iop_CosF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(argD)));
               /* C2 = !argOK: set when the argument was out of range. */
               set_C2( binop(Iop_Xor32,
                             unop(Iop_1Uto32, mkexpr(argOK)),
                             mkU32(1)) );
               break;
            }

            case 0xFC: /* FRNDINT */
               /* Round ST(0) to an integral value.  Note: unlike most
                  ops here this honours the guest's real rounding mode
                  (get_roundingmode, not the FAKE one). */
               DIP("frndint\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
               break;

            case 0xFD: /* FSCALE */
               /* ST(0) := ST(0) scaled by ST(1). */
               DIP("fscale\n");
               put_ST_UNCHECKED(0,
                  triop(Iop_ScaleF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0),
                        get_ST(1)));
               break;

            case 0xFE:   /* FSIN */
            case 0xFF: { /* FCOS */
               /* Shared implementation: only the IR op differs. */
               Bool isSIN = modrm == 0xFE;
               DIP("%s\n", isSIN ? "fsin" : "fcos");
               IRTemp argD = newTemp(Ity_F64);
               assign(argD, get_ST(0));
               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
               IRTemp resD = newTemp(Ity_F64);
               /* Out-of-range argument is left in place unchanged. */
               assign(resD,
                  IRExpr_ITE(
                     mkexpr(argOK),
                     binop(isSIN ? Iop_SinF64 : Iop_CosF64,
                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                           mkexpr(argD)),
                     mkexpr(argD))
               );
               put_ST_UNCHECKED(0, mkexpr(resD));
               /* C2 = !argOK, flagging an out-of-range argument. */
               set_C2( binop(Iop_Xor32,
                             unop(Iop_1Uto32, mkexpr(argOK)),
                             mkU32(1)) );
               break;
            }

            default:
               goto decode_fail;
         }
      }
   }
4533
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDA) {

      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address.  Memory-operand forms: integer
            arithmetic/compare with a 32-bit int in memory.  The
            arithmetic cases just select the IR op and jump to one of
            the two shared tails below. */
         IROp   fop;
         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
         delta += len;
         switch (gregOfRM(modrm)) {

            case 0: /* FIADD m32int */ /* ST(0) += m32int */
               DIP("fiaddl %s\n", dis_buf);
               fop = Iop_AddF64;
               goto do_fop_m32;

            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
               DIP("fimull %s\n", dis_buf);
               fop = Iop_MulF64;
               goto do_fop_m32;

            case 2: /* FICOM m32int */
               DIP("ficoml %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* Compare ST(0) with the widened m32int; shift the
                  IRCmpF64Result into the C3/C2/C0 bit positions and
                  mask to just those bits (0x4500). */
               put_C3210(
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      unop(Iop_I32StoF64,
                                           loadLE(Ity_I32,mkexpr(addr)))),
                                mkU8(8)),
                          mkU32(0x4500)
                   ));
               break;

            case 3: /* FICOMP m32int */
               DIP("ficompl %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* Same as FICOM, but pops the stack afterwards. */
               put_C3210(
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      unop(Iop_I32StoF64,
                                           loadLE(Ity_I32,mkexpr(addr)))),
                                mkU8(8)),
                          mkU32(0x4500)
                   ));
               fp_pop();
               break;

            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
               DIP("fisubl %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_fop_m32;

            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
               DIP("fisubrl %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_foprev_m32;

            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
               DIP("fidivl %s\n", dis_buf);
               fop = Iop_DivF64;
               goto do_fop_m32;

            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
               DIP("fidivrl %s\n", dis_buf);
               fop = Iop_DivF64;
               goto do_foprev_m32;

            /* Shared tail: ST(0) := ST(0) fop m32int. */
            do_fop_m32:
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0),
                        unop(Iop_I32StoF64,
                             loadLE(Ity_I32, mkexpr(addr)))));
               break;

            /* Shared tail, reversed operands: ST(0) := m32int fop ST(0). */
            do_foprev_m32:
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        unop(Iop_I32StoF64,
                             loadLE(Ity_I32, mkexpr(addr))),
                        get_ST(0)));
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
               vex_printf("first_opcode == 0xDA\n");
               goto decode_fail;
         }
4631
      } else {

         /* Register-operand forms of 0xDA: conditional moves of
            ST(i) into ST(0), plus FUCOMPP. */
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC0;
               DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
               /* ST(0) := ST(i) if the B (below/carry) condition holds,
                  else unchanged. */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondB),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC8;
               DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
               /* ST(0) := ST(i) if Z (equal) holds. */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondZ),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD0;
               DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
               /* ST(0) := ST(i) if BE (below-or-equal) holds. */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondBE),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD8;
               DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
               /* "Unordered" maps onto the parity condition (PF=1). */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondP),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xE9: /* FUCOMPP %st(0),%st(1) */
               DIP("fucompp %%st(0),%%st(1)\n");
               /* This forces C1 to zero, which isn't right. */
               /* Compare ST(0) with ST(1); shift the IRCmpF64Result
                  into C3/C2/C0 positions and mask; then pop twice. */
               put_C3210(
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
                                mkU8(8)),
                          mkU32(0x4500)
                   ));
               fp_pop();
               fp_pop();
               break;

            default:
               goto decode_fail;
         }

      }
   }
4693
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDB) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
         delta += len;

         switch (gregOfRM(modrm)) {

            case 0: /* FILD m32int */
               /* Push the 32-bit signed int at addr, widened to F64. */
               DIP("fildl %s\n", dis_buf);
               fp_push();
               put_ST(0, unop(Iop_I32StoF64,
                              loadLE(Ity_I32, mkexpr(addr))));
               break;

            case 1: /* FISTTPL m32 (SSE3) */
               /* Store ST(0) as int32 with truncation (Irrm_ZERO,
                  regardless of the FPU rounding mode), then pop. */
               DIP("fisttpl %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
               fp_pop();
               break;

            case 2: /* FIST m32 */
               /* Store ST(0) as int32 using the guest's current
                  rounding mode. */
               DIP("fistl %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
               break;

            case 3: /* FISTP m32 */
               /* As FIST m32, but pop afterwards. */
               DIP("fistpl %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
               fp_pop();
               break;

            case 5: { /* FLD extended-real */
               /* Uses dirty helper:
                     ULong x86g_loadF80le ( UInt )
                  addr holds the address.  First, do a dirty call to
                  get hold of the data.  The helper presumably narrows
                  the 80-bit value to a 64-bit double -- the returned
                  I64 is reinterpreted as F64 below. */
               IRTemp   val  = newTemp(Ity_I64);
               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );

               IRDirty* d = unsafeIRDirty_1_N (
                               val,
                               0/*regparms*/,
                               "x86g_dirtyhelper_loadF80le",
                               &x86g_dirtyhelper_loadF80le,
                               args
                            );
               /* declare that we're reading memory */
               d->mFx   = Ifx_Read;
               d->mAddr = mkexpr(addr);
               d->mSize = 10;

               /* execute the dirty call, dumping the result in val. */
               stmt( IRStmt_Dirty(d) );
               fp_push();
               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));

               DIP("fldt %s\n", dis_buf);
               break;
            }
4761
4762            case 7: { /* FSTP extended-real */
4763               /* Uses dirty helper: void x86g_storeF80le ( UInt, ULong ) */
4764               IRExpr** args
4765                  = mkIRExprVec_2( mkexpr(addr),
4766                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
4767
4768               IRDirty* d = unsafeIRDirty_0_N (
4769                               0/*regparms*/,
4770                               "x86g_dirtyhelper_storeF80le",
4771                               &x86g_dirtyhelper_storeF80le,
4772                               args
4773                            );
4774               /* declare we're writing memory */
4775               d->mFx   = Ifx_Write;
4776               d->mAddr = mkexpr(addr);
4777               d->mSize = 10;
4778
4779               /* execute the dirty call. */
4780               stmt( IRStmt_Dirty(d) );
4781               fp_pop();
4782
4783               DIP("fstpt\n %s", dis_buf);
4784               break;
4785            }
4786
4787            default:
4788               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4789               vex_printf("first_opcode == 0xDB\n");
4790               goto decode_fail;
4791         }
4792
      } else {

         /* Register-operand forms of 0xDB: negated conditional moves,
            FNCLEX, FNINIT, FUCOMI/FCOMI. */
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC0;
               DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
               /* ST(0) := ST(i) if NB (not-below) holds. */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondNB),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC8;
               DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
               /* ST(0) := ST(i) if NZ (not-equal) holds. */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondNZ),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD0;
               DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
               /* ST(0) := ST(i) if NBE (above) holds. */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondNBE),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD8;
               DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
               /* "Not unordered" maps onto not-parity (PF=0). */
               put_ST_UNCHECKED(0,
                                IRExpr_ITE(
                                    mk_x86g_calculate_condition(X86CondNP),
                                    get_ST(r_src), get_ST(0)) );
               break;

            case 0xE2:
               /* FNCLEX: handled as a no-op here -- only the debug
                  trace is emitted; no guest state is touched. */
               DIP("fnclex\n");
               break;

            case 0xE3: {
               /* FNINIT: reset the FPU via a dirty helper.
                     void x86g_do_FINIT ( VexGuestX86State* ) */
               IRDirty* d  = unsafeIRDirty_0_N (
                                0/*regparms*/,
                                "x86g_dirtyhelper_FINIT",
                                &x86g_dirtyhelper_FINIT,
                                mkIRExprVec_1(IRExpr_BBPTR())
                             );

               /* declare we're writing guest state: FTOP, the register
                  file, the tags, the rounding mode and the C3210
                  flags all get reinitialised by the helper. */
               d->nFxState = 5;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Write;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Write;
               d->fxState[1].offset = OFFB_FPREGS;
               d->fxState[1].size   = 8 * sizeof(ULong);

               d->fxState[2].fx     = Ifx_Write;
               d->fxState[2].offset = OFFB_FPTAGS;
               d->fxState[2].size   = 8 * sizeof(UChar);

               d->fxState[3].fx     = Ifx_Write;
               d->fxState[3].offset = OFFB_FPROUND;
               d->fxState[3].size   = sizeof(UInt);

               d->fxState[4].fx     = Ifx_Write;
               d->fxState[4].offset = OFFB_FC3210;
               d->fxState[4].size   = sizeof(UInt);

               stmt( IRStmt_Dirty(d) );

               DIP("fninit\n");
               break;
            }

            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
               /* Compare, set EFLAGS; False = do not pop. */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
               /* Same helper as FUCOMI; False = do not pop. */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
               break;

            default:
               goto decode_fail;
         }
      }
   }
4891
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDC) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address.  Memory forms: arithmetic/compare
            with a double-real (64-bit) memory operand; the arithmetic
            is delegated to the fp_do_*_mem_ST_0 helpers (final True =
            64-bit operand). */
         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
         delta += len;

         switch (gregOfRM(modrm)) {

            case 0: /* FADD double-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
               break;

            case 1: /* FMUL double-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
               break;

            case 2: /* FCOM double-real */
               DIP("fcoml %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* Compare ST(0) with the in-memory double; shift the
                  result into the C3/C2/C0 positions and mask. */
               put_C3210(
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   ));
               break;

            case 3: /* FCOMP double-real */
               DIP("fcompl %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* Same as FCOM, then pop. */
               put_C3210(
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   ));
               fp_pop();
               break;

            case 4: /* FSUB double-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
               break;

            case 5: /* FSUBR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
               break;

            case 6: /* FDIV double-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
               break;

            case 7: /* FDIVR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
               vex_printf("first_opcode == 0xDC\n");
               goto decode_fail;
         }

      } else {

         /* Register forms: op %st(0) into %st(i) (dst index is the
            second argument pair; final False = do not pop). */
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
               break;

            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
               break;

            default:
               goto decode_fail;
         }

      }
   }
4998
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDD) {

      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
         delta += len;

         switch (gregOfRM(modrm)) {

            case 0: /* FLD double-real */
               /* Push the 64-bit double at addr. */
               DIP("fldl %s\n", dis_buf);
               fp_push();
               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
               break;

            case 1: /* FISTTPQ m64 (SSE3) */
               /* Store ST(0) as int64 with truncation (Irrm_ZERO),
                  then pop.  NOTE(review): the mnemonic is
                  conventionally spelled "fisttpll". */
               DIP("fistppll %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
               fp_pop();
               break;

            case 2: /* FST double-real */
               /* Store ST(0) as a 64-bit double, no pop. */
               DIP("fstl %s\n", dis_buf);
               storeLE(mkexpr(addr), get_ST(0));
               break;

            case 3: /* FSTP double-real */
               /* As FST, then pop. */
               DIP("fstpl %s\n", dis_buf);
               storeLE(mkexpr(addr), get_ST(0));
               fp_pop();
               break;

            case 4: { /* FRSTOR m108 */
               /* Restore the whole FPU state from a 108-byte memory
                  image.  Uses dirty helper:
                     VexEmNote x86g_do_FRSTOR ( VexGuestX86State*, Addr32 )
                  The helper's return value is an emulation-warning
                  code, captured in ew. */
               IRTemp   ew = newTemp(Ity_I32);
               IRDirty* d  = unsafeIRDirty_0_N (
                                0/*regparms*/,
                                "x86g_dirtyhelper_FRSTOR",
                                &x86g_dirtyhelper_FRSTOR,
                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                             );
               d->tmp   = ew;
               /* declare we're reading memory */
               d->mFx   = Ifx_Read;
               d->mAddr = mkexpr(addr);
               d->mSize = 108;

               /* declare we're writing guest state */
               d->nFxState = 5;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Write;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Write;
               d->fxState[1].offset = OFFB_FPREGS;
               d->fxState[1].size   = 8 * sizeof(ULong);

               d->fxState[2].fx     = Ifx_Write;
               d->fxState[2].offset = OFFB_FPTAGS;
               d->fxState[2].size   = 8 * sizeof(UChar);

               d->fxState[3].fx     = Ifx_Write;
               d->fxState[3].offset = OFFB_FPROUND;
               d->fxState[3].size   = sizeof(UInt);

               d->fxState[4].fx     = Ifx_Write;
               d->fxState[4].offset = OFFB_FC3210;
               d->fxState[4].size   = sizeof(UInt);

               stmt( IRStmt_Dirty(d) );

               /* ew contains any emulation warning we may need to
                  issue.  If needed, side-exit to the next insn,
                  reporting the warning, so that Valgrind's dispatcher
                  sees the warning. */
               put_emwarn( mkexpr(ew) );
               stmt(
                  IRStmt_Exit(
                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                     Ijk_EmWarn,
                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
                     OFFB_EIP
                  )
               );

               DIP("frstor %s\n", dis_buf);
               break;
            }

            case 6: { /* FNSAVE m108 */
               /* Save the whole FPU state to a 108-byte memory image.
                  Uses dirty helper:
                     void x86g_do_FSAVE ( VexGuestX86State*, UInt )
                  Mirror image of FRSTOR: memory is written, the same
                  five guest-state fields are read. */
               IRDirty* d = unsafeIRDirty_0_N (
                               0/*regparms*/,
                               "x86g_dirtyhelper_FSAVE",
                               &x86g_dirtyhelper_FSAVE,
                               mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                            );
               /* declare we're writing memory */
               d->mFx   = Ifx_Write;
               d->mAddr = mkexpr(addr);
               d->mSize = 108;

               /* declare we're reading guest state */
               d->nFxState = 5;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Read;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Read;
               d->fxState[1].offset = OFFB_FPREGS;
               d->fxState[1].size   = 8 * sizeof(ULong);

               d->fxState[2].fx     = Ifx_Read;
               d->fxState[2].offset = OFFB_FPTAGS;
               d->fxState[2].size   = 8 * sizeof(UChar);

               d->fxState[3].fx     = Ifx_Read;
               d->fxState[3].offset = OFFB_FPROUND;
               d->fxState[3].size   = sizeof(UInt);

               d->fxState[4].fx     = Ifx_Read;
               d->fxState[4].offset = OFFB_FC3210;
               d->fxState[4].size   = sizeof(UInt);

               stmt( IRStmt_Dirty(d) );

               DIP("fnsave %s\n", dis_buf);
               break;
            }

            case 7: { /* FNSTSW m16 */
               /* Store the 16-bit FPU status word to memory. */
               IRExpr* sw = get_FPU_sw();
               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
               storeLE( mkexpr(addr), sw );
               DIP("fnstsw %s\n", dis_buf);
               break;
            }

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
               vex_printf("first_opcode == 0xDD\n");
               goto decode_fail;
         }
5153      } else {
5154         delta++;
5155         switch (modrm) {
5156
5157            case 0xC0 ... 0xC7: /* FFREE %st(?) */
5158               r_dst = (UInt)modrm - 0xC0;
5159               DIP("ffree %%st(%d)\n", (Int)r_dst);
5160               put_ST_TAG ( r_dst, mkU8(0) );
5161               break;
5162
5163            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
5164               r_dst = (UInt)modrm - 0xD0;
5165               DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
5169               put_ST_UNCHECKED(r_dst, get_ST(0));
5170               break;
5171
5172            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
5173               r_dst = (UInt)modrm - 0xD8;
5174               DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
5178               put_ST_UNCHECKED(r_dst, get_ST(0));
5179               fp_pop();
5180               break;
5181
5182            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
5183               r_dst = (UInt)modrm - 0xE0;
5184               DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
5185               /* This forces C1 to zero, which isn't right. */
5186               put_C3210(
5187                   binop( Iop_And32,
5188                          binop(Iop_Shl32,
5189                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5190                                mkU8(8)),
5191                          mkU32(0x4500)
5192                   ));
5193               break;
5194
5195            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
5196               r_dst = (UInt)modrm - 0xE8;
5197               DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
5198               /* This forces C1 to zero, which isn't right. */
5199               put_C3210(
5200                   binop( Iop_And32,
5201                          binop(Iop_Shl32,
5202                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5203                                mkU8(8)),
5204                          mkU32(0x4500)
5205                   ));
5206               fp_pop();
5207               break;
5208
5209            default:
5210               goto decode_fail;
5211         }
5212      }
5213   }
5214
5215   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
5216   else
5217   if (first_opcode == 0xDE) {
5218
5219      if (modrm < 0xC0) {
5220
5221         /* bits 5,4,3 are an opcode extension, and the modRM also
5222            specifies an address. */
5223         IROp   fop;
5224         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5225         delta += len;
5226
5227         switch (gregOfRM(modrm)) {
5228
5229            case 0: /* FIADD m16int */ /* ST(0) += m16int */
5230               DIP("fiaddw %s\n", dis_buf);
5231               fop = Iop_AddF64;
5232               goto do_fop_m16;
5233
5234            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
5235               DIP("fimulw %s\n", dis_buf);
5236               fop = Iop_MulF64;
5237               goto do_fop_m16;
5238
5239            case 2: /* FICOM m16int */
5240               DIP("ficomw %s\n", dis_buf);
5241               /* This forces C1 to zero, which isn't right. */
5242               put_C3210(
5243                   binop( Iop_And32,
5244                          binop(Iop_Shl32,
5245                                binop(Iop_CmpF64,
5246                                      get_ST(0),
5247                                      unop(Iop_I32StoF64,
5248                                         unop(Iop_16Sto32,
5249                                           loadLE(Ity_I16,mkexpr(addr))))),
5250                                mkU8(8)),
5251                          mkU32(0x4500)
5252                   ));
5253               break;
5254
5255            case 3: /* FICOMP m16int */
5256               DIP("ficompw %s\n", dis_buf);
5257               /* This forces C1 to zero, which isn't right. */
5258               put_C3210(
5259                   binop( Iop_And32,
5260                          binop(Iop_Shl32,
5261                                binop(Iop_CmpF64,
5262                                      get_ST(0),
5263                                      unop(Iop_I32StoF64,
5264                                         unop(Iop_16Sto32,
5265                                              loadLE(Ity_I16,mkexpr(addr))))),
5266                                mkU8(8)),
5267                          mkU32(0x4500)
5268                   ));
5269               fp_pop();
5270               break;
5271
5272            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
5273               DIP("fisubw %s\n", dis_buf);
5274               fop = Iop_SubF64;
5275               goto do_fop_m16;
5276
5277            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
5278               DIP("fisubrw %s\n", dis_buf);
5279               fop = Iop_SubF64;
5280               goto do_foprev_m16;
5281
5282            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
5283               DIP("fisubw %s\n", dis_buf);
5284               fop = Iop_DivF64;
5285               goto do_fop_m16;
5286
5287            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
5288               DIP("fidivrw %s\n", dis_buf);
5289               fop = Iop_DivF64;
5290               goto do_foprev_m16;
5291
5292            do_fop_m16:
5293               put_ST_UNCHECKED(0,
5294                  triop(fop,
5295                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5296                        get_ST(0),
5297                        unop(Iop_I32StoF64,
5298                             unop(Iop_16Sto32,
5299                                  loadLE(Ity_I16, mkexpr(addr))))));
5300               break;
5301
5302            do_foprev_m16:
5303               put_ST_UNCHECKED(0,
5304                  triop(fop,
5305                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5306                        unop(Iop_I32StoF64,
5307                             unop(Iop_16Sto32,
5308                                  loadLE(Ity_I16, mkexpr(addr)))),
5309                        get_ST(0)));
5310               break;
5311
5312            default:
5313               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5314               vex_printf("first_opcode == 0xDE\n");
5315               goto decode_fail;
5316         }
5317
5318      } else {
5319
5320         delta++;
5321         switch (modrm) {
5322
5323            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
5324               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
5325               break;
5326
5327            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
5328               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
5329               break;
5330
5331            case 0xD9: /* FCOMPP %st(0),%st(1) */
5332               DIP("fuompp %%st(0),%%st(1)\n");
5333               /* This forces C1 to zero, which isn't right. */
5334               put_C3210(
5335                   binop( Iop_And32,
5336                          binop(Iop_Shl32,
5337                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
5338                                mkU8(8)),
5339                          mkU32(0x4500)
5340                   ));
5341               fp_pop();
5342               fp_pop();
5343               break;
5344
5345            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
5346               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
5347               break;
5348
5349            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
5350               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
5351               break;
5352
5353            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
5354               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
5355               break;
5356
5357            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
5358               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
5359               break;
5360
5361            default:
5362               goto decode_fail;
5363         }
5364
5365      }
5366   }
5367
5368   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
5369   else
5370   if (first_opcode == 0xDF) {
5371
5372      if (modrm < 0xC0) {
5373
5374         /* bits 5,4,3 are an opcode extension, and the modRM also
5375            specifies an address. */
5376         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5377         delta += len;
5378
5379         switch (gregOfRM(modrm)) {
5380
5381            case 0: /* FILD m16int */
5382               DIP("fildw %s\n", dis_buf);
5383               fp_push();
5384               put_ST(0, unop(Iop_I32StoF64,
5385                              unop(Iop_16Sto32,
5386                                   loadLE(Ity_I16, mkexpr(addr)))));
5387               break;
5388
5389            case 1: /* FISTTPS m16 (SSE3) */
5390               DIP("fisttps %s\n", dis_buf);
5391               storeLE( mkexpr(addr),
5392                        binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
5393               fp_pop();
5394               break;
5395
5396            case 2: /* FIST m16 */
5397               DIP("fistp %s\n", dis_buf);
5398               storeLE( mkexpr(addr),
5399                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5400               break;
5401
5402            case 3: /* FISTP m16 */
5403               DIP("fistps %s\n", dis_buf);
5404               storeLE( mkexpr(addr),
5405                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5406               fp_pop();
5407               break;
5408
5409            case 5: /* FILD m64 */
5410               DIP("fildll %s\n", dis_buf);
5411               fp_push();
5412               put_ST(0, binop(Iop_I64StoF64,
5413                               get_roundingmode(),
5414                               loadLE(Ity_I64, mkexpr(addr))));
5415               break;
5416
5417            case 7: /* FISTP m64 */
5418               DIP("fistpll %s\n", dis_buf);
5419               storeLE( mkexpr(addr),
5420                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
5421               fp_pop();
5422               break;
5423
5424            default:
5425               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5426               vex_printf("first_opcode == 0xDF\n");
5427               goto decode_fail;
5428         }
5429
5430      } else {
5431
5432         delta++;
5433         switch (modrm) {
5434
5435            case 0xC0: /* FFREEP %st(0) */
5436               DIP("ffreep %%st(%d)\n", 0);
5437               put_ST_TAG ( 0, mkU8(0) );
5438               fp_pop();
5439               break;
5440
5441            case 0xE0: /* FNSTSW %ax */
5442               DIP("fnstsw %%ax\n");
5443               /* Get the FPU status word value and dump it in %AX. */
5444               if (0) {
5445                  /* The obvious thing to do is simply dump the 16-bit
5446                     status word value in %AX.  However, due to a
5447                     limitation in Memcheck's origin tracking
5448                     machinery, this causes Memcheck not to track the
5449                     origin of any undefinedness into %AH (only into
5450                     %AL/%AX/%EAX), which means origins are lost in
5451                     the sequence "fnstsw %ax; test $M,%ah; jcond .." */
5452                  putIReg(2, R_EAX, get_FPU_sw());
5453               } else {
5454                  /* So a somewhat lame kludge is to make it very
5455                     clear to Memcheck that the value is written to
5456                     both %AH and %AL.  This generates marginally
5457                     worse code, but I don't think it matters much. */
5458                  IRTemp t16 = newTemp(Ity_I16);
5459                  assign(t16, get_FPU_sw());
5460                  putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
5461                  putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
5462               }
5463               break;
5464
5465            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
5466               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
5467               break;
5468
5469            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
5470               /* not really right since COMIP != UCOMIP */
5471               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
5472               break;
5473
5474            default:
5475               goto decode_fail;
5476         }
5477      }
5478
5479   }
5480
5481   else
5482   vpanic("dis_FPU(x86): invalid primary opcode");
5483
5484   *decode_ok = True;
5485   return delta;
5486
5487  decode_fail:
5488   *decode_ok = False;
5489   return delta;
5490}
5491
5492
5493/*------------------------------------------------------------*/
5494/*---                                                      ---*/
5495/*--- MMX INSTRUCTIONS                                     ---*/
5496/*---                                                      ---*/
5497/*------------------------------------------------------------*/
5498
5499/* Effect of MMX insns on x87 FPU state (table 11-2 of
5500   IA32 arch manual, volume 3):
5501
5502   Read from, or write to MMX register (viz, any insn except EMMS):
5503   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
5504   * FP stack pointer set to zero
5505
5506   EMMS:
5507   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
5508   * FP stack pointer set to zero
5509*/
5510
5511static void do_MMX_preamble ( void )
5512{
5513   Int         i;
5514   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5515   IRExpr*     zero  = mkU32(0);
5516   IRExpr*     tag1  = mkU8(1);
5517   put_ftop(zero);
5518   for (i = 0; i < 8; i++)
5519      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
5520}
5521
5522static void do_EMMS_preamble ( void )
5523{
5524   Int         i;
5525   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5526   IRExpr*     zero  = mkU32(0);
5527   IRExpr*     tag0  = mkU8(0);
5528   put_ftop(zero);
5529   for (i = 0; i < 8; i++)
5530      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
5531}
5532
5533
5534static IRExpr* getMMXReg ( UInt archreg )
5535{
5536   vassert(archreg < 8);
5537   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
5538}
5539
5540
5541static void putMMXReg ( UInt archreg, IRExpr* e )
5542{
5543   vassert(archreg < 8);
5544   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
5545   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
5546}
5547
5548
/* Helper for non-shift MMX insns.  Note this is incomplete in the
   sense that it does not first call do_MMX_preamble() -- that is the
   responsibility of its caller.

   Decodes one two-operand MMX arithmetic/logical insn of the form
   "op E, G" where E (src) is an MMX register or 64-bit memory and
   G (dst) is an MMX register.  'opc' selects the operation: most map
   directly onto a 64-bit SIMD IROp, but PMADDWD (0xF5) and PSADBW
   (0xF6) are instead implemented by calling a C helper.  'name' is
   the mnemonic stem for disassembly printing; 'show_granularity'
   appends a granularity suffix derived from (opc & 3).  Returns the
   updated instruction offset 'delta'. */

static
UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
                               Int    delta,
                               UChar  opc,
                               const HChar* name,
                               Bool   show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getIByte(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;         /* if True, complement G first (PANDN) */
   IROp    op    = Iop_INVALID;   /* the IROp to use, if any */
   void*   hAddr = NULL;          /* else: helper function address ... */
   Bool    eLeft = False;         /* if True, E is the left operand */
   const HChar*  hName = NULL;    /* ... and its name, for the IR call */

   /* Record both the address and the stringified name of a helper. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   /* Map the opcode byte to an IROp, or to a helper for the two ops
      that have no direct IROp. */
   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack ops take E as the left operand (eLeft). */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   /* Fetch the G operand, complemented first in the PANDN case. */
   argG = getMMXReg(gregOfRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch the E operand: another MMX register, or a 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregOfRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Decide the operand order; pack/unpack ops want E on the left. */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   /* Compute the result, either with the IROp or via a helper call. */
   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   /* The result always lands in G. */
   putMMXReg( gregOfRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
       nameMMXReg(gregOfRM(modrm)) );

   return delta;
}
5698
5699
5700/* Vector by scalar shift of G by the amount specified at the bottom
5701   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
5702
5703static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
5704                                 const HChar* opname, IROp op )
5705{
5706   HChar   dis_buf[50];
5707   Int     alen, size;
5708   IRTemp  addr;
5709   Bool    shl, shr, sar;
5710   UChar   rm   = getIByte(delta);
5711   IRTemp  g0   = newTemp(Ity_I64);
5712   IRTemp  g1   = newTemp(Ity_I64);
5713   IRTemp  amt  = newTemp(Ity_I32);
5714   IRTemp  amt8 = newTemp(Ity_I8);
5715
5716   if (epartIsReg(rm)) {
5717      assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
5718      DIP("%s %s,%s\n", opname,
5719                        nameMMXReg(eregOfRM(rm)),
5720                        nameMMXReg(gregOfRM(rm)) );
5721      delta++;
5722   } else {
5723      addr = disAMode ( &alen, sorb, delta, dis_buf );
5724      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
5725      DIP("%s %s,%s\n", opname,
5726                        dis_buf,
5727                        nameMMXReg(gregOfRM(rm)) );
5728      delta += alen;
5729   }
5730   assign( g0,   getMMXReg(gregOfRM(rm)) );
5731   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
5732
5733   shl = shr = sar = False;
5734   size = 0;
5735   switch (op) {
5736      case Iop_ShlN16x4: shl = True; size = 32; break;
5737      case Iop_ShlN32x2: shl = True; size = 32; break;
5738      case Iop_Shl64:    shl = True; size = 64; break;
5739      case Iop_ShrN16x4: shr = True; size = 16; break;
5740      case Iop_ShrN32x2: shr = True; size = 32; break;
5741      case Iop_Shr64:    shr = True; size = 64; break;
5742      case Iop_SarN16x4: sar = True; size = 16; break;
5743      case Iop_SarN32x2: sar = True; size = 32; break;
5744      default: vassert(0);
5745   }
5746
5747   if (shl || shr) {
5748     assign(
5749        g1,
5750        IRExpr_ITE(
5751           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
5752           binop(op, mkexpr(g0), mkexpr(amt8)),
5753           mkU64(0)
5754        )
5755     );
5756   } else
5757   if (sar) {
5758     assign(
5759        g1,
5760        IRExpr_ITE(
5761           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
5762           binop(op, mkexpr(g0), mkexpr(amt8)),
5763           binop(op, mkexpr(g0), mkU8(size-1))
5764        )
5765     );
5766   } else {
5767      /*NOTREACHED*/
5768      vassert(0);
5769   }
5770
5771   putMMXReg( gregOfRM(rm), mkexpr(g1) );
5772   return delta;
5773}
5774
5775
5776/* Vector by scalar shift of E by an immediate byte.  This is a
5777   straight copy of dis_SSE_shiftE_imm. */
5778
5779static
5780UInt dis_MMX_shiftE_imm ( Int delta, const HChar* opname, IROp op )
5781{
5782   Bool    shl, shr, sar;
5783   UChar   rm   = getIByte(delta);
5784   IRTemp  e0   = newTemp(Ity_I64);
5785   IRTemp  e1   = newTemp(Ity_I64);
5786   UChar   amt, size;
5787   vassert(epartIsReg(rm));
5788   vassert(gregOfRM(rm) == 2
5789           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
5790   amt = getIByte(delta+1);
5791   delta += 2;
5792   DIP("%s $%d,%s\n", opname,
5793                      (Int)amt,
5794                      nameMMXReg(eregOfRM(rm)) );
5795
5796   assign( e0, getMMXReg(eregOfRM(rm)) );
5797
5798   shl = shr = sar = False;
5799   size = 0;
5800   switch (op) {
5801      case Iop_ShlN16x4: shl = True; size = 16; break;
5802      case Iop_ShlN32x2: shl = True; size = 32; break;
5803      case Iop_Shl64:    shl = True; size = 64; break;
5804      case Iop_SarN16x4: sar = True; size = 16; break;
5805      case Iop_SarN32x2: sar = True; size = 32; break;
5806      case Iop_ShrN16x4: shr = True; size = 16; break;
5807      case Iop_ShrN32x2: shr = True; size = 32; break;
5808      case Iop_Shr64:    shr = True; size = 64; break;
5809      default: vassert(0);
5810   }
5811
5812   if (shl || shr) {
5813      assign( e1, amt >= size
5814                     ? mkU64(0)
5815                     : binop(op, mkexpr(e0), mkU8(amt))
5816      );
5817   } else
5818   if (sar) {
5819      assign( e1, amt >= size
5820                     ? binop(op, mkexpr(e0), mkU8(size-1))
5821                     : binop(op, mkexpr(e0), mkU8(amt))
5822      );
5823   } else {
5824      /*NOTREACHED*/
5825      vassert(0);
5826   }
5827
5828   putMMXReg( eregOfRM(rm), mkexpr(e1) );
5829   return delta;
5830}
5831
5832
5833/* Completely handle all MMX instructions except emms. */
5834
5835static
5836UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
5837{
5838   Int   len;
5839   UChar modrm;
5840   HChar dis_buf[50];
5841   UChar opc = getIByte(delta);
5842   delta++;
5843
5844   /* dis_MMX handles all insns except emms. */
5845   do_MMX_preamble();
5846
5847   switch (opc) {
5848
5849      case 0x6E:
5850         /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
5851         if (sz != 4)
5852            goto mmx_decode_failure;
5853         modrm = getIByte(delta);
5854         if (epartIsReg(modrm)) {
5855            delta++;
5856            putMMXReg(
5857               gregOfRM(modrm),
5858               binop( Iop_32HLto64,
5859                      mkU32(0),
5860                      getIReg(4, eregOfRM(modrm)) ) );
5861            DIP("movd %s, %s\n",
5862                nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5863         } else {
5864            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5865            delta += len;
5866            putMMXReg(
5867               gregOfRM(modrm),
5868               binop( Iop_32HLto64,
5869                      mkU32(0),
5870                      loadLE(Ity_I32, mkexpr(addr)) ) );
5871            DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
5872         }
5873         break;
5874
5875      case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
5876         if (sz != 4)
5877            goto mmx_decode_failure;
5878         modrm = getIByte(delta);
5879         if (epartIsReg(modrm)) {
5880            delta++;
5881            putIReg( 4, eregOfRM(modrm),
5882                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5883            DIP("movd %s, %s\n",
5884                nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
5885         } else {
5886            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5887            delta += len;
5888            storeLE( mkexpr(addr),
5889                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5890            DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
5891         }
5892         break;
5893
5894      case 0x6F:
5895         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
5896         if (sz != 4)
5897            goto mmx_decode_failure;
5898         modrm = getIByte(delta);
5899         if (epartIsReg(modrm)) {
5900            delta++;
5901            putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
5902            DIP("movq %s, %s\n",
5903                nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5904         } else {
5905            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5906            delta += len;
5907            putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
5908            DIP("movq %s, %s\n",
5909                dis_buf, nameMMXReg(gregOfRM(modrm)));
5910         }
5911         break;
5912
5913      case 0x7F:
5914         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
5915         if (sz != 4)
5916            goto mmx_decode_failure;
5917         modrm = getIByte(delta);
5918         if (epartIsReg(modrm)) {
5919            delta++;
5920            putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
5921            DIP("movq %s, %s\n",
5922                nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
5923         } else {
5924            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5925            delta += len;
5926            storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
5927            DIP("mov(nt)q %s, %s\n",
5928                nameMMXReg(gregOfRM(modrm)), dis_buf);
5929         }
5930         break;
5931
5932      case 0xFC:
5933      case 0xFD:
5934      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
5935         if (sz != 4)
5936            goto mmx_decode_failure;
5937         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
5938         break;
5939
5940      case 0xEC:
5941      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
5942         if (sz != 4)
5943            goto mmx_decode_failure;
5944         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
5945         break;
5946
5947      case 0xDC:
5948      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5949         if (sz != 4)
5950            goto mmx_decode_failure;
5951         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
5952         break;
5953
5954      case 0xF8:
5955      case 0xF9:
5956      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
5957         if (sz != 4)
5958            goto mmx_decode_failure;
5959         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
5960         break;
5961
5962      case 0xE8:
5963      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
5964         if (sz != 4)
5965            goto mmx_decode_failure;
5966         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
5967         break;
5968
5969      case 0xD8:
5970      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5971         if (sz != 4)
5972            goto mmx_decode_failure;
5973         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
5974         break;
5975
5976      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
5977         if (sz != 4)
5978            goto mmx_decode_failure;
5979         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
5980         break;
5981
5982      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
5983         if (sz != 4)
5984            goto mmx_decode_failure;
5985         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
5986         break;
5987
5988      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
5989         vassert(sz == 4);
5990         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
5991         break;
5992
5993      case 0x74:
5994      case 0x75:
5995      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
5996         if (sz != 4)
5997            goto mmx_decode_failure;
5998         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
5999         break;
6000
6001      case 0x64:
6002      case 0x65:
6003      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
6004         if (sz != 4)
6005            goto mmx_decode_failure;
6006         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
6007         break;
6008
6009      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
6010         if (sz != 4)
6011            goto mmx_decode_failure;
6012         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
6013         break;
6014
6015      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
6016         if (sz != 4)
6017            goto mmx_decode_failure;
6018         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
6019         break;
6020
6021      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
6022         if (sz != 4)
6023            goto mmx_decode_failure;
6024         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
6025         break;
6026
6027      case 0x68:
6028      case 0x69:
6029      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
6030         if (sz != 4)
6031            goto mmx_decode_failure;
6032         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
6033         break;
6034
6035      case 0x60:
6036      case 0x61:
6037      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
6038         if (sz != 4)
6039            goto mmx_decode_failure;
6040         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
6041         break;
6042
6043      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
6044         if (sz != 4)
6045            goto mmx_decode_failure;
6046         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
6047         break;
6048
6049      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
6050         if (sz != 4)
6051            goto mmx_decode_failure;
6052         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
6053         break;
6054
6055      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
6056         if (sz != 4)
6057            goto mmx_decode_failure;
6058         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
6059         break;
6060
6061      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
6062         if (sz != 4)
6063            goto mmx_decode_failure;
6064         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
6065         break;
6066
6067#     define SHIFT_BY_REG(_name,_op)                                 \
6068                delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
6069                break;
6070
6071      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
6072      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
6073      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
6074      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
6075
6076      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
6077      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
6078      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
6079      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
6080
6081      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
6082      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
6083      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
6084
6085#     undef SHIFT_BY_REG
6086
6087      case 0x71:
6088      case 0x72:
6089      case 0x73: {
6090         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
6091         UChar byte2, subopc;
6092         if (sz != 4)
6093            goto mmx_decode_failure;
6094         byte2  = getIByte(delta);           /* amode / sub-opcode */
6095         subopc = toUChar( (byte2 >> 3) & 7 );
6096
6097#        define SHIFT_BY_IMM(_name,_op)                         \
6098             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
6099             } while (0)
6100
6101              if (subopc == 2 /*SRL*/ && opc == 0x71)
6102                 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
6103         else if (subopc == 2 /*SRL*/ && opc == 0x72)
6104                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
6105         else if (subopc == 2 /*SRL*/ && opc == 0x73)
6106                 SHIFT_BY_IMM("psrlq", Iop_Shr64);
6107
6108         else if (subopc == 4 /*SAR*/ && opc == 0x71)
6109                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
6110         else if (subopc == 4 /*SAR*/ && opc == 0x72)
6111                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);
6112
6113         else if (subopc == 6 /*SHL*/ && opc == 0x71)
6114                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
6115         else if (subopc == 6 /*SHL*/ && opc == 0x72)
6116                 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
6117         else if (subopc == 6 /*SHL*/ && opc == 0x73)
6118                 SHIFT_BY_IMM("psllq", Iop_Shl64);
6119
6120         else goto mmx_decode_failure;
6121
6122#        undef SHIFT_BY_IMM
6123         break;
6124      }
6125
6126      case 0xF7: {
6127         IRTemp addr    = newTemp(Ity_I32);
6128         IRTemp regD    = newTemp(Ity_I64);
6129         IRTemp regM    = newTemp(Ity_I64);
6130         IRTemp mask    = newTemp(Ity_I64);
6131         IRTemp olddata = newTemp(Ity_I64);
6132         IRTemp newdata = newTemp(Ity_I64);
6133
6134         modrm = getIByte(delta);
6135         if (sz != 4 || (!epartIsReg(modrm)))
6136            goto mmx_decode_failure;
6137         delta++;
6138
6139         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
6140         assign( regM, getMMXReg( eregOfRM(modrm) ));
6141         assign( regD, getMMXReg( gregOfRM(modrm) ));
6142         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
6143         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
6144         assign( newdata,
6145                 binop(Iop_Or64,
6146                       binop(Iop_And64,
6147                             mkexpr(regD),
6148                             mkexpr(mask) ),
6149                       binop(Iop_And64,
6150                             mkexpr(olddata),
6151                             unop(Iop_Not64, mkexpr(mask)))) );
6152         storeLE( mkexpr(addr), mkexpr(newdata) );
6153         DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
6154                                 nameMMXReg( gregOfRM(modrm) ) );
6155         break;
6156      }
6157
6158      /* --- MMX decode failure --- */
6159      default:
6160      mmx_decode_failure:
6161         *decode_ok = False;
6162         return delta; /* ignored */
6163
6164   }
6165
6166   *decode_ok = True;
6167   return delta;
6168}
6169
6170
6171/*------------------------------------------------------------*/
6172/*--- More misc arithmetic and other obscure insns.        ---*/
6173/*------------------------------------------------------------*/
6174
/* Double length left and right shifts.  Apparently only required in
   v-size (no b- variant). */
static
UInt dis_SHLRD_Gv_Ev ( UChar sorb,
                       Int delta, UChar modrm,
                       Int sz,
                       IRExpr* shift_amt,
                       Bool amt_is_literal,
                       const HChar* shift_amt_txt,
                       Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty       = szToITy(sz);
   IRTemp gsrc     = newTemp(ty);       /* G operand: supplies shifted-in bits */
   IRTemp esrc     = newTemp(ty);       /* E operand: the destination */
   IRTemp addr     = IRTemp_INVALID;
   IRTemp tmpSH    = newTemp(Ity_I8);   /* masked shift amount */
   IRTemp tmpL     = IRTemp_INVALID;    /* double-length E:G (or G:E) pair */
   IRTemp tmpRes   = IRTemp_INVALID;    /* final result written back to E */
   IRTemp tmpSubSh = IRTemp_INVALID;    /* value shifted by (amt-1), for flags */
   IROp   mkpair;
   IROp   getres;
   IROp   shift;
   IRExpr* mask = NULL;

   vassert(sz == 2 || sz == 4);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIReg(sz, gregOfRM(modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIReg(sz, eregOfRM(modrm)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
   } else {
      addr = disAMode ( &len, sorb, delta, dis_buf );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIReg(sz, gregOfRM(modrm)), dis_buf);
   }

   /* Round up the relevant primops. */

   if (sz == 4) {
      tmpL     = newTemp(Ity_I64);
      tmpRes   = newTemp(Ity_I32);
      tmpSubSh = newTemp(Ity_I32);
      mkpair   = Iop_32HLto64;
      getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
      shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
      mask     = mkU8(31);
   } else {
      /* sz == 2 */
      /* NOTE(review): the count is masked to 15 here, whereas hardware
         masks the count mod 32 and leaves 16-bit results undefined for
         counts 16..31; this picks one defined behavior -- confirm this
         is intentional. */
      tmpL     = newTemp(Ity_I32);
      tmpRes   = newTemp(Ity_I16);
      tmpSubSh = newTemp(Ity_I16);
      mkpair   = Iop_16HLto32;
      getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
      shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
      mask     = mkU8(15);
   }

   /* Do the shift, calculate the subshift value, and set
      the flag thunk. */

   assign( tmpSH, binop(Iop_And8, shift_amt, mask) );

   if (left_shift)
      assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
   else
      assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );

   assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
   /* The value shifted one place less far; presumably the flag-thunk
      helper recovers the carry-out from it. */
   assign( tmpSubSh,
           unop(getres,
                binop(shift,
                      mkexpr(tmpL),
                      binop(Iop_And8,
                            binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                            mask))) );

   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
                              tmpRes, tmpSubSh, ty, tmpSH );

   /* Put result back. */

   if (epartIsReg(modrm)) {
      putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
   } else {
      storeLE( mkexpr(addr), mkexpr(tmpRes) );
   }

   /* If the amount came from an imm8 byte, step over that byte too. */
   if (amt_is_literal) delta++;
   return delta;
}
6293
6294
/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   required. */

/* Selects the bit-op variant: BT (test only), BTS (test-and-set),
   BTR (test-and-reset), BTC (test-and-complement). */
typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
6299
6300static const HChar* nameBtOp ( BtOp op )
6301{
6302   switch (op) {
6303      case BtOpNone:  return "";
6304      case BtOpSet:   return "s";
6305      case BtOpReset: return "r";
6306      case BtOpComp:  return "c";
6307      default: vpanic("nameBtOp(x86)");
6308   }
6309}
6310
6311
/* Disassemble BT/BTS/BTR/BTC Gv, Ev.  'op' selects the variant,
   'locked' notes a LOCK prefix, sz is 2 or 4 and delta points at the
   modrm byte; returns the updated delta.  When E is a register it is
   first spilled onto the client's stack so that a single byte-level
   memory sequence handles both forms. */
static
UInt dis_bt_G_E ( VexAbiInfo* vbi,
                  UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
          t_addr1, t_esp, t_mask, t_new;

   vassert(sz == 2 || sz == 4);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_esp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);    /* byte containing the selected bit */
   t_new     = newTemp(Ity_I8);    /* updated byte (BTS/BTR/BTC only) */
   t_bitno0  = newTemp(Ity_I32);   /* raw (sign-widened) bit number from G */
   t_bitno1  = newTemp(Ity_I32);   /* bit number, masked if E is a reg */
   t_bitno2  = newTemp(Ity_I8);    /* bit offset within the byte */
   t_addr1   = newTemp(Ity_I32);   /* address of the byte holding the bit */
   modrm     = getIByte(delta);

   assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack. */
      t_esp = newTemp(Ity_I32);
      t_addr0 = newTemp(Ity_I32);

      /* For the choice of the value 128, see comment in dis_bt_G_E in
         guest_amd64_toIR.c.  We point out here only that 128 is
         fast-cased in Memcheck and is > 0, so seems like a good
         choice. */
      vassert(vbi->guest_stack_redzone_size == 0);
      assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
      putIReg(4, R_ESP, mkexpr(t_esp));

      storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_esp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And32,
                              mkexpr(t_bitno0),
                              mkU32(sz == 4 ? 31 : 15)) );

   } else {
      t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence. */
   /* Note Sar32 rather than Shr32: the bit number was sign-widened
      above, so for the memory form a negative bit index addresses
      storage below t_addr0. */
   assign( t_addr1,
           binop(Iop_Add32,
                 mkexpr(t_addr0),
                 binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_32to8,
                binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(x86)");
      }
      /* Only a real (non-spilled-register) memory destination needs
         the atomic compare-and-swap. */
      if (locked && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_EIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And32,
                  binop(Iop_Shr32,
                        unop(Iop_8Uto32, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU32(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_esp still points at it. */
      putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
       ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );

   return delta;
}
6453
6454
6455
/* Handle BSF/BSR.  Only v-size seems necessary. */
/* fwds==True gives BSF (scan from LSB, using Ctz32); fwds==False
   gives BSR (scan from MSB, via 31 - Clz32).  Z is set iff the
   source is zero, in which case the destination is left unchanged. */
static
UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
{
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty  = szToITy(sz);
   IRTemp src = newTemp(ty);
   IRTemp dst = newTemp(ty);

   IRTemp src32 = newTemp(Ity_I32);   /* src widened to 32 bits */
   IRTemp dst32 = newTemp(Ity_I32);   /* 32-bit result before narrowing */
   IRTemp srcB  = newTemp(Ity_I1);    /* src != 0 ? */

   vassert(sz == 4 || sz == 2);

   modrm = getIByte(delta);

   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIReg(sz, eregOfRM(modrm)) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
       nameIReg(sz, gregOfRM(modrm)));

   /* Generate a bool expression which is zero iff the original is
      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
      instrumented by Memcheck, is instrumented expensively, since
      this may be used on the output of a preceding movmskb insn,
      which has been known to be partially defined, and in need of
      careful handling. */
   assign( srcB, binop(mkSizedOp(ty,Iop_ExpCmpNE8),
                       mkexpr(src), mkU(ty,0)) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_ITE( mkexpr(srcB),
                        /* src!=0 */
                        mkU32(0),
                        /* src==0 */
                        mkU32(X86G_CC_MASK_Z)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
      But anyway, Intel x86 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf32:  if src == 0 then 0 else  Ctz32(src)
          bsr32:  if src == 0 then 0 else  31 - Clz32(src)

          bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
          bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))

      First, widen src to 32 bits if it is not already.

      Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
      dst register unchanged when src == 0.  Hence change accordingly.
   */
   if (sz == 2)
      assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
   else
      assign( src32, mkexpr(src) );

   /* The main computation, guarding against zero. */
   assign( dst32,
           IRExpr_ITE(
              mkexpr(srcB),
              /* src != 0 */
              fwds ? unop(Iop_Ctz32, mkexpr(src32))
                   : binop(Iop_Sub32,
                           mkU32(31),
                           unop(Iop_Clz32, mkexpr(src32))),
              /* src == 0 -- leave dst unchanged */
              widenUto32( getIReg( sz, gregOfRM(modrm) ) )
           )
         );

   if (sz == 2)
      assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
   else
      assign( dst, mkexpr(dst32) );

   /* dump result back */
   putIReg( sz, gregOfRM(modrm), mkexpr(dst) );

   return delta;
}
6565
6566
6567static
6568void codegen_xchg_eAX_Reg ( Int sz, Int reg )
6569{
6570   IRType ty = szToITy(sz);
6571   IRTemp t1 = newTemp(ty);
6572   IRTemp t2 = newTemp(ty);
6573   vassert(sz == 2 || sz == 4);
6574   assign( t1, getIReg(sz, R_EAX) );
6575   assign( t2, getIReg(sz, reg) );
6576   putIReg( sz, R_EAX, mkexpr(t2) );
6577   putIReg( sz, reg, mkexpr(t1) );
6578   DIP("xchg%c %s, %s\n",
6579       nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
6580}
6581
6582
6583static
6584void codegen_SAHF ( void )
6585{
6586   /* Set the flags to:
6587      (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
6588      | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6589                |X86G_CC_MASK_P|X86G_CC_MASK_C)
6590   */
6591   UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6592                       |X86G_CC_MASK_C|X86G_CC_MASK_P;
6593   IRTemp oldflags   = newTemp(Ity_I32);
6594   assign( oldflags, mk_x86g_calculate_eflags_all() );
6595   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6596   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6597   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6598   stmt( IRStmt_Put( OFFB_CC_DEP1,
6599         binop(Iop_Or32,
6600               binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
6601               binop(Iop_And32,
6602                     binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
6603                     mkU32(mask_SZACP))
6604              )
6605   ));
6606   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6607      elimination of previous stores to this field work better. */
6608   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6609}
6610
6611
6612static
6613void codegen_LAHF ( void  )
6614{
6615   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
6616   IRExpr* eax_with_hole;
6617   IRExpr* new_byte;
6618   IRExpr* new_eax;
6619   UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6620                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
6621
6622   IRTemp  flags = newTemp(Ity_I32);
6623   assign( flags, mk_x86g_calculate_eflags_all() );
6624
6625   eax_with_hole
6626      = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
6627   new_byte
6628      = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
6629                        mkU32(1<<1));
6630   new_eax
6631      = binop(Iop_Or32, eax_with_hole,
6632                        binop(Iop_Shl32, new_byte, mkU8(8)));
6633   putIReg(4, R_EAX, new_eax);
6634}
6635
6636
/* Disassemble CMPXCHG G,E: compare %eAX with E; if equal, E := G,
   otherwise %eAX := E.  The flag thunk records the comparison
   (Z set on success).  Returns the updated delta0. */
static
UInt dis_cmpxchg_G_E ( UChar       sorb,
                       Bool        locked,
                       Int         size,
                       Int         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);      /* accumulator (%eAX): expected value */
   IRTemp src   = newTemp(ty);      /* G: value stored into E on success */
   IRTemp dest  = newTemp(ty);      /* E: old value at the destination */
   IRTemp dest2 = newTemp(ty);      /* new value for E */
   IRTemp acc2  = newTemp(ty);      /* new value for %eAX */
   IRTemp cond  = newTemp(Ity_I1);  /* comparison succeeded (Z)? */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on ITE

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on ITE

      reg-mem, locked: use IRCAS
   */
   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIReg(size, eregOfRM(rm)) );
      delta0++;
      assign( src, getIReg(size, gregOfRM(rm)) );
      assign( acc, getIReg(size, R_EAX) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_x86g_calculate_condition(X86CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIReg(size, R_EAX, mkexpr(acc2));
      putIReg(size, eregOfRM(rm), mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIReg(size,gregOfRM(rm)),
                               nameIReg(size,eregOfRM(rm)) );
   }
   else if (!epartIsReg(rm) && !locked) {
      /* case 2 */
      addr = disAMode ( &len, sorb, delta0, dis_buf );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIReg(size, gregOfRM(rm)) );
      assign( acc, getIReg(size, R_EAX) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_x86g_calculate_condition(X86CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIReg(size, R_EAX, mkexpr(acc2));
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIReg(size,gregOfRM(rm)), dis_buf);
   }
   else if (!epartIsReg(rm) && locked) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for EAX accordingly: in case of success, EAX is
         unchanged. */
      addr = disAMode ( &len, sorb, delta0, dis_buf );
      delta0 += len;
      assign( src, getIReg(size, gregOfRM(rm)) );
      assign( acc, getIReg(size, R_EAX) );
      /* The CAS both performs the store (if acc matched) and yields
         the old memory value in 'dest'. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_x86g_calculate_condition(X86CondZ) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIReg(size, R_EAX, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIReg(size,gregOfRM(rm)), dis_buf);
   }
   else vassert(0);

   return delta0;
}
6723
6724
6725/* Handle conditional move instructions of the form
6726      cmovcc E(reg-or-mem), G(reg)
6727
6728   E(src) is reg-or-mem
6729   G(dst) is reg.
6730
6731   If E is reg, -->    GET %E, tmps
6732                       GET %G, tmpd
6733                       CMOVcc tmps, tmpd
6734                       PUT tmpd, %G
6735
6736   If E is mem  -->    (getAddr E) -> tmpa
6737                       LD (tmpa), tmps
6738                       GET %G, tmpd
6739                       CMOVcc tmps, tmpd
6740                       PUT tmpd, %G
6741*/
6742static
6743UInt dis_cmov_E_G ( UChar       sorb,
6744                    Int         sz,
6745                    X86Condcode cond,
6746                    Int         delta0 )
6747{
6748   UChar rm  = getIByte(delta0);
6749   HChar dis_buf[50];
6750   Int   len;
6751
6752   IRType ty   = szToITy(sz);
6753   IRTemp tmps = newTemp(ty);
6754   IRTemp tmpd = newTemp(ty);
6755
6756   if (epartIsReg(rm)) {
6757      assign( tmps, getIReg(sz, eregOfRM(rm)) );
6758      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6759
6760      putIReg(sz, gregOfRM(rm),
6761                  IRExpr_ITE( mk_x86g_calculate_condition(cond),
6762                              mkexpr(tmps),
6763                              mkexpr(tmpd) )
6764             );
6765      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6766                              name_X86Condcode(cond),
6767                              nameIReg(sz,eregOfRM(rm)),
6768                              nameIReg(sz,gregOfRM(rm)));
6769      return 1+delta0;
6770   }
6771
6772   /* E refers to memory */
6773   {
6774      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6775      assign( tmps, loadLE(ty, mkexpr(addr)) );
6776      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6777
6778      putIReg(sz, gregOfRM(rm),
6779                  IRExpr_ITE( mk_x86g_calculate_condition(cond),
6780                              mkexpr(tmps),
6781                              mkexpr(tmpd) )
6782             );
6783
6784      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6785                              name_X86Condcode(cond),
6786                              dis_buf,
6787                              nameIReg(sz,gregOfRM(rm)));
6788      return len+delta0;
6789   }
6790}
6791
6792
/* Disassemble XADD G,E: tmp := E + G; G := old E; E := tmp, setting
   the flags from the addition.  Sets *decodeOK and returns the
   updated delta0. */
static
UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
                    Bool* decodeOK )
{
   Int   len;
   UChar rm = getIByte(delta0);
   HChar dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp tmpd  = newTemp(ty);    /* old value of E */
   IRTemp tmpt0 = newTemp(ty);    /* value of G */
   IRTemp tmpt1 = newTemp(ty);    /* sum, the new value for E */

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix,
               generate 'naive' (non-atomic) sequence

      reg-mem, not locked: ignore any lock prefix, generate 'naive'
                           (non-atomic) sequence

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( tmpd,  getIReg(sz, eregOfRM(rm)));
      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIReg(sz,gregOfRM(rm)),
          nameIReg(sz,eregOfRM(rm)));
      *decodeOK = True;
      return 1+delta0;
   }
   else if (!epartIsReg(rm) && !locked) {
      /* case 2 */
      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      storeLE( mkexpr(addr), mkexpr(tmpt1) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
      *decodeOK = True;
      return len+delta0;
   }
   else if (!epartIsReg(rm) && locked) {
      /* case 3 */
      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      /* Atomic update: retry (via casLE's failure path) if memory
         changed between the load and the store. */
      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
                           mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
      *decodeOK = True;
      return len+delta0;
   }
   /*UNREACHED*/
   vassert(0);
}
6866
6867/* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
6868
6869static
6870UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
6871{
6872   Int    len;
6873   IRTemp addr;
6874   UChar  rm  = getIByte(delta0);
6875   HChar  dis_buf[50];
6876
6877   if (epartIsReg(rm)) {
6878      putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
6879      DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
6880      return 1+delta0;
6881   } else {
6882      addr = disAMode ( &len, sorb, delta0, dis_buf );
6883      putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
6884      DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
6885      return len+delta0;
6886   }
6887}
6888
6889/* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
6890   dst is ireg and sz==4, zero out top half of it.  */
6891
6892static
6893UInt dis_mov_Sw_Ew ( UChar sorb,
6894                     Int   sz,
6895                     Int   delta0 )
6896{
6897   Int    len;
6898   IRTemp addr;
6899   UChar  rm  = getIByte(delta0);
6900   HChar  dis_buf[50];
6901
6902   vassert(sz == 2 || sz == 4);
6903
6904   if (epartIsReg(rm)) {
6905      if (sz == 4)
6906         putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
6907      else
6908         putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
6909
6910      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
6911      return 1+delta0;
6912   } else {
6913      addr = disAMode ( &len, sorb, delta0, dis_buf );
6914      storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
6915      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
6916      return len+delta0;
6917   }
6918}
6919
6920
6921static
6922void dis_push_segreg ( UInt sreg, Int sz )
6923{
6924    IRTemp t1 = newTemp(Ity_I16);
6925    IRTemp ta = newTemp(Ity_I32);
6926    vassert(sz == 2 || sz == 4);
6927
6928    assign( t1, getSReg(sreg) );
6929    assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
6930    putIReg(4, R_ESP, mkexpr(ta));
6931    storeLE( mkexpr(ta), mkexpr(t1) );
6932
6933    DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6934}
6935
6936static
6937void dis_pop_segreg ( UInt sreg, Int sz )
6938{
6939    IRTemp t1 = newTemp(Ity_I16);
6940    IRTemp ta = newTemp(Ity_I32);
6941    vassert(sz == 2 || sz == 4);
6942
6943    assign( ta, getIReg(4, R_ESP) );
6944    assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
6945
6946    putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
6947    putSReg( sreg, mkexpr(t1) );
6948    DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6949}
6950
6951static
6952void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
6953{
6954   IRTemp t1 = newTemp(Ity_I32);
6955   IRTemp t2 = newTemp(Ity_I32);
6956   assign(t1, getIReg(4,R_ESP));
6957   assign(t2, loadLE(Ity_I32,mkexpr(t1)));
6958   putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
6959   jmp_treg(dres, Ijk_Ret, t2);
6960   vassert(dres->whatNext == Dis_StopHere);
6961}
6962
6963/*------------------------------------------------------------*/
6964/*--- SSE/SSE2/SSE3 helpers                                ---*/
6965/*------------------------------------------------------------*/
6966
6967/* Indicates whether the op requires a rounding-mode argument.  Note
6968   that this covers only vector floating point arithmetic ops, and
6969   omits the scalar ones that need rounding modes.  Note also that
6970   inconsistencies here will get picked up later by the IR sanity
6971   checker, so this isn't correctness-critical. */
6972static Bool requiresRMode ( IROp op )
6973{
6974   switch (op) {
6975      /* 128 bit ops */
6976      case Iop_Add32Fx4: case Iop_Sub32Fx4:
6977      case Iop_Mul32Fx4: case Iop_Div32Fx4:
6978      case Iop_Add64Fx2: case Iop_Sub64Fx2:
6979      case Iop_Mul64Fx2: case Iop_Div64Fx2:
6980         return True;
6981      default:
6982         break;
6983   }
6984   return False;
6985}
6986
6987
6988/* Worker function; do not call directly.
6989   Handles full width G = G `op` E   and   G = (not G) `op` E.
6990*/
6991
static UInt dis_SSE_E_to_G_all_wrk (
               UChar sorb, Int delta,
               const HChar* opname, IROp op,
               Bool   invertG
            )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getIByte(delta);
   /* The G operand, optionally complemented first (invertG supports
      the andn-style "not G op E" forms). */
   IRExpr* gpart
      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
                : getXMMReg(gregOfRM(rm));
   if (epartIsReg(rm)) {
      /* Register E operand. */
      putXMMReg(
         gregOfRM(rm),
         requiresRMode(op)
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        getXMMReg(eregOfRM(rm)))
            : binop(op, gpart,
                        getXMMReg(eregOfRM(rm)))
      );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRM(rm)),
                        nameXMMReg(gregOfRM(rm)) );
      return delta+1;
   } else {
      /* Memory E operand: full 128-bit load. */
      addr = disAMode ( &alen, sorb, delta, dis_buf );
      putXMMReg(
         gregOfRM(rm),
         requiresRMode(op)
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
            : binop(op, gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
      );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRM(rm)) );
      return delta+alen;
   }
}
7036
7037
7038/* All lanes SSE binary operation, G = G `op` E. */
7039
7040static
7041UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, const HChar* opname, IROp op )
7042{
7043   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
7044}
7045
7046/* All lanes SSE binary operation, G = (not G) `op` E. */
7047
7048static
7049UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
7050                               const HChar* opname, IROp op )
7051{
7052   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
7053}
7054
7055
7056/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
7057
static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
                                  const HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getIByte(delta);
   IRExpr* gpart = getXMMReg(gregOfRM(rm));
   if (epartIsReg(rm)) {
      /* Register E: op's lo-32-lane semantics leave G's upper
         lanes untouched. */
      putXMMReg( gregOfRM(rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRM(rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRM(rm)),
                        nameXMMReg(gregOfRM(rm)) );
      return delta+1;
   } else {
      /* We can only do a 32-bit memory read, so the upper 3/4 of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, sorb, delta, dis_buf );
      assign( epart, unop( Iop_32UtoV128,
                           loadLE(Ity_I32, mkexpr(addr))) );
      putXMMReg( gregOfRM(rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRM(rm)) );
      return delta+alen;
   }
}
7089
7090
7091/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
7092
static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
                                  const HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getIByte(delta);
   IRExpr* gpart = getXMMReg(gregOfRM(rm));
   if (epartIsReg(rm)) {
      /* Register E: op's lo-64-lane semantics leave G's upper
         lane untouched. */
      putXMMReg( gregOfRM(rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRM(rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRM(rm)),
                        nameXMMReg(gregOfRM(rm)) );
      return delta+1;
   } else {
      /* We can only do a 64-bit memory read, so the upper half of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, sorb, delta, dis_buf );
      assign( epart, unop( Iop_64UtoV128,
                           loadLE(Ity_I64, mkexpr(addr))) );
      putXMMReg( gregOfRM(rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRM(rm)) );
      return delta+alen;
   }
}
7124
7125
7126/* All lanes unary SSE operation, G = op(E). */
7127
static UInt dis_SSE_E_to_G_unary_all (
               UChar sorb, Int delta,
               const HChar* opname, IROp op
            )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getIByte(delta);
   if (epartIsReg(rm)) {
      /* Register E operand. */
      putXMMReg( gregOfRM(rm),
                 unop(op, getXMMReg(eregOfRM(rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRM(rm)),
                        nameXMMReg(gregOfRM(rm)) );
      return delta+1;
   } else {
      /* Memory E operand: full 128-bit load. */
      addr = disAMode ( &alen, sorb, delta, dis_buf );
      putXMMReg( gregOfRM(rm),
                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRM(rm)) );
      return delta+alen;
   }
}
7154
7155
7156/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
7157
static UInt dis_SSE_E_to_G_unary_lo32 (
               UChar sorb, Int delta,
               const HChar* opname, IROp op
            )
{
   /* First we need to get the old G value and patch the low 32 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getIByte(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* G as it currently is */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with E's low 32 bits spliced in */

   assign( oldG0, getXMMReg(gregOfRM(rm)) );

   if (epartIsReg(rm)) {
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     getXMMRegLane32(eregOfRM(rm), 0)) );
      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRM(rm)),
                        nameXMMReg(gregOfRM(rm)) );
      return delta+1;
   } else {
      /* Memory E operand: only the low 32 bits are read. */
      addr = disAMode ( &alen, sorb, delta, dis_buf );
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     loadLE(Ity_I32, mkexpr(addr)) ));
      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRM(rm)) );
      return delta+alen;
   }
}
7197
7198
7199/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
7200
static UInt dis_SSE_E_to_G_unary_lo64 (
               UChar sorb, Int delta,
               const HChar* opname, IROp op
            )
{
   /* First we need to get the old G value and patch the low 64 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getIByte(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* G as it currently is */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with E's low 64 bits spliced in */

   assign( oldG0, getXMMReg(gregOfRM(rm)) );

   if (epartIsReg(rm)) {
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     getXMMRegLane64(eregOfRM(rm), 0)) );
      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRM(rm)),
                        nameXMMReg(gregOfRM(rm)) );
      return delta+1;
   } else {
      /* Memory E operand: only the low 64 bits are read. */
      addr = disAMode ( &alen, sorb, delta, dis_buf );
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     loadLE(Ity_I64, mkexpr(addr)) ));
      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRM(rm)) );
      return delta+alen;
   }
}
7240
7241
7242/* SSE integer binary operation:
7243      G = G `op` E   (eLeft == False)
7244      G = E `op` G   (eLeft == True)
7245*/
7246static UInt dis_SSEint_E_to_G(
7247               UChar sorb, Int delta,
7248               const HChar* opname, IROp op,
7249               Bool   eLeft
7250            )
7251{
7252   HChar   dis_buf[50];
7253   Int     alen;
7254   IRTemp  addr;
7255   UChar   rm = getIByte(delta);
7256   IRExpr* gpart = getXMMReg(gregOfRM(rm));
7257   IRExpr* epart = NULL;
7258   if (epartIsReg(rm)) {
7259      epart = getXMMReg(eregOfRM(rm));
7260      DIP("%s %s,%s\n", opname,
7261                        nameXMMReg(eregOfRM(rm)),
7262                        nameXMMReg(gregOfRM(rm)) );
7263      delta += 1;
7264   } else {
7265      addr  = disAMode ( &alen, sorb, delta, dis_buf );
7266      epart = loadLE(Ity_V128, mkexpr(addr));
7267      DIP("%s %s,%s\n", opname,
7268                        dis_buf,
7269                        nameXMMReg(gregOfRM(rm)) );
7270      delta += alen;
7271   }
7272   putXMMReg( gregOfRM(rm),
7273              eLeft ? binop(op, epart, gpart)
7274	            : binop(op, gpart, epart) );
7275   return delta;
7276}
7277
7278
7279/* Helper for doing SSE FP comparisons. */
7280
/* Map an SSE compare immediate (imm8) to an IR comparison op.
   The low two bits select EQ/LT/LE/UNORD; bit 2 selects the
   negated form, reported via *needNot.  sz (4 or 8) and all_lanes
   pick the lane width and vector-vs-scalar flavour. */
static void findSSECmpOp ( Bool* needNot, IROp* op,
                           Int imm8, Bool all_lanes, Int sz )
{
   imm8 &= 7;
   *needNot = False;
   *op      = Iop_INVALID;
   /* imm8 values 4..7 are the negations of 0..3. */
   if (imm8 >= 4) {
      *needNot = True;
      imm8 -= 4;
   }

   if (sz == 4 && all_lanes) {
      switch (imm8) {
         case 0: *op = Iop_CmpEQ32Fx4; return;
         case 1: *op = Iop_CmpLT32Fx4; return;
         case 2: *op = Iop_CmpLE32Fx4; return;
         case 3: *op = Iop_CmpUN32Fx4; return;
         default: break;
      }
   }
   if (sz == 4 && !all_lanes) {
      switch (imm8) {
         case 0: *op = Iop_CmpEQ32F0x4; return;
         case 1: *op = Iop_CmpLT32F0x4; return;
         case 2: *op = Iop_CmpLE32F0x4; return;
         case 3: *op = Iop_CmpUN32F0x4; return;
         default: break;
      }
   }
   if (sz == 8 && all_lanes) {
      switch (imm8) {
         case 0: *op = Iop_CmpEQ64Fx2; return;
         case 1: *op = Iop_CmpLT64Fx2; return;
         case 2: *op = Iop_CmpLE64Fx2; return;
         case 3: *op = Iop_CmpUN64Fx2; return;
         default: break;
      }
   }
   if (sz == 8 && !all_lanes) {
      switch (imm8) {
         case 0: *op = Iop_CmpEQ64F0x2; return;
         case 1: *op = Iop_CmpLT64F0x2; return;
         case 2: *op = Iop_CmpLE64F0x2; return;
         case 3: *op = Iop_CmpUN64F0x2; return;
         default: break;
      }
   }
   /* Unreachable for valid sz; asserted here rather than silently
      returning Iop_INVALID. */
   vpanic("findSSECmpOp(x86,guest)");
}
7330
7331/* Handles SSE 32F/64F comparisons. */
7332
static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
                                const HChar* opname, Bool all_lanes, Int sz )
{
   HChar   dis_buf[50];
   Int     alen, imm8;
   IRTemp  addr;
   Bool    needNot = False;
   IROp    op      = Iop_INVALID;
   IRTemp  plain   = newTemp(Ity_V128);   /* un-negated compare result */
   UChar   rm      = getIByte(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      /* Register E; imm8 follows the modrm byte. */
      imm8 = getIByte(delta+1);
      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
      assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
                               getXMMReg(eregOfRM(rm))) );
      delta += 2;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            nameXMMReg(eregOfRM(rm)),
                            nameXMMReg(gregOfRM(rm)) );
   } else {
      /* Memory E; imm8 follows the addressing bytes.  Scalar forms
         load only the bottom 32/64 bits, zero-extended to V128. */
      addr = disAMode ( &alen, sorb, delta, dis_buf );
      imm8 = getIByte(delta+alen);
      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRM(rm)),
                   all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
                 : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                 : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
             )
      );
      delta += alen+1;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            dis_buf,
                            nameXMMReg(gregOfRM(rm)) );
   }

   if (needNot && all_lanes) {
      /* Negated vector compare: flip every bit. */
      putXMMReg( gregOfRM(rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (needNot && !all_lanes) {
      /* Negated scalar compare: flip only the low 32/64 bits, leaving
         the untouched upper lanes of G alone.  The mkV128 mask has one
         mask bit per 8 result bits. */
      mask = toUShort( sz==4 ? 0x000F : 0x00FF );
      putXMMReg( gregOfRM(rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRM(rm), mkexpr(plain) );
   }

   return delta;
}
7391
7392
7393/* Vector by scalar shift of G by the amount specified at the bottom
7394   of E. */
7395
7396static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
7397                                 const HChar* opname, IROp op )
7398{
7399   HChar   dis_buf[50];
7400   Int     alen, size;
7401   IRTemp  addr;
7402   Bool    shl, shr, sar;
7403   UChar   rm   = getIByte(delta);
7404   IRTemp  g0   = newTemp(Ity_V128);
7405   IRTemp  g1   = newTemp(Ity_V128);
7406   IRTemp  amt  = newTemp(Ity_I32);
7407   IRTemp  amt8 = newTemp(Ity_I8);
7408   if (epartIsReg(rm)) {
7409      assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
7410      DIP("%s %s,%s\n", opname,
7411                        nameXMMReg(eregOfRM(rm)),
7412                        nameXMMReg(gregOfRM(rm)) );
7413      delta++;
7414   } else {
7415      addr = disAMode ( &alen, sorb, delta, dis_buf );
7416      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
7417      DIP("%s %s,%s\n", opname,
7418                        dis_buf,
7419                        nameXMMReg(gregOfRM(rm)) );
7420      delta += alen;
7421   }
7422   assign( g0,   getXMMReg(gregOfRM(rm)) );
7423   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
7424
7425   shl = shr = sar = False;
7426   size = 0;
7427   switch (op) {
7428      case Iop_ShlN16x8: shl = True; size = 32; break;
7429      case Iop_ShlN32x4: shl = True; size = 32; break;
7430      case Iop_ShlN64x2: shl = True; size = 64; break;
7431      case Iop_SarN16x8: sar = True; size = 16; break;
7432      case Iop_SarN32x4: sar = True; size = 32; break;
7433      case Iop_ShrN16x8: shr = True; size = 16; break;
7434      case Iop_ShrN32x4: shr = True; size = 32; break;
7435      case Iop_ShrN64x2: shr = True; size = 64; break;
7436      default: vassert(0);
7437   }
7438
7439   if (shl || shr) {
7440     assign(
7441        g1,
7442        IRExpr_ITE(
7443           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
7444           binop(op, mkexpr(g0), mkexpr(amt8)),
7445           mkV128(0x0000)
7446        )
7447     );
7448   } else
7449   if (sar) {
7450     assign(
7451        g1,
7452        IRExpr_ITE(
7453           binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
7454           binop(op, mkexpr(g0), mkexpr(amt8)),
7455           binop(op, mkexpr(g0), mkU8(size-1))
7456        )
7457     );
7458   } else {
7459      /*NOTREACHED*/
7460      vassert(0);
7461   }
7462
7463   putXMMReg( gregOfRM(rm), mkexpr(g1) );
7464   return delta;
7465}
7466
7467
7468/* Vector by scalar shift of E by an immediate byte. */
7469
static
UInt dis_SSE_shiftE_imm ( Int delta, const HChar* opname, IROp op )
{
   Bool    shl, shr, sar;
   UChar   rm   = getIByte(delta);
   IRTemp  e0   = newTemp(Ity_V128);   /* E before shifting */
   IRTemp  e1   = newTemp(Ity_V128);   /* E after shifting */
   UChar   amt, size;
   /* This form is always reg,imm8; the greg field is the opcode
      extension (/2 = shr, /4 = sar, /6 = shl). */
   vassert(epartIsReg(rm));
   vassert(gregOfRM(rm) == 2
           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   amt = getIByte(delta+1);
   delta += 2;
   DIP("%s $%d,%s\n", opname,
                      (Int)amt,
                      nameXMMReg(eregOfRM(rm)) );
   assign( e0, getXMMReg(eregOfRM(rm)) );

   /* 'size' is the lane width; out-of-range amounts are resolved at
      decode time since amt is a compile-time constant here. */
   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   if (shl || shr) {
      /* Logical shift by >= lane width gives all zeroes. */
      assign( e1, amt >= size
                     ? mkV128(0x0000)
                     : binop(op, mkexpr(e0), mkU8(amt))
      );
   } else
   if (sar) {
      /* Arithmetic shift by >= lane width == shift by size-1. */
      assign( e1, amt >= size
                     ? binop(op, mkexpr(e0), mkU8(size-1))
                     : binop(op, mkexpr(e0), mkU8(amt))
      );
   } else {
      /*NOTREACHED*/
      vassert(0);
   }

   putXMMReg( eregOfRM(rm), mkexpr(e1) );
   return delta;
}
7521
7522
7523/* Get the current SSE rounding mode. */
7524
7525static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
7526{
7527   return binop( Iop_And32,
7528                 IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
7529                 mkU32(3) );
7530}
7531
/* Write a new SSE rounding mode (an Ity_I32 expression) to the
   guest state. */
static void put_sse_roundingmode ( IRExpr* sseround )
{
   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
}
7537
7538/* Break a 128-bit value up into four 32-bit ints. */
7539
7540static void breakup128to32s ( IRTemp t128,
7541			      /*OUTs*/
7542                              IRTemp* t3, IRTemp* t2,
7543                              IRTemp* t1, IRTemp* t0 )
7544{
7545   IRTemp hi64 = newTemp(Ity_I64);
7546   IRTemp lo64 = newTemp(Ity_I64);
7547   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
7548   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
7549
7550   vassert(t0 && *t0 == IRTemp_INVALID);
7551   vassert(t1 && *t1 == IRTemp_INVALID);
7552   vassert(t2 && *t2 == IRTemp_INVALID);
7553   vassert(t3 && *t3 == IRTemp_INVALID);
7554
7555   *t0 = newTemp(Ity_I32);
7556   *t1 = newTemp(Ity_I32);
7557   *t2 = newTemp(Ity_I32);
7558   *t3 = newTemp(Ity_I32);
7559   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
7560   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
7561   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
7562   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
7563}
7564
7565/* Construct a 128-bit value from four 32-bit ints. */
7566
7567static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
7568                              IRTemp t1, IRTemp t0 )
7569{
7570   return
7571      binop( Iop_64HLtoV128,
7572             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
7573             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
7574   );
7575}
7576
7577/* Break a 64-bit value up into four 16-bit ints. */
7578
7579static void breakup64to16s ( IRTemp t64,
7580                             /*OUTs*/
7581                             IRTemp* t3, IRTemp* t2,
7582                             IRTemp* t1, IRTemp* t0 )
7583{
7584   IRTemp hi32 = newTemp(Ity_I32);
7585   IRTemp lo32 = newTemp(Ity_I32);
7586   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
7587   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
7588
7589   vassert(t0 && *t0 == IRTemp_INVALID);
7590   vassert(t1 && *t1 == IRTemp_INVALID);
7591   vassert(t2 && *t2 == IRTemp_INVALID);
7592   vassert(t3 && *t3 == IRTemp_INVALID);
7593
7594   *t0 = newTemp(Ity_I16);
7595   *t1 = newTemp(Ity_I16);
7596   *t2 = newTemp(Ity_I16);
7597   *t3 = newTemp(Ity_I16);
7598   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
7599   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
7600   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
7601   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
7602}
7603
7604/* Construct a 64-bit value from four 16-bit ints. */
7605
7606static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
7607                             IRTemp t1, IRTemp t0 )
7608{
7609   return
7610      binop( Iop_32HLto64,
7611             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
7612             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
7613   );
7614}
7615
7616/* Generate IR to set the guest %EFLAGS from the pushfl-format image
7617   in the given 32-bit temporary.  The flags that are set are: O S Z A
7618   C P D ID AC.
7619
7620   In all cases, code to set AC is generated.  However, VEX actually
7621   ignores the AC value and so can optionally emit an emulation
7622   warning when it is enabled.  In this routine, an emulation warning
7623   is only emitted if emit_AC_emwarn is True, in which case
7624   next_insn_EIP must be correct (this allows for correct code
7625   generation for popfl/popfw).  If emit_AC_emwarn is False,
7626   next_insn_EIP is unimportant (this allows for easy if kludgey code
7627   generation for IRET.) */
7628
static
void set_EFLAGS_from_value ( IRTemp t1,
                             Bool   emit_AC_emwarn,
                             Addr32 next_insn_EIP )
{
   vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);

   /* t1 is the flag word.  Mask out everything except OSZACP and set
      the flags thunk to X86G_CC_OP_COPY. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     binop(Iop_And32,
                           mkexpr(t1),
                           mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
                                  | X86G_CC_MASK_A | X86G_CC_MASK_Z
                                  | X86G_CC_MASK_S| X86G_CC_MASK_O )
                          )
                    )
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));

   /* Also need to set the D flag, which is held in bit 10 of t1.
      If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   stmt( IRStmt_Put(
            OFFB_DFLAG,
            IRExpr_ITE(
               unop(Iop_32to1,
                    binop(Iop_And32,
                          binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
                          mkU32(1))),
               mkU32(0xFFFFFFFF),
               mkU32(1)))
       );

   /* Set the ID flag, held in bit 21 of t1. */
   stmt( IRStmt_Put(
            OFFB_IDFLAG,
            IRExpr_ITE(
               unop(Iop_32to1,
                    binop(Iop_And32,
                          binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
                          mkU32(1))),
               mkU32(1),
               mkU32(0)))
       );

   /* And set the AC flag, held in bit 18 of t1.  If setting it 1 to,
      possibly emit an emulation warning. */
   stmt( IRStmt_Put(
            OFFB_ACFLAG,
            IRExpr_ITE(
               unop(Iop_32to1,
                    binop(Iop_And32,
                          binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
                          mkU32(1))),
               mkU32(1),
               mkU32(0)))
       );

   if (emit_AC_emwarn) {
      /* Side-exit to next_insn_EIP with an emulation warning if AC
         (bit 18) is being set; VEX itself ignores AC. */
      put_emwarn( mkU32(EmWarn_X86_acFlag) );
      stmt(
         IRStmt_Exit(
            binop( Iop_CmpNE32,
                   binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
                   mkU32(0) ),
            Ijk_EmWarn,
            IRConst_U32( next_insn_EIP ),
            OFFB_EIP
         )
      );
   }
}
7705
7706
7707/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
7708   values (aa,bb), computes, for each of the 4 16-bit lanes:
7709
7710   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
7711*/
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);   /* aa lanes 3,2 sign-extended to 32 bits */
   IRTemp aalo32s = newTemp(Ity_I64);   /* aa lanes 1,0 sign-extended to 32 bits */
   IRTemp bbhi32s = newTemp(Ity_I64);
   IRTemp bblo32s = newTemp(Ity_I64);
   IRTemp rHi     = newTemp(Ity_I64);   /* results for lanes 3,2 */
   IRTemp rLo     = newTemp(Ity_I64);   /* results for lanes 1,0 */
   IRTemp one32x2 = newTemp(Ity_I64);   /* constant 1 in each 32-bit half */
   assign(aa, aax);
   assign(bb, bbx);
   /* Sign-extend each 16-bit lane to 32 bits: interleave a lane with
      itself (placing it in the high half) then arithmetic-shift right
      by 16. */
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   /* Per 32-bit lane: ((a*b >>u 14) + 1) >>u 1, i.e. round-to-nearest
      of the high 16 bits of the 32-bit product. */
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* Narrow the four 32-bit results back to four 16-bit lanes. */
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
7777
7778/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
7779   values (aa,bb), computes, for each lane:
7780
7781          if aa_lane < 0 then - bb_lane
7782     else if aa_lane > 0 then bb_lane
7783     else 0
7784*/
7785static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
7786{
7787   IRTemp aa       = newTemp(Ity_I64);
7788   IRTemp bb       = newTemp(Ity_I64);
7789   IRTemp zero     = newTemp(Ity_I64);
7790   IRTemp bbNeg    = newTemp(Ity_I64);
7791   IRTemp negMask  = newTemp(Ity_I64);
7792   IRTemp posMask  = newTemp(Ity_I64);
7793   IROp   opSub    = Iop_INVALID;
7794   IROp   opCmpGTS = Iop_INVALID;
7795
7796   switch (laneszB) {
7797      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
7798      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
7799      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
7800      default: vassert(0);
7801   }
7802
7803   assign( aa,      aax );
7804   assign( bb,      bbx );
7805   assign( zero,    mkU64(0) );
7806   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
7807   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
7808   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
7809
7810   return
7811      binop(Iop_Or64,
7812            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
7813            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
7814
7815}
7816
7817/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
7818   value aa, computes, for each lane
7819
7820   if aa < 0 then -aa else aa
7821
7822   Note that the result is interpreted as unsigned, so that the
7823   absolute value of the most negative signed input can be
7824   represented.
7825*/
7826static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
7827{
7828   IRTemp aa      = newTemp(Ity_I64);
7829   IRTemp zero    = newTemp(Ity_I64);
7830   IRTemp aaNeg   = newTemp(Ity_I64);
7831   IRTemp negMask = newTemp(Ity_I64);
7832   IRTemp posMask = newTemp(Ity_I64);
7833   IROp   opSub   = Iop_INVALID;
7834   IROp   opSarN  = Iop_INVALID;
7835
7836   switch (laneszB) {
7837      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
7838      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
7839      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
7840      default: vassert(0);
7841   }
7842
7843   assign( aa,      aax );
7844   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
7845   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
7846   assign( zero,    mkU64(0) );
7847   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
7848   return
7849      binop(Iop_Or64,
7850            binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
7851            binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
7852}
7853
7854static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
7855                                        IRTemp lo64, Int byteShift )
7856{
7857   vassert(byteShift >= 1 && byteShift <= 7);
7858   return
7859      binop(Iop_Or64,
7860            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
7861            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
7862      );
7863}
7864
7865/* Generate a SIGSEGV followed by a restart of the current instruction
7866   if effective_addr is not 16-aligned.  This is required behaviour
7867   for some SSE3 instructions and all 128-bit SSSE3 instructions.
7868   This assumes that guest_RIP_curr_instr is set correctly! */
7869static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
7870{
7871   stmt(
7872      IRStmt_Exit(
7873         binop(Iop_CmpNE32,
7874               binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
7875               mkU32(0)),
7876         Ijk_SigSEGV,
7877         IRConst_U32(guest_EIP_curr_instr),
7878         OFFB_EIP
7879      )
7880   );
7881}
7882
7883
7884/* Helper for deciding whether a given insn (starting at the opcode
7885   byte) may validly be used with a LOCK prefix.  The following insns
7886   may be used with LOCK when their destination operand is in memory.
7887   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
7888
7889   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
7890   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
7891   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
7893   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
7894   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
7895   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
7896
7897   DEC        FE /1,  FF /1
7898   INC        FE /0,  FF /0
7899
7900   NEG        F6 /3,  F7 /3
7901   NOT        F6 /2,  F7 /2
7902
7903   XCHG       86, 87
7904
7905   BTC        0F BB,  0F BA /7
7906   BTR        0F B3,  0F BA /6
7907   BTS        0F AB,  0F BA /5
7908
7909   CMPXCHG    0F B0,  0F B1
7910   CMPXCHG8B  0F C7 /1
7911
7912   XADD       0F C0,  0F C1
7913
7914   ------------------------------
7915
7916   80 /0  =  addb $imm8,  rm8
7917   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
7918   82 /0  =  addb $imm8,  rm8
7919   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
7920
7921   00     =  addb r8,  rm8
7922   01     =  addl r32, rm32  and  addw r16, rm16
7923
7924   Same for ADD OR ADC SBB AND SUB XOR
7925
7926   FE /1  = dec rm8
7927   FF /1  = dec rm32  and  dec rm16
7928
7929   FE /0  = inc rm8
7930   FF /0  = inc rm32  and  inc rm16
7931
7932   F6 /3  = neg rm8
7933   F7 /3  = neg rm32  and  neg rm16
7934
7935   F6 /2  = not rm8
7936   F7 /2  = not rm32  and  not rm16
7937
7938   0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
7940
7941   Same for BTS, BTR
7942*/
7943static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
7944{
7945   switch (opc[0]) {
7946      case 0x00: case 0x01: case 0x08: case 0x09:
7947      case 0x10: case 0x11: case 0x18: case 0x19:
7948      case 0x20: case 0x21: case 0x28: case 0x29:
7949      case 0x30: case 0x31:
7950         if (!epartIsReg(opc[1]))
7951            return True;
7952         break;
7953
7954      case 0x80: case 0x81: case 0x82: case 0x83:
7955         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
7956             && !epartIsReg(opc[1]))
7957            return True;
7958         break;
7959
7960      case 0xFE: case 0xFF:
7961         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
7962             && !epartIsReg(opc[1]))
7963            return True;
7964         break;
7965
7966      case 0xF6: case 0xF7:
7967         if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
7968             && !epartIsReg(opc[1]))
7969            return True;
7970         break;
7971
7972      case 0x86: case 0x87:
7973         if (!epartIsReg(opc[1]))
7974            return True;
7975         break;
7976
7977      case 0x0F: {
7978         switch (opc[1]) {
7979            case 0xBB: case 0xB3: case 0xAB:
7980               if (!epartIsReg(opc[2]))
7981                  return True;
7982               break;
7983            case 0xBA:
7984               if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
7985                   && !epartIsReg(opc[2]))
7986                  return True;
7987               break;
7988            case 0xB0: case 0xB1:
7989               if (!epartIsReg(opc[2]))
7990                  return True;
7991               break;
7992            case 0xC7:
7993               if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
7994                  return True;
7995               break;
7996            case 0xC0: case 0xC1:
7997               if (!epartIsReg(opc[2]))
7998                  return True;
7999               break;
8000            default:
8001               break;
8002         } /* switch (opc[1]) */
8003         break;
8004      }
8005
8006      default:
8007         break;
8008   } /* switch (opc[0]) */
8009
8010   return False;
8011}
8012
8013static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
8014{
8015   IRTemp t2 = newTemp(ty);
8016   if (ty == Ity_I32) {
8017      assign( t2,
8018         binop(
8019            Iop_Or32,
8020            binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
8021            binop(
8022               Iop_Or32,
8023               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
8024                                mkU32(0x00FF0000)),
8025               binop(Iop_Or32,
8026                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
8027                                      mkU32(0x0000FF00)),
8028                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
8029                                      mkU32(0x000000FF) )
8030            )))
8031      );
8032      return t2;
8033   }
8034   if (ty == Ity_I16) {
8035      assign(t2,
8036             binop(Iop_Or16,
8037                   binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
8038                   binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
8039      return t2;
8040   }
8041   vassert(0);
8042   /*NOTREACHED*/
8043   return IRTemp_INVALID;
8044}
8045
8046/*------------------------------------------------------------*/
8047/*--- Disassemble a single instruction                     ---*/
8048/*------------------------------------------------------------*/
8049
8050/* Disassemble a single instruction into IR.  The instruction is
8051   located in host memory at &guest_code[delta].  *expect_CAS is set
8052   to True if the resulting IR is expected to contain an IRCAS
8053   statement, and False if it's not expected to.  This makes it
8054   possible for the caller of disInstr_X86_WRK to check that
8055   LOCK-prefixed instructions are at least plausibly translated, in
8056   that it becomes possible to check that a (validly) LOCK-prefixed
8057   instruction generates a translation containing an IRCAS, and
8058   instructions without LOCK prefixes don't generate translations
8059   containing an IRCAS.
8060*/
8061static
8062DisResult disInstr_X86_WRK (
8063             /*OUT*/Bool* expect_CAS,
8064             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
8065             Bool         resteerCisOk,
8066             void*        callback_opaque,
8067             Long         delta64,
8068             VexArchInfo* archinfo,
8069             VexAbiInfo*  vbi,
8070             Bool         sigill_diag
8071          )
8072{
8073   IRType    ty;
8074   IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
8075   Int       alen;
8076   UChar     opc, modrm, abyte, pre;
8077   UInt      d32;
8078   HChar     dis_buf[50];
8079   Int       am_sz, d_sz, n_prefixes;
8080   DisResult dres;
8081   UChar*    insn; /* used in SSE decoders */
8082
8083   /* The running delta */
8084   Int delta = (Int)delta64;
8085
8086   /* Holds eip at the start of the insn, so that we can print
8087      consistent error messages for unimplemented insns. */
8088   Int delta_start = delta;
8089
8090   /* sz denotes the nominal data-op size of the insn; we change it to
8091      2 if an 0x66 prefix is seen */
8092   Int sz = 4;
8093
8094   /* sorb holds the segment-override-prefix byte, if any.  Zero if no
8095      prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
8096      indicating the prefix.  */
8097   UChar sorb = 0;
8098
8099   /* Gets set to True if a LOCK prefix is seen. */
8100   Bool pfx_lock = False;
8101
8102   /* Set result defaults. */
8103   dres.whatNext    = Dis_Continue;
8104   dres.len         = 0;
8105   dres.continueAt  = 0;
8106   dres.jk_StopHere = Ijk_INVALID;
8107
8108   *expect_CAS = False;
8109
8110   addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
8111
8112   vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
8113   DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
8114
8115   /* Spot "Special" instructions (see comment at top of file). */
8116   {
8117      UChar* code = (UChar*)(guest_code + delta);
8118      /* Spot the 12-byte preamble:
8119         C1C703   roll $3,  %edi
8120         C1C70D   roll $13, %edi
8121         C1C71D   roll $29, %edi
8122         C1C713   roll $19, %edi
8123      */
8124      if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
8125          code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
8126          code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
8127          code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
8128         /* Got a "Special" instruction preamble.  Which one is it? */
8129         if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
8130            /* %EDX = client_request ( %EAX ) */
8131            DIP("%%edx = client_request ( %%eax )\n");
8132            delta += 14;
8133            jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
8134            vassert(dres.whatNext == Dis_StopHere);
8135            goto decode_success;
8136         }
8137         else
8138         if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
8139            /* %EAX = guest_NRADDR */
8140            DIP("%%eax = guest_NRADDR\n");
8141            delta += 14;
8142            putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
8143            goto decode_success;
8144         }
8145         else
8146         if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
8147            /* call-noredir *%EAX */
8148            DIP("call-noredir *%%eax\n");
8149            delta += 14;
8150            t1 = newTemp(Ity_I32);
8151            assign(t1, getIReg(4,R_EAX));
8152            t2 = newTemp(Ity_I32);
8153            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
8154            putIReg(4, R_ESP, mkexpr(t2));
8155            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
8156            jmp_treg(&dres, Ijk_NoRedir, t1);
8157            vassert(dres.whatNext == Dis_StopHere);
8158            goto decode_success;
8159         }
8160         else
8161         if (code[12] == 0x87 && code[13] == 0xFF /* xchgl %edi,%edi */) {
8162            /* IR injection */
8163            DIP("IR injection\n");
8164            vex_inject_ir(irsb, Iend_LE);
8165
8166            // Invalidate the current insn. The reason is that the IRop we're
8167            // injecting here can change. In which case the translation has to
8168            // be redone. For ease of handling, we simply invalidate all the
8169            // time.
8170            stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_EIP_curr_instr)));
8171            stmt(IRStmt_Put(OFFB_CMLEN,   mkU32(14)));
8172
8173            delta += 14;
8174
8175            stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
8176            dres.whatNext    = Dis_StopHere;
8177            dres.jk_StopHere = Ijk_InvalICache;
8178            goto decode_success;
8179         }
8180         /* We don't know what it is. */
8181         goto decode_failure;
8182         /*NOTREACHED*/
8183      }
8184   }
8185
8186   /* Handle a couple of weird-ass NOPs that have been observed in the
8187      wild. */
8188   {
8189      UChar* code = (UChar*)(guest_code + delta);
8190      /* Sun's JVM 1.5.0 uses the following as a NOP:
8191         26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
8192      if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
8193          && code[3] == 0x65 && code[4] == 0x90) {
8194         DIP("%%es:%%cs:%%fs:%%gs:nop\n");
8195         delta += 5;
8196         goto decode_success;
8197      }
8198      /* Don't barf on recent binutils padding,
8199         all variants of which are: nopw %cs:0x0(%eax,%eax,1)
8200         66 2e 0f 1f 84 00 00 00 00 00
8201         66 66 2e 0f 1f 84 00 00 00 00 00
8202         66 66 66 2e 0f 1f 84 00 00 00 00 00
8203         66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8204         66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8205         66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8206      */
8207      if (code[0] == 0x66) {
8208         Int data16_cnt;
8209         for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
8210            if (code[data16_cnt] != 0x66)
8211               break;
8212         if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
8213             && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
8214             && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
8215             && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
8216             && code[data16_cnt + 8] == 0x00 ) {
8217            DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
8218            delta += 9 + data16_cnt;
8219            goto decode_success;
8220         }
8221      }
8222   }
8223
8224   /* Normal instruction handling starts here. */
8225
8226   /* Deal with some but not all prefixes:
8227         66(oso)
8228         F0(lock)
8229         2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
8230      Not dealt with (left in place):
8231         F2 F3
8232   */
8233   n_prefixes = 0;
8234   while (True) {
8235      if (n_prefixes > 7) goto decode_failure;
8236      pre = getUChar(delta);
8237      switch (pre) {
8238         case 0x66:
8239            sz = 2;
8240            break;
8241         case 0xF0:
8242            pfx_lock = True;
8243            *expect_CAS = True;
8244            break;
8245         case 0x3E: /* %DS: */
8246         case 0x26: /* %ES: */
8247         case 0x64: /* %FS: */
8248         case 0x65: /* %GS: */
8249            if (sorb != 0)
8250               goto decode_failure; /* only one seg override allowed */
8251            sorb = pre;
8252            break;
8253         case 0x2E: { /* %CS: */
8254            /* 2E prefix on a conditional branch instruction is a
8255               branch-prediction hint, which can safely be ignored.  */
8256            UChar op1 = getIByte(delta+1);
8257            UChar op2 = getIByte(delta+2);
8258            if ((op1 >= 0x70 && op1 <= 0x7F)
8259                || (op1 == 0xE3)
8260                || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
8261               if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
8262            } else {
8263               /* All other CS override cases are not handled */
8264               goto decode_failure;
8265            }
8266            break;
8267         }
8268         case 0x36: /* %SS: */
8269            /* SS override cases are not handled */
8270            goto decode_failure;
8271         default:
8272            goto not_a_prefix;
8273      }
8274      n_prefixes++;
8275      delta++;
8276   }
8277
8278   not_a_prefix:
8279
8280   /* Now we should be looking at the primary opcode byte or the
8281      leading F2 or F3.  Check that any LOCK prefix is actually
8282      allowed. */
8283
8284   if (pfx_lock) {
8285      if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
8286         DIP("lock ");
8287      } else {
8288         *expect_CAS = False;
8289         goto decode_failure;
8290      }
8291   }
8292
8293
8294   /* ---------------------------------------------------- */
8295   /* --- The SSE decoder.                             --- */
8296   /* ---------------------------------------------------- */
8297
8298   /* What did I do to deserve SSE ?  Perhaps I was really bad in a
8299      previous life? */
8300
8301   /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
8302      later section, further on. */
8303
8304   insn = (UChar*)&guest_code[delta];
8305
8306   /* Treat fxsave specially.  It should be doable even on an SSE0
8307      (Pentium-II class) CPU.  Hence be prepared to handle it on
8308      any subarchitecture variant.
8309   */
8310
8311   /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
8312   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8313       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
8314      IRDirty* d;
8315      modrm = getIByte(delta+2);
8316      vassert(sz == 4);
8317      vassert(!epartIsReg(modrm));
8318
8319      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8320      delta += 2+alen;
8321      gen_SEGV_if_not_16_aligned(addr);
8322
8323      DIP("fxsave %s\n", dis_buf);
8324
8325      /* Uses dirty helper:
8326            void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
8327      d = unsafeIRDirty_0_N (
8328             0/*regparms*/,
8329             "x86g_dirtyhelper_FXSAVE",
8330             &x86g_dirtyhelper_FXSAVE,
8331             mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
8332          );
8333
8334      /* declare we're writing memory */
8335      d->mFx   = Ifx_Write;
8336      d->mAddr = mkexpr(addr);
8337      d->mSize = 464; /* according to recent Intel docs */
8338
8339      /* declare we're reading guest state */
8340      d->nFxState = 7;
8341      vex_bzero(&d->fxState, sizeof(d->fxState));
8342
8343      d->fxState[0].fx     = Ifx_Read;
8344      d->fxState[0].offset = OFFB_FTOP;
8345      d->fxState[0].size   = sizeof(UInt);
8346
8347      d->fxState[1].fx     = Ifx_Read;
8348      d->fxState[1].offset = OFFB_FPREGS;
8349      d->fxState[1].size   = 8 * sizeof(ULong);
8350
8351      d->fxState[2].fx     = Ifx_Read;
8352      d->fxState[2].offset = OFFB_FPTAGS;
8353      d->fxState[2].size   = 8 * sizeof(UChar);
8354
8355      d->fxState[3].fx     = Ifx_Read;
8356      d->fxState[3].offset = OFFB_FPROUND;
8357      d->fxState[3].size   = sizeof(UInt);
8358
8359      d->fxState[4].fx     = Ifx_Read;
8360      d->fxState[4].offset = OFFB_FC3210;
8361      d->fxState[4].size   = sizeof(UInt);
8362
8363      d->fxState[5].fx     = Ifx_Read;
8364      d->fxState[5].offset = OFFB_XMM0;
8365      d->fxState[5].size   = 8 * sizeof(U128);
8366
8367      d->fxState[6].fx     = Ifx_Read;
8368      d->fxState[6].offset = OFFB_SSEROUND;
8369      d->fxState[6].size   = sizeof(UInt);
8370
8371      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8372	 images are packed back-to-back.  If not, the value of
8373	 d->fxState[5].size is wrong. */
8374      vassert(16 == sizeof(U128));
8375      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8376
8377      stmt( IRStmt_Dirty(d) );
8378
8379      goto decode_success;
8380   }
8381
8382   /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
8383   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8384       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
8385      IRDirty* d;
8386      modrm = getIByte(delta+2);
8387      vassert(sz == 4);
8388      vassert(!epartIsReg(modrm));
8389
8390      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8391      delta += 2+alen;
8392      gen_SEGV_if_not_16_aligned(addr);
8393
8394      DIP("fxrstor %s\n", dis_buf);
8395
8396      /* Uses dirty helper:
8397            VexEmNote x86g_do_FXRSTOR ( VexGuestX86State*, UInt )
8398         NOTE:
8399            the VexEmNote value is simply ignored (unlike for FRSTOR)
8400      */
8401      d = unsafeIRDirty_0_N (
8402             0/*regparms*/,
8403             "x86g_dirtyhelper_FXRSTOR",
8404             &x86g_dirtyhelper_FXRSTOR,
8405             mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
8406          );
8407
8408      /* declare we're reading memory */
8409      d->mFx   = Ifx_Read;
8410      d->mAddr = mkexpr(addr);
8411      d->mSize = 464; /* according to recent Intel docs */
8412
8413      /* declare we're writing guest state */
8414      d->nFxState = 7;
8415      vex_bzero(&d->fxState, sizeof(d->fxState));
8416
8417      d->fxState[0].fx     = Ifx_Write;
8418      d->fxState[0].offset = OFFB_FTOP;
8419      d->fxState[0].size   = sizeof(UInt);
8420
8421      d->fxState[1].fx     = Ifx_Write;
8422      d->fxState[1].offset = OFFB_FPREGS;
8423      d->fxState[1].size   = 8 * sizeof(ULong);
8424
8425      d->fxState[2].fx     = Ifx_Write;
8426      d->fxState[2].offset = OFFB_FPTAGS;
8427      d->fxState[2].size   = 8 * sizeof(UChar);
8428
8429      d->fxState[3].fx     = Ifx_Write;
8430      d->fxState[3].offset = OFFB_FPROUND;
8431      d->fxState[3].size   = sizeof(UInt);
8432
8433      d->fxState[4].fx     = Ifx_Write;
8434      d->fxState[4].offset = OFFB_FC3210;
8435      d->fxState[4].size   = sizeof(UInt);
8436
8437      d->fxState[5].fx     = Ifx_Write;
8438      d->fxState[5].offset = OFFB_XMM0;
8439      d->fxState[5].size   = 8 * sizeof(U128);
8440
8441      d->fxState[6].fx     = Ifx_Write;
8442      d->fxState[6].offset = OFFB_SSEROUND;
8443      d->fxState[6].size   = sizeof(UInt);
8444
8445      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8446	 images are packed back-to-back.  If not, the value of
8447	 d->fxState[5].size is wrong. */
8448      vassert(16 == sizeof(U128));
8449      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8450
8451      stmt( IRStmt_Dirty(d) );
8452
8453      goto decode_success;
8454   }
8455
8456   /* ------ SSE decoder main ------ */
8457
8458   /* Skip parts of the decoder which don't apply given the stated
8459      guest subarchitecture. */
8460   if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
8461      goto after_sse_decoders;
8462
8463   /* With mmxext only some extended MMX instructions are recognized.
8464      The mmxext instructions are MASKMOVQ MOVNTQ PAVGB PAVGW PMAXSW
8465      PMAXUB PMINSW PMINUB PMULHUW PSADBW PSHUFW PEXTRW PINSRW PMOVMSKB
8466      PREFETCHNTA PREFETCHT0 PREFETCHT1 PREFETCHT2 SFENCE
8467
8468      http://support.amd.com/us/Embedded_TechDocs/22466.pdf
8469      https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions */
8470
8471   if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
8472      goto mmxext;
8473
8474   /* Otherwise we must be doing sse1 or sse2, so we can at least try
8475      for SSE1 here. */
8476
8477   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
8478   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
8479      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
8480      goto decode_success;
8481   }
8482
8483   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
8484   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
8485      vassert(sz == 4);
8486      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
8487      goto decode_success;
8488   }
8489
8490   /* 0F 55 = ANDNPS -- G = (not G) and E */
8491   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
8492      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
8493      goto decode_success;
8494   }
8495
8496   /* 0F 54 = ANDPS -- G = G and E */
8497   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
8498      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
8499      goto decode_success;
8500   }
8501
8502   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
8503   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
8504      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
8505      goto decode_success;
8506   }
8507
8508   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
8509   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
8510      vassert(sz == 4);
8511      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
8512      goto decode_success;
8513   }
8514
8515   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
8516   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
8517   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
8518      IRTemp argL = newTemp(Ity_F32);
8519      IRTemp argR = newTemp(Ity_F32);
8520      modrm = getIByte(delta+2);
8521      if (epartIsReg(modrm)) {
8522         assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
8523         delta += 2+1;
8524         DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8525                                  nameXMMReg(gregOfRM(modrm)) );
8526      } else {
8527         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8528	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
8529         delta += 2+alen;
8530         DIP("[u]comiss %s,%s\n", dis_buf,
8531                                  nameXMMReg(gregOfRM(modrm)) );
8532      }
8533      assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
8534
8535      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
8536      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
8537      stmt( IRStmt_Put(
8538               OFFB_CC_DEP1,
8539               binop( Iop_And32,
8540                      binop(Iop_CmpF64,
8541                            unop(Iop_F32toF64,mkexpr(argL)),
8542                            unop(Iop_F32toF64,mkexpr(argR))),
8543                      mkU32(0x45)
8544          )));
8545      /* Set NDEP even though it isn't used.  This makes redundant-PUT
8546         elimination of previous stores to this field work better. */
8547      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
8548      goto decode_success;
8549   }
8550
8551   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
8552      half xmm */
8553   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
8554      IRTemp arg64 = newTemp(Ity_I64);
8555      IRTemp rmode = newTemp(Ity_I32);
8556      vassert(sz == 4);
8557
8558      modrm = getIByte(delta+2);
8559      do_MMX_preamble();
8560      if (epartIsReg(modrm)) {
8561         assign( arg64, getMMXReg(eregOfRM(modrm)) );
8562         delta += 2+1;
8563         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8564                                 nameXMMReg(gregOfRM(modrm)));
8565      } else {
8566         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8567	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
8568         delta += 2+alen;
8569         DIP("cvtpi2ps %s,%s\n", dis_buf,
8570                                 nameXMMReg(gregOfRM(modrm)) );
8571      }
8572
8573      assign( rmode, get_sse_roundingmode() );
8574
8575      putXMMRegLane32F(
8576         gregOfRM(modrm), 0,
8577         binop(Iop_F64toF32,
8578               mkexpr(rmode),
8579               unop(Iop_I32StoF64,
8580                    unop(Iop_64to32, mkexpr(arg64)) )) );
8581
8582      putXMMRegLane32F(
8583         gregOfRM(modrm), 1,
8584         binop(Iop_F64toF32,
8585               mkexpr(rmode),
8586               unop(Iop_I32StoF64,
8587                    unop(Iop_64HIto32, mkexpr(arg64)) )) );
8588
8589      goto decode_success;
8590   }
8591
8592   /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
8593      quarter xmm */
8594   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
8595      IRTemp arg32 = newTemp(Ity_I32);
8596      IRTemp rmode = newTemp(Ity_I32);
8597      vassert(sz == 4);
8598
8599      modrm = getIByte(delta+3);
8600      if (epartIsReg(modrm)) {
8601         assign( arg32, getIReg(4, eregOfRM(modrm)) );
8602         delta += 3+1;
8603         DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
8604                                 nameXMMReg(gregOfRM(modrm)));
8605      } else {
8606         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8607	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
8608         delta += 3+alen;
8609         DIP("cvtsi2ss %s,%s\n", dis_buf,
8610                                 nameXMMReg(gregOfRM(modrm)) );
8611      }
8612
8613      assign( rmode, get_sse_roundingmode() );
8614
8615      putXMMRegLane32F(
8616         gregOfRM(modrm), 0,
8617         binop(Iop_F64toF32,
8618               mkexpr(rmode),
8619               unop(Iop_I32StoF64, mkexpr(arg32)) ) );
8620
8621      goto decode_success;
8622   }
8623
   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, according to prevailing SSE rounding mode */
   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, rounding towards zero */
   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp dst64  = newTemp(Ity_I64);
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f32lo  = newTemp(Ity_F32);
      IRTemp f32hi  = newTemp(Ity_F32);
      /* True for the truncating (CVTT) form, 0F 2C. */
      Bool   r2zero = toBool(insn[1] == 0x2C);

      do_MMX_preamble();
      modrm = getIByte(delta+2);

      if (epartIsReg(modrm)) {
         delta += 2+1;
	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRM(modrm)),
                                   nameMMXReg(gregOfRM(modrm)));
      } else {
         /* Memory form: the two F32s are at [addr] and [addr+4]. */
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
                                              mkexpr(addr),
                                              mkU32(4) )));
         delta += 2+alen;
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameMMXReg(gregOfRM(modrm)));
      }

      /* CVTT always truncates; otherwise obey the MXCSR mode. */
      if (r2zero) {
         assign(rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Each F32 is widened exactly to F64 then rounded to I32; the
         two results are packed (hi lane in bits 63:32) into one I64. */
      assign(
         dst64,
         binop( Iop_32HLto64,
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
              )
      );

      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
      goto decode_success;
   }
8678
   /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
      I32 in ireg, according to prevailing SSE rounding mode */
   /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
      I32 in ireg, rounding towards zero */
   if (insn[0] == 0xF3 && insn[1] == 0x0F
       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
      IRTemp rmode = newTemp(Ity_I32);
      IRTemp f32lo = newTemp(Ity_F32);
      /* True for the truncating (CVTT) form, F3 0F 2C. */
      Bool   r2zero = toBool(insn[2] == 0x2C);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         delta += 3+1;
	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRM(modrm)),
                                   nameIReg(4, gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         delta += 3+alen;
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameIReg(4, gregOfRM(modrm)));
      }

      /* CVTT always truncates; otherwise obey the MXCSR mode. */
      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Widen exactly to F64 then round to a signed I32 result. */
      putIReg(4, gregOfRM(modrm),
                 binop( Iop_F64toI32S,
                        mkexpr(rmode),
                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
      );

      goto decode_success;
   }
8720
   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
      /* Generic packed E-to-G helper does all decode + IR emission. */
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
      vassert(sz == 4);
      /* Scalar form: only lane 0 of G is updated. */
      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
      goto decode_success;
   }
8733
   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {

      IRTemp t64 = newTemp(Ity_I64);
      IRTemp ew = newTemp(Ity_I32);

      modrm = getIByte(delta+2);
      vassert(!epartIsReg(modrm));   /* /2 form is memory-only */
      vassert(sz == 4);

      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
      delta += 2+alen;
      DIP("ldmxcsr %s\n", dis_buf);

      /* The only thing we observe in %mxcsr is the rounding mode.
         Therefore, pass the 32-bit value (SSE native-format control
         word) to a clean helper, getting back a 64-bit value, the
         lower half of which is the SSEROUND value to store, and the
         upper half of which is the emulation-warning token which may
         be generated.
      */
      /* ULong x86g_check_ldmxcsr ( UInt ); */
      assign( t64, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "x86g_check_ldmxcsr",
                      &x86g_check_ldmxcsr,
                      mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
                   )
            );

      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
      put_emwarn( mkexpr(ew) );
      /* Finally, if an emulation warning was reported, side-exit to
         the next insn, reporting the warning, so that Valgrind's
         dispatcher sees the warning. */
      stmt(
         IRStmt_Exit(
            binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
            Ijk_EmWarn,
            IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
            OFFB_EIP
         )
      );
      goto decode_success;
   }
8781
8782
   /* mmxext sse1 subset starts here. mmxext only arches will parse
      only this subset of the sse1 instructions. */
  mmxext:

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
      /* Delegate to the MMX decoder; it reports failure via 'ok'. */
      Bool ok = False;
      delta = dis_MMX( &ok, sorb, sz, delta+1 );
      if (!ok)
         goto decode_failure;
      goto decode_success;
   }
8796
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
      Intel manual does not say anything about the usual business of
      the FP reg tags getting trashed whenever an MMX insn happens.
      So we just leave them alone.
   */
   if (insn[0] == 0x0F && insn[1] == 0xE7) {
      modrm = getIByte(delta+2);
      if (sz == 4 && !epartIsReg(modrm)) {
         /* do_MMX_preamble(); Intel docs don't specify this */
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
         /* NOTE(review): this prints "movntq mem,reg" although the
            data flow is reg -> mem — confirm intended operand order. */
         DIP("movntq %s,%s\n", dis_buf,
                               nameMMXReg(gregOfRM(modrm)));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "pavgb", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "pavgw", False );
      goto decode_success;
   }
8834
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
      zero-extend of it in ireg(G). */
   if (insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (sz == 4 && epartIsReg(modrm)) {
         IRTemp sV = newTemp(Ity_I64);
         t5 = newTemp(Ity_I16);
         do_MMX_preamble();
         assign(sV, getMMXReg(eregOfRM(modrm)));
         /* Split the 64-bit source into its four 16-bit lanes. */
         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
         /* Only the low 2 bits of the imm8 select a lane. */
         switch (insn[3] & 3) {
            case 0:  assign(t5, mkexpr(t0)); break;
            case 1:  assign(t5, mkexpr(t1)); break;
            case 2:  assign(t5, mkexpr(t2)); break;
            case 3:  assign(t5, mkexpr(t3)); break;
            default: vassert(0); /*NOTREACHED*/
         }
         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
                           nameIReg(4,gregOfRM(modrm)));
         delta += 4;   /* 2 opcode bytes + modrm + imm8 */
         goto decode_success;
      }
      /* else fall through */
   }
8862
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
      put it into the specified lane of mmx(G). */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
         mmx reg.  t4 is the new lane value.  t5 is the original
         mmx value. t6 is the new mmx value. */
      Int lane;
      t4 = newTemp(Ity_I16);
      t5 = newTemp(Ity_I64);
      t6 = newTemp(Ity_I64);
      modrm = insn[2];
      do_MMX_preamble();

      assign(t5, getMMXReg(gregOfRM(modrm)));
      breakup64to16s( t5, &t3, &t2, &t1, &t0 );

      if (epartIsReg(modrm)) {
         assign(t4, getIReg(2, eregOfRM(modrm)));
         delta += 3+1;
         /* imm8 is the byte after modrm. */
         lane = insn[3+1-1];
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   nameIReg(2,eregOfRM(modrm)),
                                   nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         delta += 3+alen;
         /* imm8 follows the (alen-byte) addressing mode. */
         lane = insn[3+alen-1];
         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   dis_buf,
                                   nameMMXReg(gregOfRM(modrm)));
      }

      /* Rebuild the 64-bit value with t4 substituted into the
         selected lane (only the low 2 bits of imm8 matter). */
      switch (lane & 3) {
         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
         default: vassert(0); /*NOTREACHED*/
      }
      putMMXReg(gregOfRM(modrm), mkexpr(t6));
      goto decode_success;
   }
8907
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EE = PMAXSW -- 16x4 signed max */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "pmaxsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DE = PMAXUB -- 8x8 unsigned max */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "pmaxub", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EA = PMINSW -- 16x4 signed min */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "pminsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DA = PMINUB -- 8x8 unsigned min */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "pminub", False );
      goto decode_success;
   }
8943
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
      mmx(E), turn them into a byte, and put zero-extend of it in
      ireg(G). */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {   /* register-only form */
         do_MMX_preamble();
         t0 = newTemp(Ity_I64);
         t1 = newTemp(Ity_I32);
         assign(t0, getMMXReg(eregOfRM(modrm)));
         /* GetMSBs8x8 collects the 8 lane sign bits into one byte. */
         assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
         putIReg(4, gregOfRM(modrm), mkexpr(t1));
         DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
                                 nameIReg(4,gregOfRM(modrm)));
         delta += 3;
         goto decode_success;
      }
      /* else fall through */
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "pmuluh", False );
      goto decode_success;
   }
8973
   /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   /* 0F 18 /2 = PREFETCH1 */
   /* 0F 18 /3 = PREFETCH2 */
   /* Prefetches are pure hints: decode and advance, emit no IR. */
   if (insn[0] == 0x0F && insn[1] == 0x18
       && !epartIsReg(insn[2])
       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
      const HChar* hintstr = "??";

      modrm = getIByte(delta+2);
      vassert(!epartIsReg(modrm));

      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
      delta += 2+alen;

      /* The reg field of modrm selects the hint variant. */
      switch (gregOfRM(modrm)) {
         case 0: hintstr = "nta"; break;
         case 1: hintstr = "t0"; break;
         case 2: hintstr = "t1"; break;
         case 3: hintstr = "t2"; break;
         default: vassert(0); /*NOTREACHED*/
      }

      DIP("prefetch%s %s\n", hintstr, dis_buf);
      goto decode_success;
   }

   /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
   /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
   if (insn[0] == 0x0F && insn[1] == 0x0D
       && !epartIsReg(insn[2])
       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
      const HChar* hintstr = "??";

      modrm = getIByte(delta+2);
      vassert(!epartIsReg(modrm));

      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
      delta += 2+alen;

      switch (gregOfRM(modrm)) {
         case 0: hintstr = ""; break;
         case 1: hintstr = "w"; break;
         default: vassert(0); /*NOTREACHED*/
      }

      DIP("prefetch%s %s\n", hintstr, dis_buf);
      goto decode_success;
   }
9023
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 sorb, delta+2, insn[1], "psadbw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV = newTemp(Ity_I64);
      dV = newTemp(Ity_I64);
      do_MMX_preamble();
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregOfRM(modrm)) );
         order = (Int)insn[3];   /* imm8 directly after modrm */
         delta += 2+2;
         DIP("pshufw $%d,%s,%s\n", order,
                                   nameMMXReg(eregOfRM(modrm)),
                                   nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
	 order = (Int)insn[2+alen];   /* imm8 after the amode bytes */
         delta += 3+alen;
         DIP("pshufw $%d,%s,%s\n", order,
                                   dis_buf,
                                   nameMMXReg(gregOfRM(modrm)));
      }
      breakup64to16s( sV, &s3, &s2, &s1, &s0 );

      /* Each 2-bit field of the imm8 picks one source lane for the
         corresponding destination lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dV,
	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      putMMXReg(gregOfRM(modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
9071
   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
      vassert(sz == 4);
      delta += 3;
      /* Insert a memory fence.  It's sometimes important that these
         are carried through to the generated code. */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("sfence\n");
      goto decode_success;
   }

   /* End of mmxext sse1 subset. No more sse parsing for mmxext only arches. */
   if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
      goto after_sse_decoders;
9087
9088
   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
      goto decode_success;
   }

   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
      goto decode_success;
   }
9114
   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRM(modrm),
                    getXMMReg( eregOfRM(modrm) ));
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         /* Only the aligned variant (movaps) faults on misalignment. */
         if (insn[1] == 0x28/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         putXMMReg( gregOfRM(modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("mov[ua]ps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRM(modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }

   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   if (sz == 4 && insn[0] == 0x0F
       && (insn[1] == 0x29 || insn[1] == 0x11)) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; awaiting test case */
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         /* Only the aligned variant (movaps) faults on misalignment. */
         if (insn[1] == 0x29/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
                                  dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
9156
   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         /* Register form is MOVLHPS: E's low 64 -> G's high 64. */
         delta += 2+1;
         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
                          getXMMRegLane64( eregOfRM(modrm), 0 ) );
         DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                               nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         delta += 2+alen;
         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movhps %s,%s\n", dis_buf,
                               nameXMMReg( gregOfRM(modrm) ));
      }
      goto decode_success;
   }

   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
      if (!epartIsReg(insn[2])) {   /* memory-only form */
         delta += 2;
         addr = disAMode ( &alen, sorb, delta, dis_buf );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRM(insn[2]),
                                   1/*upper lane*/ ) );
         DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
                               dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }

   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         /* Register form is MOVHLPS: E's high 64 -> G's low 64. */
         delta += 2+1;
         putXMMRegLane64( gregOfRM(modrm),
                          0/*lower lane*/,
                          getXMMRegLane64( eregOfRM(modrm), 1 ));
         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         delta += 2+alen;
         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movlps %s, %s\n",
             dis_buf, nameXMMReg( gregOfRM(modrm) ));
      }
      goto decode_success;
   }

   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
      if (!epartIsReg(insn[2])) {   /* memory-only form */
         delta += 2;
         addr = disAMode ( &alen, sorb, delta, dis_buf );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRM(insn[2]),
                                   0/*lower lane*/ ) );
         DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
                                dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
9231
   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
      to 4 lowest bits of ireg(G) */
   if (insn[0] == 0x0F && insn[1] == 0x50) {
      modrm = getIByte(delta+2);
      if (sz == 4 && epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         t2 = newTemp(Ity_I32);
         t3 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRM(modrm);
         /* For lane k, shift its sign bit (bit 31) down to bit k
            and mask, so the four results can simply be OR'd. */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
                            mkU32(1) ));
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
                            mkU32(2) ));
         assign( t2, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
                            mkU32(4) ));
         assign( t3, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
                            mkU32(8) ));
         putIReg(4, gregOfRM(modrm),
                    binop(Iop_Or32,
                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
                         )
                 );
         DIP("movmskps %s,%s\n", nameXMMReg(src),
                                 nameIReg(4, gregOfRM(modrm)));
         goto decode_success;
      }
      /* else fall through */
   }

   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   if (insn[0] == 0x0F && insn[1] == 0x2B) {
      modrm = getIByte(delta+2);
      if (!epartIsReg(modrm)) {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         /* Non-temporal stores still require 16-byte alignment. */
         gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                 dis_buf,
                                 nameXMMReg(gregOfRM(modrm)));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
   }
9285
   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         /* Register-to-register form leaves G's upper lanes intact. */
         putXMMRegLane32( gregOfRM(modrm), 0,
                          getXMMRegLane32( eregOfRM(modrm), 0 ));
         DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                              nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         /* zero bits 127:64 */
         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
         /* zero bits 63:32 */
         putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
         /* write bits 31:0 */
         putXMMRegLane32( gregOfRM(modrm), 0,
                          loadLE(Ity_I32, mkexpr(addr)) );
         DIP("movss %s,%s\n", dis_buf,
                              nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }
      goto decode_success;
   }

   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
      or lo 1/4 xmm). */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         /* fall through, we don't yet have a test case */
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         storeLE( mkexpr(addr),
                  getXMMRegLane32(gregOfRM(modrm), 0) );
         DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
                              dis_buf);
         delta += 3+alen;
         goto decode_success;
      }
   }
9330
   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
      goto decode_success;
   }

   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
      goto decode_success;
   }

   /* 0F 56 = ORPS -- G = G or E */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
      goto decode_success;
   }
9349
   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   if (insn[0] == 0x0F && insn[1] == 0x53) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
                                        "rcpps", Iop_Recip32Fx4 );
      goto decode_success;
   }

   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
                                         "rcpss", Iop_Recip32F0x4 );
      goto decode_success;
   }

   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   if (insn[0] == 0x0F && insn[1] == 0x52) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
                                        "rsqrtps", Iop_RSqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
                                         "rsqrtss", Iop_RSqrt32F0x4 );
      goto decode_success;
   }
9381
   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         select = (Int)insn[3];   /* imm8 directly after modrm */
         delta += 2+2;
         DIP("shufps $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRM(modrm)),
                                   nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];   /* imm8 after the amode bytes */
         delta += 3+alen;
         DIP("shufps $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRM(modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* Low two result lanes are chosen from G (dest), high two from
         E (source), each by a 2-bit field of the imm8. */
#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))

      putXMMReg(
         gregOfRM(modrm),
         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
                       SELD((select>>2)&3), SELD((select>>0)&3) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
9427
   /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
                                        "sqrtps", Iop_Sqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
      /* F3-prefixed scalar form: operand size must still be 4. */
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
                                         "sqrtss", Iop_Sqrt32F0x4 );
      goto decode_success;
   }
9442
   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
      /* insn[2] (already examined in the guard) is the modrm byte;
         fetch it again via getIByte for the memory-form decode. */
      modrm = getIByte(delta+2);
      vassert(sz == 4);
      vassert(!epartIsReg(modrm));

      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
      delta += 2+alen;

      /* Fake up a native SSE mxcsr word.  The only thing it depends
         on is SSEROUND[1:0], so call a clean helper to cook it up.
      */
      /* UInt x86h_create_mxcsr ( UInt sseround ) */
      DIP("stmxcsr %s\n", dis_buf);
      storeLE( mkexpr(addr),
               mkIRExprCCall(
                  Ity_I32, 0/*regp*/,
                  "x86g_create_mxcsr", &x86g_create_mxcsr,
                  mkIRExprVec_1( get_sse_roundingmode() )
               )
             );
      goto decode_success;
   }
9467
   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
      /* Scalar (F3-prefixed) form; only the low 32-bit lane changes. */
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
      goto decode_success;
   }
9480
   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   /* These just appear to be special cases of SHUFPS */
   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      Bool hi = toBool(insn[1] == 0x15);
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  dis_buf,
                                  nameXMMReg(gregOfRM(modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* UNPCKHPS interleaves the two high lanes of src and dst;
         UNPCKLPS interleaves the two low lanes. */
      if (hi) {
         putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
      } else {
         putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
      }

      goto decode_success;
   }
9520
   /* 0F 57 = XORPS -- G = G xor E */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
      goto decode_success;
   }
9526
9527   /* ---------------------------------------------------- */
9528   /* --- end of the SSE decoder.                      --- */
9529   /* ---------------------------------------------------- */
9530
9531   /* ---------------------------------------------------- */
9532   /* --- start of the SSE2 decoder.                   --- */
9533   /* ---------------------------------------------------- */
9534
   /* Skip parts of the decoder which don't apply given the stated
      guest subarchitecture. */
   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
      goto after_sse_decoders; /* no SSE2 capabilities */

   /* Refresh the opcode-byte pointer for the SSE2 decode arms below. */
   insn = (UChar*)&guest_code[delta];

   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
      goto decode_success;
   }
9547
   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
      goto decode_success;
   }

   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F 54 = ANDPD -- G = G and E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
      /* The Bool flag distinguishes packed (True) from scalar (False)
         forms inside dis_SSEcmp_E_to_G; 8 is the element size in bytes. */
      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
      goto decode_success;
   }

   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
      vassert(sz == 4);
      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
      goto decode_success;
   }
9579
   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
      IRTemp argL = newTemp(Ity_F64);
      IRTemp argR = newTemp(Ity_F64);
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
         delta += 2+1;
         DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)) );
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("[u]comisd %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRM(modrm)) );
      }
      assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );

      /* Express the flag result as an OP_COPY thunk: DEP1 holds the
         flags value directly, DEP2/NDEP are zeroed. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop( Iop_And32,
                      binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
                      /* 0x45 = ZF(0x40) | PF(0x04) | CF(0x01): keep only
                         the Z, P and C bits of the compare result. */
                      mkU32(0x45)
          )));
      /* Set NDEP even though it isn't used.  This makes redundant-PUT
         elimination of previous stores to this field work better. */
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
      goto decode_success;
   }
9613
   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
      F64 in xmm(G) */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
      IRTemp arg64 = newTemp(Ity_I64);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
         delta += 3+1;
         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("cvtdq2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      /* Low I32 -> F64 lane 0; high I32 -> F64 lane 1.  I32->F64 is
         exact, so no rounding mode is needed. */
      putXMMRegLane64F(
         gregOfRM(modrm), 0,
         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
      );

      putXMMRegLane64F(
         gregOfRM(modrm), 1,
         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
      );

      goto decode_success;
   }
9646
   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
      xmm(G) */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtdq2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      assign( rmode, get_sse_roundingmode() );
      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

      /* Each lane: signed I32 -> F64 (exact), then F64 -> F32 under the
         prevailing SSE rounding mode. */
#     define CVT(_t)  binop( Iop_F64toF32,                    \
                             mkexpr(rmode),                   \
                             unop(Iop_I32StoF64,mkexpr(_t)))

      putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
      putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
9683
   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), and zero upper half */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("cvtpd2dq %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      assign( rmode, get_sse_roundingmode() );
      /* View the two 64-bit halves of the source as F64s. */
      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toI32S,                   \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Upper half of the destination is zeroed, per the insn spec. */
      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
9726
9727   /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9728      I32 in mmx, according to prevailing SSE rounding mode */
9729   /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9730      I32 in mmx, rounding towards zero */
9731   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
9732      IRTemp dst64  = newTemp(Ity_I64);
9733      IRTemp rmode  = newTemp(Ity_I32);
9734      IRTemp f64lo  = newTemp(Ity_F64);
9735      IRTemp f64hi  = newTemp(Ity_F64);
9736      Bool   r2zero = toBool(insn[1] == 0x2C);
9737
9738      do_MMX_preamble();
9739      modrm = getIByte(delta+2);
9740
9741      if (epartIsReg(modrm)) {
9742         delta += 2+1;
9743	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9744	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
9745         DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
9746                                   nameXMMReg(eregOfRM(modrm)),
9747                                   nameMMXReg(gregOfRM(modrm)));
9748      } else {
9749         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9750	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9751	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
9752                                              mkexpr(addr),
9753                                              mkU32(8) )));
9754         delta += 2+alen;
9755         DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
9756                                   dis_buf,
9757                                   nameMMXReg(gregOfRM(modrm)));
9758      }
9759
9760      if (r2zero) {
9761         assign(rmode, mkU32((UInt)Irrm_ZERO) );
9762      } else {
9763         assign( rmode, get_sse_roundingmode() );
9764      }
9765
9766      assign(
9767         dst64,
9768         binop( Iop_32HLto64,
9769                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
9770                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
9771              )
9772      );
9773
9774      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
9775      goto decode_success;
9776   }
9777
   /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
      lo half xmm(G), and zero upper half */
   /* Note, this is practically identical to CVTPD2DQ.  It would have
      been nicer to merge them together, but the insn[] offsets differ
      by one. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpd2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      assign( rmode, get_sse_roundingmode() );
      /* View the two 64-bit halves of the source as F64s. */
      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toF32,                    \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Upper half of the destination is zeroed, per the insn spec. */
      putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
      putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
9822
   /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
      xmm(G) */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
      IRTemp arg64 = newTemp(Ity_I64);

      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         /* Only switch to MMX mode if the source is a MMX register.
            This is inconsistent with all other instructions which
            convert between XMM and (M64 or MMX), which always switch
            to MMX mode even if 64-bit operand is M64 and not MMX.  At
            least, that's what the Intel docs seem to me to say.
            Fixes #210264. */
         do_MMX_preamble();
         assign( arg64, getMMXReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpi2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      /* Low I32 -> F64 lane 0, high I32 -> F64 lane 1; I32->F64 is
         exact, so no rounding mode is needed. */
      putXMMRegLane64F(
         gregOfRM(modrm), 0,
         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
      );

      putXMMRegLane64F(
         gregOfRM(modrm), 1,
         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
      );

      goto decode_success;
   }
9861
   /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
      xmm(G) */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtps2dq %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      assign( rmode, get_sse_roundingmode() );
      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

      /* This is less than ideal.  If it turns out to be a performance
	 bottleneck it can be improved. */
      /* Each lane: reinterpret I32 bits as F32, widen to F64 (exact),
         then convert to signed I32 under the SSE rounding mode. */
#     define CVT(_t)                            \
        binop( Iop_F64toI32S,                   \
               mkexpr(rmode),                   \
               unop( Iop_F32toF64,              \
                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
9902
   /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
      F64 in xmm(G). */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp f32lo = newTemp(Ity_F32);
      IRTemp f32hi = newTemp(Ity_F32);

      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
         assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
         delta += 2+1;
         DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
         assign( f32hi, loadLE(Ity_F32,
                               binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
         delta += 2+alen;
         DIP("cvtps2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      /* F32 -> F64 widening is exact; no rounding mode needed. */
      putXMMRegLane64F( gregOfRM(modrm), 1,
                        unop(Iop_F32toF64, mkexpr(f32hi)) );
      putXMMRegLane64F( gregOfRM(modrm), 0,
                        unop(Iop_F32toF64, mkexpr(f32lo)) );

      goto decode_success;
   }
9933
   /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
      I32 in ireg, according to prevailing SSE rounding mode */
   /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
      I32 in ireg, rounding towards zero */
   if (insn[0] == 0xF2 && insn[1] == 0x0F
       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
      IRTemp rmode = newTemp(Ity_I32);
      IRTemp f64lo = newTemp(Ity_F64);
      Bool   r2zero = toBool(insn[2] == 0x2C);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         delta += 3+1;
         assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRM(modrm)),
                                   nameIReg(4, gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
         delta += 3+alen;
         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameIReg(4, gregOfRM(modrm)));
      }

      /* CVTTSD2SI truncates; CVTSD2SI uses the prevailing SSE mode. */
      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      putIReg(4, gregOfRM(modrm),
                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );

      goto decode_success;
   }
9972
   /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
      low 1/4 xmm(G), according to prevailing SSE rounding mode */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
      IRTemp rmode = newTemp(Ity_I32);
      IRTemp f64lo = newTemp(Ity_F64);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         delta += 3+1;
         assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
         DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
         delta += 3+alen;
         DIP("cvtsd2ss %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)));
      }

      /* F64 -> F32 narrowing can round, hence the rounding mode. */
      assign( rmode, get_sse_roundingmode() );
      putXMMRegLane32F(
         gregOfRM(modrm), 0,
         binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
      );

      goto decode_success;
   }
10002
   /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
      half xmm */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
      IRTemp arg32 = newTemp(Ity_I32);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         assign( arg32, getIReg(4, eregOfRM(modrm)) );
         delta += 3+1;
         DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
         delta += 3+alen;
         DIP("cvtsi2sd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)) );
      }

      /* I32 -> F64 is exact; no rounding mode needed. */
      putXMMRegLane64F(
         gregOfRM(modrm), 0,
         unop(Iop_I32StoF64, mkexpr(arg32)) );

      goto decode_success;
   }
10029
   /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
      low half xmm(G) */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
      IRTemp f32lo = newTemp(Ity_F32);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         delta += 3+1;
         assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
         DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         delta += 3+alen;
         DIP("cvtss2sd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)));
      }

      /* F32 -> F64 widening is exact; no rounding mode needed. */
      putXMMRegLane64F( gregOfRM(modrm), 0,
                        unop( Iop_F32toF64, mkexpr(f32lo) ) );

      goto decode_success;
   }
10055
   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), and zero upper half, rounding towards zero */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvttpd2dq %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRM(modrm)) );
      }

      /* Truncating ("t") variant: always round towards zero. */
      assign( rmode, mkU32((UInt)Irrm_ZERO) );

      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toI32S,                   \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Upper half of the destination is zeroed, per the insn spec. */
      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
10098
   /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
      xmm(G), rounding towards zero */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);
      vassert(sz == 4);

      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("cvttps2dq %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRM(modrm)) );
      }

      /* Truncating ("t") variant: always round towards zero. */
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

      /* This is less than ideal.  If it turns out to be a performance
	 bottleneck it can be improved. */
#     define CVT(_t)                            \
        binop( Iop_F64toI32S,                   \
               mkexpr(rmode),                   \
               unop( Iop_F32toF64,              \
                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
10140
   /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
      /* Scalar (F2-prefixed) form; only the low 64-bit lane changes. */
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
      goto decode_success;
   }
10153
   /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && epartIsReg(insn[2])
       && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
      vassert(sz == 4);
      delta += 3;
      /* Insert a memory fence.  It's sometimes important that these
         are carried through to the generated code.  Both lfence and
         mfence are modelled with the same plain IR fence. */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
      goto decode_success;
   }
10167
   /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
      goto decode_success;
   }

   /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
      goto decode_success;
   }
10193
   /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   if (sz == 2 && insn[0] == 0x0F
       && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
      const HChar* wot = insn[1]==0x28 ? "apd" :
                         insn[1]==0x10 ? "upd" : "dqa";
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRM(modrm),
                    getXMMReg( eregOfRM(modrm) ));
         DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
                                   nameXMMReg(gregOfRM(modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         /* Only the aligned forms (movapd, movdqa) fault on a
            misaligned address; movupd does not. */
         if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
            gen_SEGV_if_not_16_aligned( addr );
         putXMMReg( gregOfRM(modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("mov%s %s,%s\n", wot, dis_buf,
                                   nameXMMReg(gregOfRM(modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }
10220
10221   /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
10222   /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
10223   if (sz == 2 && insn[0] == 0x0F
10224       && (insn[1] == 0x29 || insn[1] == 0x11)) {
10225      const HChar* wot = insn[1]==0x29 ? "apd" : "upd";
10226      modrm = getIByte(delta+2);
10227      if (epartIsReg(modrm)) {
10228         /* fall through; awaiting test case */
10229      } else {
10230         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10231         if (insn[1] == 0x29/*movapd*/)
10232            gen_SEGV_if_not_16_aligned( addr );
10233         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10234         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
10235                                   dis_buf );
10236         delta += 2+alen;
10237         goto decode_success;
10238      }
10239   }
10240
   /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         /* Iop_32UtoV128 places the 32-bit value in lane 0 and zeroes
            bits 127:32, as MOVD requires. */
         putXMMReg(
            gregOfRM(modrm),
            unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
         );
         DIP("movd %s, %s\n",
             nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode( &alen, sorb, delta+2, dis_buf );
         delta += 2+alen;
         putXMMReg(
            gregOfRM(modrm),
            unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
         );
         DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
      }
      goto decode_success;
   }

   /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         putIReg( 4, eregOfRM(modrm),
                  getXMMRegLane32(gregOfRM(modrm), 0) );
         DIP("movd %s, %s\n",
             nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
      } else {
         addr = disAMode( &alen, sorb, delta+2, dis_buf );
         delta += 2+alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane32(gregOfRM(modrm), 0) );
         DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
      }
      goto decode_success;
   }

   /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         putXMMReg( eregOfRM(modrm),
                    getXMMReg(gregOfRM(modrm)) );
         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
                                nameXMMReg(eregOfRM(modrm)));
      } else {
         addr = disAMode( &alen, sorb, delta+2, dis_buf );
         delta += 2+alen;
         /* Aligned store form: fault on misalignment. */
         gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
      }
      goto decode_success;
   }
10301
   /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   /* Unfortunately can't simply use the MOVDQA case since the
      prefix lengths are different (66 vs F3) */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRM(modrm),
                    getXMMReg( eregOfRM(modrm) ));
         DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                               nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         /* Unaligned form: no alignment check, unlike MOVDQA. */
         putXMMReg( gregOfRM(modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("movdqu %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }
      goto decode_success;
   }

   /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   /* Unfortunately can't simply use the MOVDQA case since the
      prefix lengths are different (66 vs F3) */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         delta += 3+1;
         putXMMReg( eregOfRM(modrm),
                    getXMMReg(gregOfRM(modrm)) );
         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
                                nameXMMReg(eregOfRM(modrm)));
      } else {
         addr = disAMode( &alen, sorb, delta+3, dis_buf );
         delta += 3+alen;
         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
      }
      goto decode_success;
   }

   /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         /* Touching MMX state requires the usual x87/MMX transition. */
         do_MMX_preamble();
         putMMXReg( gregOfRM(modrm),
                    getXMMRegLane64( eregOfRM(modrm), 0 ));
         DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                nameMMXReg(gregOfRM(modrm)));
         delta += 3+1;
         goto decode_success;
      } else {
         /* fall through, apparently no mem case for this insn */
      }
   }
10362
   /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   /* These seems identical to MOVHPS.  This instruction encoding is
      completely crazy. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; apparently reg-reg is not possible */
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         delta += 2+alen;
         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movhpd %s,%s\n", dis_buf,
                               nameXMMReg( gregOfRM(modrm) ));
         goto decode_success;
      }
   }

   /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   /* Again, this seems identical to MOVHPS. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, sorb, delta, dis_buf );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRM(insn[2]),
                                   1/*upper lane*/ ) );
         DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
                               dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }

   /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   /* Identical to MOVLPS ? */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; apparently reg-reg is not possible */
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         delta += 2+alen;
         /* Only bits 63:0 of G are written; the upper lane is
            untouched. */
         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movlpd %s, %s\n",
             dis_buf, nameXMMReg( gregOfRM(modrm) ));
         goto decode_success;
      }
   }

   /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   /* Identical to MOVLPS ? */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, sorb, delta, dis_buf );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRM(insn[2]),
                                   0/*lower lane*/ ) );
         DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
                                dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
10431
   /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
      2 lowest bits of ireg(G) */
   if (insn[0] == 0x0F && insn[1] == 0x50) {
      modrm = getIByte(delta+2);
      if (sz == 2 && epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRM(modrm);
         /* Sign bit of the low F64 lives in bit 31 of 32-bit lane 1;
            shift it down to bit 0. */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
                            mkU32(1) ));
         /* Sign bit of the high F64 lives in bit 31 of 32-bit lane 3;
            shift by 30 so it lands in bit 1, then mask. */
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
                            mkU32(2) ));
         putIReg(4, gregOfRM(modrm),
                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
                 );
         DIP("movmskpd %s,%s\n", nameXMMReg(src),
                                 nameIReg(4, gregOfRM(modrm)));
         goto decode_success;
      }
      /* else fall through */
   }

   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   if (insn[0] == 0x0F && insn[1] == 0xF7) {
      modrm = getIByte(delta+2);
      if (sz == 2 && epartIsReg(modrm)) {
         IRTemp regD    = newTemp(Ity_V128);
         IRTemp mask    = newTemp(Ity_V128);
         IRTemp olddata = newTemp(Ity_V128);
         IRTemp newdata = newTemp(Ity_V128);
                addr    = newTemp(Ity_I32);

         /* Implicit destination is [EDI] (subject to segment override). */
         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
         assign( regD, getXMMReg( gregOfRM(modrm) ));

         /* Unfortunately can't do the obvious thing with SarN8x16
            here since that can't be re-emitted as SSE2 code - no such
            insn. */
         /* Replicate each mask byte's top bit across the whole byte by
            arithmetic-shifting each byte right by 7, one 64-bit half at
            a time. */
         assign(
            mask,
            binop(Iop_64HLtoV128,
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRM(modrm), 1 ),
                        mkU8(7) ),
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRM(modrm), 0 ),
                        mkU8(7) ) ));
         /* Emulate the byte-selective store as a read-modify-write:
            new = (regD & mask) | (old & ~mask). */
         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_OrV128,
                       binop(Iop_AndV128,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_AndV128,
                             mkexpr(olddata),
                             unop(Iop_NotV128, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );

         delta += 2+1;
         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
                                   nameXMMReg( gregOfRM(modrm) ) );
         goto decode_success;
      }
      /* else fall through */
   }
10501
10502   /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
10503   if (insn[0] == 0x0F && insn[1] == 0xE7) {
10504      modrm = getIByte(delta+2);
10505      if (sz == 2 && !epartIsReg(modrm)) {
10506         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10507         gen_SEGV_if_not_16_aligned( addr );
10508         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10509         DIP("movntdq %s,%s\n", dis_buf,
10510                                nameXMMReg(gregOfRM(modrm)));
10511         delta += 2+alen;
10512         goto decode_success;
10513      }
10514      /* else fall through */
10515   }
10516
10517   /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
10518   if (insn[0] == 0x0F && insn[1] == 0xC3) {
10519      vassert(sz == 4);
10520      modrm = getIByte(delta+2);
10521      if (!epartIsReg(modrm)) {
10522         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10523         storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
10524         DIP("movnti %s,%s\n", dis_buf,
10525                               nameIReg(4, gregOfRM(modrm)));
10526         delta += 2+alen;
10527         goto decode_success;
10528      }
10529      /* else fall through */
10530   }
10531
   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm).  */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getIByte(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through, awaiting test case */
         /* dst: lo half copied, hi half zeroed */
      } else {
         /* Store only bits 63:0 of G to memory. */
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRM(modrm), 0 ));
         DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
10548
   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
      hi half). */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         /* Touching MMX state requires the usual x87/MMX transition. */
         do_MMX_preamble();
         /* Iop_64UtoV128 zeroes bits 127:64 of the destination. */
         putXMMReg( gregOfRM(modrm),
                    unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
         DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
                                nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
         goto decode_success;
      } else {
         /* fall through, apparently no mem case for this insn */
      }
   }

   /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
      G (lo half xmm).  Upper half of G is zeroed out. */
   /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
      G (lo half xmm).  If E is mem, upper half of G is zeroed out.
      If E is reg, upper half of G is unchanged. */
   if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
       || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         putXMMRegLane64( gregOfRM(modrm), 0,
                          getXMMRegLane64( eregOfRM(modrm), 0 ));
         /* Only MOVQ (F3) zeroes the upper half in the reg-reg case;
            MOVSD (F2) leaves it unchanged. */
         if (insn[0] == 0xF3/*MOVQ*/) {
            /* zero bits 127:64 */
            putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
         }
         DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                              nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
      } else {
         /* Memory source: both insns zero the upper half. */
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         /* zero bits 127:64 */
         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
         /* write bits 63:0 */
         putXMMRegLane64( gregOfRM(modrm), 0,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movsd %s,%s\n", dis_buf,
                              nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }
      goto decode_success;
   }

   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm). */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
      vassert(sz == 4);
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         /* Reg-reg: only bits 63:0 of E are written. */
         putXMMRegLane64( eregOfRM(modrm), 0,
                          getXMMRegLane64( gregOfRM(modrm), 0 ));
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
                              nameXMMReg(eregOfRM(modrm)));
         delta += 3+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         storeLE( mkexpr(addr),
                  getXMMRegLane64(gregOfRM(modrm), 0) );
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
                              dis_buf);
         delta += 3+alen;
      }
      goto decode_success;
   }
10621
   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
      goto decode_success;
   }

   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
      goto decode_success;
   }

   /* 66 0F 56 = ORPD -- G = G or E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
      goto decode_success;
   }
10640
   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;                      /* the ib immediate */
      IRTemp sV = newTemp(Ity_V128);      /* source (E) */
      IRTemp dV = newTemp(Ity_V128);      /* destination (G) */
      IRTemp s1 = newTemp(Ity_I64);
      IRTemp s0 = newTemp(Ity_I64);
      IRTemp d1 = newTemp(Ity_I64);
      IRTemp d0 = newTemp(Ity_I64);

      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         select = (Int)insn[3];           /* immediate follows modrm */
         delta += 2+2;                    /* opcode+modrm + modrm? no: 2 opc, modrm+ib */
         DIP("shufpd $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRM(modrm)),
                                   nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];      /* immediate follows the amode */
         delta += 3+alen;
         DIP("shufpd $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRM(modrm)));
      }

      /* Split both vectors into 64-bit halves. */
      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );

#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
#     define SELS(n) mkexpr((n)==0 ? s0 : s1)

      /* Result: low half chosen from D by bit 0 of the immediate,
         high half chosen from S by bit 1. */
      putXMMReg(
         gregOfRM(modrm),
         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
10689
   /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
                                        "sqrtpd", Iop_Sqrt64Fx2 );
      goto decode_success;
   }

   /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
                                         "sqrtsd", Iop_Sqrt64F0x2 );
      goto decode_success;
   }

   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
      goto decode_success;
   }
10717
10718   /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
10719   /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
10720   /* These just appear to be special cases of SHUFPS */
10721   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
10722      IRTemp s1 = newTemp(Ity_I64);
10723      IRTemp s0 = newTemp(Ity_I64);
10724      IRTemp d1 = newTemp(Ity_I64);
10725      IRTemp d0 = newTemp(Ity_I64);
10726      IRTemp sV = newTemp(Ity_V128);
10727      IRTemp dV = newTemp(Ity_V128);
10728      Bool   hi = toBool(insn[1] == 0x15);
10729
10730      modrm = insn[2];
10731      assign( dV, getXMMReg(gregOfRM(modrm)) );
10732
10733      if (epartIsReg(modrm)) {
10734         assign( sV, getXMMReg(eregOfRM(modrm)) );
10735         delta += 2+1;
10736         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
10737                                  nameXMMReg(eregOfRM(modrm)),
10738                                  nameXMMReg(gregOfRM(modrm)));
10739      } else {
10740         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10741         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10742         delta += 2+alen;
10743         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
10744                                  dis_buf,
10745                                  nameXMMReg(gregOfRM(modrm)));
10746      }
10747
10748      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10749      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10750      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10751      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10752
10753      if (hi) {
10754         putXMMReg( gregOfRM(modrm),
10755                    binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
10756      } else {
10757         putXMMReg( gregOfRM(modrm),
10758                    binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
10759      }
10760
10761      goto decode_success;
10762   }
10763
   /* 66 0F 57 = XORPD -- G = G xor E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
      goto decode_success;
   }
10769
   /* Integer SSE2 narrow/add/logic/average/compare cases.  These are
      all simple dispatches: the final Bool argument of
      dis_SSEint_E_to_G says whether the operation's arguments should
      be swapped (True for the PACK* narrowing ops). */

   /* 66 0F 6B = PACKSSDW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "packssdw",
                                 Iop_QNarrowBin32Sto16Sx8, True );
      goto decode_success;
   }

   /* 66 0F 63 = PACKSSWB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "packsswb",
                                 Iop_QNarrowBin16Sto8Sx16, True );
      goto decode_success;
   }

   /* 66 0F 67 = PACKUSWB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "packuswb",
                                 Iop_QNarrowBin16Sto8Ux16, True );
      goto decode_success;
   }

   /* 66 0F FC = PADDB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddb", Iop_Add8x16, False );
      goto decode_success;
   }

   /* 66 0F FE = PADDD */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddd", Iop_Add32x4, False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F D4 = PADDQ -- add 64x1 */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
      /* No 66 prefix: this is the MMX-register form. */
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "paddq", False );
      goto decode_success;
   }

   /* 66 0F D4 = PADDQ */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddq", Iop_Add64x2, False );
      goto decode_success;
   }

   /* 66 0F FD = PADDW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddw", Iop_Add16x8, False );
      goto decode_success;
   }

   /* 66 0F EC = PADDSB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddsb", Iop_QAdd8Sx16, False );
      goto decode_success;
   }

   /* 66 0F ED = PADDSW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddsw", Iop_QAdd16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DC = PADDUSB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddusb", Iop_QAdd8Ux16, False );
      goto decode_success;
   }

   /* 66 0F DD = PADDUSW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "paddusw", Iop_QAdd16Ux8, False );
      goto decode_success;
   }

   /* 66 0F DB = PAND */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F DF = PANDN */
   /* The _invG variant computes (~G) & E. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F E0 = PAVGB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pavgb", Iop_Avg8Ux16, False );
      goto decode_success;
   }

   /* 66 0F E3 = PAVGW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pavgw", Iop_Avg16Ux8, False );
      goto decode_success;
   }

   /* 66 0F 74 = PCMPEQB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pcmpeqb", Iop_CmpEQ8x16, False );
      goto decode_success;
   }

   /* 66 0F 76 = PCMPEQD */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pcmpeqd", Iop_CmpEQ32x4, False );
      goto decode_success;
   }

   /* 66 0F 75 = PCMPEQW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pcmpeqw", Iop_CmpEQ16x8, False );
      goto decode_success;
   }

   /* 66 0F 64 = PCMPGTB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pcmpgtb", Iop_CmpGT8Sx16, False );
      goto decode_success;
   }

   /* 66 0F 66 = PCMPGTD */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pcmpgtd", Iop_CmpGT32Sx4, False );
      goto decode_success;
   }

   /* 66 0F 65 = PCMPGTW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pcmpgtw", Iop_CmpGT16Sx8, False );
      goto decode_success;
   }
10926
   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
      zero-extend of it in ireg(G). */
   if (insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (sz == 2 && epartIsReg(modrm)) {
         t5 = newTemp(Ity_V128);
         t4 = newTemp(Ity_I16);
         assign(t5, getXMMReg(eregOfRM(modrm)));
         /* t3:t2:t1:t0 = the four 32-bit lanes, high to low. */
         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
         /* Low 3 bits of the immediate select one of the eight
            16-bit lanes. */
         switch (insn[3] & 7) {
            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
            default: vassert(0); /*NOTREACHED*/
         }
         /* Zero-extend the selected 16 bits into the 32-bit ireg. */
         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
                           nameIReg(4,gregOfRM(modrm)));
         delta += 4;   /* 0F C5 modrm ib */
         goto decode_success;
      }
      /* else fall through */
   }

   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
      put it into the specified lane of xmm(G). */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
      Int lane;
      t4 = newTemp(Ity_I16);
      modrm = insn[2];

      if (epartIsReg(modrm)) {
         assign(t4, getIReg(2, eregOfRM(modrm)));
         delta += 3+1;
         lane = insn[3+1-1];   /* the ib immediately after the modrm */
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   nameIReg(2,eregOfRM(modrm)),
                                   nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         delta += 3+alen;
         lane = insn[3+alen-1];   /* the ib after the amode bytes */
         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   dis_buf,
                                   nameXMMReg(gregOfRM(modrm)));
      }

      /* Only the low 3 bits of the lane selector are meaningful. */
      putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
      goto decode_success;
   }
10984
   /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
      E(xmm or mem) to G(xmm) */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
      IRTemp s1V  = newTemp(Ity_V128);
      IRTemp s2V  = newTemp(Ity_V128);
      IRTemp dV   = newTemp(Ity_V128);
      IRTemp s1Hi = newTemp(Ity_I64);
      IRTemp s1Lo = newTemp(Ity_I64);
      IRTemp s2Hi = newTemp(Ity_I64);
      IRTemp s2Lo = newTemp(Ity_I64);
      IRTemp dHi  = newTemp(Ity_I64);
      IRTemp dLo  = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( s1V, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmaddwd %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRM(modrm)));
      }
      assign( s2V, getXMMReg(gregOfRM(modrm)) );
      /* Implement the 128-bit operation as two 64-bit halves, each
         computed by a call to the MMX pmaddwd helper. */
      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
      assign( dHi, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "x86g_calculate_mmx_pmaddwd",
                      &x86g_calculate_mmx_pmaddwd,
                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
                   ));
      assign( dLo, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "x86g_calculate_mmx_pmaddwd",
                      &x86g_calculate_mmx_pmaddwd,
                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
                   ));
      /* Reassemble the two 64-bit results into the destination. */
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
      putXMMReg(gregOfRM(modrm), mkexpr(dV));
      goto decode_success;
   }
11031
   /* Packed min/max: all four map directly onto single IR vector ops
      via the common E-to-G integer-SSE helper. */

   /* 66 0F EE = PMAXSW -- 16x8 signed max */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pmaxsw", Iop_Max16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pmaxub", Iop_Max8Ux16, False );
      goto decode_success;
   }

   /* 66 0F EA = PMINSW -- 16x8 signed min */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pminsw", Iop_Min16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pminub", Iop_Min8Ux16, False );
      goto decode_success;
   }
11059
   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
      in xmm(E), turn them into a byte, and put zero-extend of it in
      ireg(G). */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
      modrm = insn[2];
      /* E must be a register for PMOVMSKB; otherwise fall through. */
      if (epartIsReg(modrm)) {
         t0 = newTemp(Ity_I64);
         t1 = newTemp(Ity_I64);
         assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
         assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
         t5 = newTemp(Ity_I32);
         /* Gather 8 MSBs from each 64-bit half, concatenate high:low
            into 16 bits, then zero-extend to 32. */
         assign(t5,
                unop(Iop_16Uto32,
                     binop(Iop_8HLto16,
                           unop(Iop_GetMSBs8x8, mkexpr(t1)),
                           unop(Iop_GetMSBs8x8, mkexpr(t0)))));
         putIReg(4, gregOfRM(modrm), mkexpr(t5));
         DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameIReg(4,gregOfRM(modrm)));
         delta += 3;
         goto decode_success;
      }
      /* else fall through */
   }
11084
   /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pmulhuw", Iop_MulHi16Ux8, False );
      goto decode_success;
   }

   /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pmulhw", Iop_MulHi16Sx8, False );
      goto decode_success;
   }

   /* 66 0F D5 = PMULLW -- 16x8 multiply (low halves of results) */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "pmullw", Iop_Mul16x8, False );
      goto decode_success;
   }
11105
   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
      0 to form 64-bit result */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
      IRTemp sV = newTemp(Ity_I64);
      IRTemp dV = newTemp(Ity_I64);
      t1 = newTemp(Ity_I32);
      t0 = newTemp(Ity_I32);
      modrm = insn[2];

      /* Put the FPU/MMX state into MMX mode before touching MMX regs. */
      do_MMX_preamble();
      assign( dV, getMMXReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
                                nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmuludq %s,%s\n", dis_buf,
                                nameMMXReg(gregOfRM(modrm)));
      }

      /* Multiply only lane 0 of each operand; MullU32 widens the
         32x32 product to the full 64-bit destination. */
      assign( t0, unop(Iop_64to32, mkexpr(dV)) );
      assign( t1, unop(Iop_64to32, mkexpr(sV)) );
      putMMXReg( gregOfRM(modrm),
                 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
      goto decode_success;
   }
11138
   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
      half */
   /* This is a really poor translation -- could be improved if
      performance critical */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      t1 = newTemp(Ity_I64);
      t0 = newTemp(Ity_I64);
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmuludq %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRM(modrm)));
      }

      /* Split both operands into 32-bit lanes; only lanes 0 and 2
         participate (lanes 1 and 3 are ignored, per the insn spec
         stated above). */
      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
      putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
      assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
      putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
      goto decode_success;
   }
11177
   /* 66 0F EB = POR -- bitwise OR of the full 128 bits. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
      goto decode_success;
   }
11183
   /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
      from E(xmm or mem) to G(xmm) */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
      IRTemp s1V  = newTemp(Ity_V128);
      IRTemp s2V  = newTemp(Ity_V128);
      IRTemp dV   = newTemp(Ity_V128);
      IRTemp s1Hi = newTemp(Ity_I64);
      IRTemp s1Lo = newTemp(Ity_I64);
      IRTemp s2Hi = newTemp(Ity_I64);
      IRTemp s2Lo = newTemp(Ity_I64);
      IRTemp dHi  = newTemp(Ity_I64);
      IRTemp dLo  = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( s1V, getXMMReg(eregOfRM(modrm)) );
         delta += 2+1;
         DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                               nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("psadbw %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRM(modrm)));
      }
      assign( s2V, getXMMReg(gregOfRM(modrm)) );
      /* As with PMADDWD above: do the 128-bit op as two independent
         64-bit halves, each via the MMX psadbw helper. */
      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
      assign( dHi, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "x86g_calculate_mmx_psadbw",
                      &x86g_calculate_mmx_psadbw,
                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
                   ));
      assign( dLo, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "x86g_calculate_mmx_psadbw",
                      &x86g_calculate_mmx_psadbw,
                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
                   ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
      putXMMReg(gregOfRM(modrm), mkexpr(dV));
      goto decode_success;
   }
11230
   /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         order = (Int)insn[3];
         delta += 2+2;
         DIP("pshufd $%d,%s,%s\n", order,
                                   nameXMMReg(eregOfRM(modrm)),
                                   nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
	 order = (Int)insn[2+alen];
         delta += 3+alen;
         DIP("pshufd $%d,%s,%s\n", order,
                                   dis_buf,
                                   nameXMMReg(gregOfRM(modrm)));
      }
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* SEL(n) names the n'th 32-bit source lane; each 2-bit field of
         the imm8 'order' picks the source lane for one dest lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dV,
	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                           SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      putXMMReg(gregOfRM(modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
11267
   /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
      mem) to G(xmm), and copy lower half */
   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
      Int order;
      IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV   = newTemp(Ity_V128);
      dV   = newTemp(Ity_V128);
      sVhi = newTemp(Ity_I64);
      dVhi = newTemp(Ity_I64);
      modrm = insn[3];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         order = (Int)insn[4];
         delta += 4+1;
         DIP("pshufhw $%d,%s,%s\n", order,
                                    nameXMMReg(eregOfRM(modrm)),
                                    nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
	 order = (Int)insn[3+alen];
         delta += 4+alen;
         DIP("pshufhw $%d,%s,%s\n", order,
                                    dis_buf,
                                    nameXMMReg(gregOfRM(modrm)));
      }
      /* Shuffle only the top 64 bits, as four 16-bit lanes. */
      assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
      breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );

#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dVhi,
	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      /* Recombine: shuffled upper half, original lower half. */
      assign(dV, binop( Iop_64HLtoV128,
                        mkexpr(dVhi),
                        unop(Iop_V128to64, mkexpr(sV))) );
      putXMMReg(gregOfRM(modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
11311
   /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
      mem) to G(xmm), and copy upper half */
   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
      Int order;
      IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV   = newTemp(Ity_V128);
      dV   = newTemp(Ity_V128);
      sVlo = newTemp(Ity_I64);
      dVlo = newTemp(Ity_I64);
      modrm = insn[3];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         order = (Int)insn[4];
         delta += 4+1;
         DIP("pshuflw $%d,%s,%s\n", order,
                                    nameXMMReg(eregOfRM(modrm)),
                                    nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
	 order = (Int)insn[3+alen];
         delta += 4+alen;
         DIP("pshuflw $%d,%s,%s\n", order,
                                    dis_buf,
                                    nameXMMReg(gregOfRM(modrm)));
      }
      /* Mirror image of PSHUFHW: shuffle only the bottom 64 bits. */
      assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
      breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );

#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dVlo,
	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      /* Recombine: original upper half, shuffled lower half. */
      assign(dV, binop( Iop_64HLtoV128,
                        unop(Iop_V128HIto64, mkexpr(sV)),
                        mkexpr(dVlo) ) );
      putXMMReg(gregOfRM(modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
11355
   /* 66 0F 72 /6 ib = PSLLD by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 6) {
      delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
      goto decode_success;
   }

   /* 66 0F F2 = PSLLD by E -- shift count taken from xmm/mem E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
      goto decode_success;
   }
11369
   /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   /* Whole-register byte shift left.  There is no 128-bit shift IR op,
      so it is synthesised from 64-bit shifts on the two halves. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 7) {
      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
      Int    imm = (Int)insn[3];
      Int    reg = eregOfRM(insn[2]);
      DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
      vassert(imm >= 0 && imm <= 255);
      delta += 4;

      sV    = newTemp(Ity_V128);
      dV    = newTemp(Ity_V128);
      hi64  = newTemp(Ity_I64);
      lo64  = newTemp(Ity_I64);
      hi64r = newTemp(Ity_I64);
      lo64r = newTemp(Ity_I64);

      /* Shifting by >= 16 bytes clears the register entirely. */
      if (imm >= 16) {
         putXMMReg(reg, mkV128(0x0000));
         goto decode_success;
      }

      assign( sV, getXMMReg(reg) );
      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

      if (imm == 0) {
         /* No shift: copy both halves unchanged. */
         assign( lo64r, mkexpr(lo64) );
         assign( hi64r, mkexpr(hi64) );
      }
      else
      if (imm == 8) {
         /* Exactly one half: low half moves to high, low becomes 0. */
         assign( lo64r, mkU64(0) );
         assign( hi64r, mkexpr(lo64) );
      }
      else
      if (imm > 8) {
         /* More than one half: only (shifted) low-half bits survive,
            in the high half. */
         assign( lo64r, mkU64(0) );
         assign( hi64r, binop( Iop_Shl64,
                               mkexpr(lo64),
                               mkU8( 8*(imm-8) ) ));
      } else {
         /* 1..7 bytes: shift both halves, OR the bits carried out of
            the low half into the high half. */
         assign( lo64r, binop( Iop_Shl64,
                               mkexpr(lo64),
                               mkU8(8 * imm) ));
         assign( hi64r,
                 binop( Iop_Or64,
                        binop(Iop_Shl64, mkexpr(hi64),
                                         mkU8(8 * imm)),
                        binop(Iop_Shr64, mkexpr(lo64),
                                         mkU8(8 * (8 - imm)) )
                      )
               );
      }
      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
      putXMMReg(reg, mkexpr(dV));
      goto decode_success;
   }
11429
   /* Simple per-lane shifts: each comes in an immediate-count form
      (dis_SSE_shiftE_imm) and a count-from-E form (dis_SSE_shiftG_byE). */

   /* 66 0F 73 /6 ib = PSLLQ by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 6) {
      delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
      goto decode_success;
   }

   /* 66 0F F3 = PSLLQ by E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
      goto decode_success;
   }

   /* 66 0F 71 /6 ib = PSLLW by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 6) {
      delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
      goto decode_success;
   }

   /* 66 0F F1 = PSLLW by E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
      goto decode_success;
   }

   /* 66 0F 72 /4 ib = PSRAD by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 4) {
      delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
      goto decode_success;
   }

   /* 66 0F E2 = PSRAD by E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
      goto decode_success;
   }

   /* 66 0F 71 /4 ib = PSRAW by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 4) {
      delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
      goto decode_success;
   }

   /* 66 0F E1 = PSRAW by E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
      goto decode_success;
   }

   /* 66 0F 72 /2 ib = PSRLD by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 2) {
      delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
      goto decode_success;
   }

   /* 66 0F D2 = PSRLD by E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
      goto decode_success;
   }
11499
   /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   /* Whole-register byte shift right; mirror image of the PSLLDQ
      case above, again built from 64-bit shifts on the two halves. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 3) {
      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
      Int    imm = (Int)insn[3];
      Int    reg = eregOfRM(insn[2]);
      DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
      vassert(imm >= 0 && imm <= 255);
      delta += 4;

      sV    = newTemp(Ity_V128);
      dV    = newTemp(Ity_V128);
      hi64  = newTemp(Ity_I64);
      lo64  = newTemp(Ity_I64);
      hi64r = newTemp(Ity_I64);
      lo64r = newTemp(Ity_I64);

      /* Shifting by >= 16 bytes clears the register entirely. */
      if (imm >= 16) {
         putXMMReg(reg, mkV128(0x0000));
         goto decode_success;
      }

      assign( sV, getXMMReg(reg) );
      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

      if (imm == 0) {
         /* No shift: copy both halves unchanged. */
         assign( lo64r, mkexpr(lo64) );
         assign( hi64r, mkexpr(hi64) );
      }
      else
      if (imm == 8) {
         /* Exactly one half: high half moves to low, high becomes 0. */
         assign( hi64r, mkU64(0) );
         assign( lo64r, mkexpr(hi64) );
      }
      else
      if (imm > 8) {
         /* More than one half: only (shifted) high-half bits survive,
            in the low half. */
         assign( hi64r, mkU64(0) );
         assign( lo64r, binop( Iop_Shr64,
                               mkexpr(hi64),
                               mkU8( 8*(imm-8) ) ));
      } else {
         /* 1..7 bytes: shift both halves, OR the bits carried out of
            the high half into the low half. */
         assign( hi64r, binop( Iop_Shr64,
                               mkexpr(hi64),
                               mkU8(8 * imm) ));
         assign( lo64r,
                 binop( Iop_Or64,
                        binop(Iop_Shr64, mkexpr(lo64),
                                         mkU8(8 * imm)),
                        binop(Iop_Shl64, mkexpr(hi64),
                                         mkU8(8 * (8 - imm)) )
                      )
               );
      }

      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
      putXMMReg(reg, mkexpr(dV));
      goto decode_success;
   }
11560
   /* 66 0F 73 /2 ib = PSRLQ by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 2) {
      delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
      goto decode_success;
   }

   /* 66 0F D3 = PSRLQ by E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
      goto decode_success;
   }

   /* 66 0F 71 /2 ib = PSRLW by immediate */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregOfRM(insn[2]) == 2) {
      delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
      goto decode_success;
   }

   /* 66 0F D1 = PSRLW by E */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
      goto decode_success;
   }
11588
   /* 66 0F F8 = PSUBB */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubb", Iop_Sub8x16, False );
      goto decode_success;
   }

   /* 66 0F FA = PSUBD */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubd", Iop_Sub32x4, False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F FB = PSUBQ -- sub 64x1 */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                sorb, delta+2, insn[1], "psubq", False );
      goto decode_success;
   }

   /* 66 0F FB = PSUBQ */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubq", Iop_Sub64x2, False );
      goto decode_success;
   }

   /* 66 0F F9 = PSUBW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubw", Iop_Sub16x8, False );
      goto decode_success;
   }

   /* 66 0F E8 = PSUBSB -- signed saturating subtract */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubsb", Iop_QSub8Sx16, False );
      goto decode_success;
   }

   /* 66 0F E9 = PSUBSW -- signed saturating subtract */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubsw", Iop_QSub16Sx8, False );
      goto decode_success;
   }

   /* 66 0F D8 = PSUBUSB -- unsigned saturating subtract */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubusb", Iop_QSub8Ux16, False );
      goto decode_success;
   }

   /* 66 0F D9 = PSUBUSW -- unsigned saturating subtract */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "psubusw", Iop_QSub16Ux8, False );
      goto decode_success;
   }
11653
   /* Interleaves.  Note the final 'True' argument: operand order is
      swapped relative to the other dis_SSEint_E_to_G users. */

   /* 66 0F 68 = PUNPCKHBW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpckhbw",
                                 Iop_InterleaveHI8x16, True );
      goto decode_success;
   }

   /* 66 0F 6A = PUNPCKHDQ */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpckhdq",
                                 Iop_InterleaveHI32x4, True );
      goto decode_success;
   }

   /* 66 0F 6D = PUNPCKHQDQ */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpckhqdq",
                                 Iop_InterleaveHI64x2, True );
      goto decode_success;
   }

   /* 66 0F 69 = PUNPCKHWD */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpckhwd",
                                 Iop_InterleaveHI16x8, True );
      goto decode_success;
   }

   /* 66 0F 60 = PUNPCKLBW */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpcklbw",
                                 Iop_InterleaveLO8x16, True );
      goto decode_success;
   }

   /* 66 0F 62 = PUNPCKLDQ */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpckldq",
                                 Iop_InterleaveLO32x4, True );
      goto decode_success;
   }

   /* 66 0F 6C = PUNPCKLQDQ */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpcklqdq",
                                 Iop_InterleaveLO64x2, True );
      goto decode_success;
   }

   /* 66 0F 61 = PUNPCKLWD */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
      delta = dis_SSEint_E_to_G( sorb, delta+2,
                                 "punpcklwd",
                                 Iop_InterleaveLO16x8, True );
      goto decode_success;
   }

   /* 66 0F EF = PXOR -- bitwise XOR of the full 128 bits. */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
      goto decode_success;
   }
11723
11724//--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
11725//--    if (insn[0] == 0x0F && insn[1] == 0xAE
11726//--        && (!epartIsReg(insn[2]))
11727//--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
11728//--       Bool store = gregOfRM(insn[2]) == 0;
11729//--       vg_assert(sz == 4);
11730//--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
11731//--       t1   = LOW24(pair);
11732//--       eip += 2+HI8(pair);
11733//--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
11734//--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
11735//--                   Lit16, (UShort)insn[2],
11736//--                   TempReg, t1 );
11737//--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
11738//--       goto decode_success;
11739//--    }
11740
   /* 0F AE /7 = CLFLUSH -- flush cache line */
   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {

      /* This is something of a hack.  We need to know the size of the
         cache line containing addr.  Since we don't (easily), assume
         256 on the basis that no real cache would have a line that
         big.  It's safe to invalidate more stuff than we need, just
         inefficient. */
      UInt lineszB = 256;

      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
      delta += 2+alen;

      /* Round addr down to the start of the containing block. */
      stmt( IRStmt_Put(
               OFFB_CMSTART,
               binop( Iop_And32,
                      mkexpr(addr),
                      mkU32( ~(lineszB-1) ))) );

      stmt( IRStmt_Put(OFFB_CMLEN, mkU32(lineszB) ) );

      /* End the block here with an invalidate-icache jump, so the
         translation cache discards anything covering this line. */
      jmp_lit(&dres, Ijk_InvalICache, (Addr32)(guest_EIP_bbstart+delta));

      DIP("clflush %s\n", dis_buf);
      goto decode_success;
   }
11769
11770   /* ---------------------------------------------------- */
11771   /* --- end of the SSE2 decoder.                     --- */
11772   /* ---------------------------------------------------- */
11773
11774   /* ---------------------------------------------------- */
11775   /* --- start of the SSE3 decoder.                   --- */
11776   /* ---------------------------------------------------- */
11777
11778   /* Skip parts of the decoder which don't apply given the stated
11779      guest subarchitecture. */
11780   /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
11781   /* In fact this is highly bogus; we accept SSE3 insns even on a
11782      SSE2-only guest since they turn into IR which can be re-emitted
11783      successfully on an SSE2 host. */
11784   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
11785      goto after_sse_decoders; /* no SSE3 capabilities */
11786
11787   insn = (UChar*)&guest_code[delta];
11788
   /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (2:2:0:0). */
   /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (3:3:1:1). */
   if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
       && (insn[2] == 0x12 || insn[2] == 0x16)) {
      IRTemp s3, s2, s1, s0;
      IRTemp sV  = newTemp(Ity_V128);
      Bool   isH = insn[2] == 0x16;   /* 0x16 selects the 'high' (SHDUP) variant */
      s3 = s2 = s1 = s0 = IRTemp_INVALID;

      modrm = insn[3];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg( eregOfRM(modrm)) );
         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
                                  nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;   /* 3 opcode bytes + modrm */
      } else {
         /* Memory operand must be 16-aligned for this insn. */
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
             dis_buf,
             nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }

      /* Split into 32-bit lanes, then rebuild with odd (isH) or even
         lanes duplicated. */
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
      putXMMReg( gregOfRM(modrm),
                 isH ? mk128from32s( s3, s3, s1, s1 )
                     : mk128from32s( s2, s2, s0, s0 ) );
      goto decode_success;
   }

   /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (0:1:0:1). */
   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
      IRTemp sV = newTemp(Ity_V128);
      IRTemp d0 = newTemp(Ity_I64);

      modrm = insn[3];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg( eregOfRM(modrm)) );
         DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
         assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
      } else {
         /* Only 64 bits are loaded, so no 16-alignment check here. */
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movddup %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }

      /* Duplicate the low 64-bit half into both halves of G. */
      putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
      goto decode_success;
   }
11848
   /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
      IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
      IRTemp eV   = newTemp(Ity_V128);
      IRTemp gV   = newTemp(Ity_V128);
      IRTemp addV = newTemp(Ity_V128);
      IRTemp subV = newTemp(Ity_V128);
      IRTemp rm     = newTemp(Ity_I32);
      a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;

      modrm = insn[3];
      if (epartIsReg(modrm)) {
         assign( eV, getXMMReg( eregOfRM(modrm)) );
         DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("addsubps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }

      assign( gV, getXMMReg(gregOfRM(modrm)) );

      /* Compute both G+E and G-E, then pick lanes: odd lanes from the
         add, even lanes from the sub. */
      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
      assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
      assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );

      breakup128to32s( addV, &a3, &a2, &a1, &a0 );
      breakup128to32s( subV, &s3, &s2, &s1, &s0 );

      putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
      goto decode_success;
   }

   /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
      IRTemp eV   = newTemp(Ity_V128);
      IRTemp gV   = newTemp(Ity_V128);
      IRTemp addV = newTemp(Ity_V128);
      IRTemp subV = newTemp(Ity_V128);
      IRTemp a1     = newTemp(Ity_I64);
      IRTemp s0     = newTemp(Ity_I64);
      IRTemp rm     = newTemp(Ity_I32);

      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( eV, getXMMReg( eregOfRM(modrm)) );
         DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("addsubpd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)));
         delta += 2+alen;
      }

      assign( gV, getXMMReg(gregOfRM(modrm)) );

      /* High half gets G+E, low half gets G-E. */
      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
      assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
      assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );

      assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
      assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));

      putXMMReg( gregOfRM(modrm),
                 binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
      goto decode_success;
   }
11923
   /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
       && (insn[2] == 0x7C || insn[2] == 0x7D)) {
      IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
      IRTemp eV     = newTemp(Ity_V128);
      IRTemp gV     = newTemp(Ity_V128);
      IRTemp leftV  = newTemp(Ity_V128);
      IRTemp rightV = newTemp(Ity_V128);
      IRTemp rm     = newTemp(Ity_I32);
      Bool   isAdd  = insn[2] == 0x7C;
      const HChar* str = isAdd ? "add" : "sub";
      e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;

      modrm = insn[3];
      if (epartIsReg(modrm)) {
         assign( eV, getXMMReg( eregOfRM(modrm)) );
         DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
                                   nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("h%sps %s,%s\n", str, dis_buf,
                                   nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }

      assign( gV, getXMMReg(gregOfRM(modrm)) );

      /* Gather the even lanes into leftV and the odd lanes into
         rightV, so a single vertical op computes all the horizontal
         pairwise results at once. */
      breakup128to32s( eV, &e3, &e2, &e1, &e0 );
      breakup128to32s( gV, &g3, &g2, &g1, &g0 );

      assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
      assign( rightV, mk128from32s( e3, e1, g3, g1 ) );

      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
      putXMMReg( gregOfRM(modrm),
                 triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
      goto decode_success;
   }

   /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
      IRTemp e1     = newTemp(Ity_I64);
      IRTemp e0     = newTemp(Ity_I64);
      IRTemp g1     = newTemp(Ity_I64);
      IRTemp g0     = newTemp(Ity_I64);
      IRTemp eV     = newTemp(Ity_V128);
      IRTemp gV     = newTemp(Ity_V128);
      IRTemp leftV  = newTemp(Ity_V128);
      IRTemp rightV = newTemp(Ity_V128);
      IRTemp rm     = newTemp(Ity_I32);
      Bool   isAdd  = insn[1] == 0x7C;
      const HChar* str = isAdd ? "add" : "sub";

      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( eV, getXMMReg( eregOfRM(modrm)) );
         DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
                                   nameXMMReg(gregOfRM(modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("h%spd %s,%s\n", str, dis_buf,
                              nameXMMReg(gregOfRM(modrm)));
         delta += 2+alen;
      }

      assign( gV, getXMMReg(gregOfRM(modrm)) );

      /* Same trick as the PS case: regroup the 64-bit halves so one
         vertical op yields both horizontal results. */
      assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
      assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
      assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
      assign( g0, unop(Iop_V128to64, mkexpr(gV) ));

      assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
      assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );

      assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
      putXMMReg( gregOfRM(modrm),
                 triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
      goto decode_success;
   }
12012
   /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
      modrm = getIByte(delta+3);
      if (epartIsReg(modrm)) {
         /* Register source is not a valid encoding for LDDQU. */
         goto decode_failure;
      } else {
         /* Note: deliberately no 16-alignment check -- LDDQU exists
            precisely to permit unaligned 128-bit loads. */
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         putXMMReg( gregOfRM(modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("lddqu %s,%s\n", dis_buf,
                              nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }
      goto decode_success;
   }
12028
12029   /* ---------------------------------------------------- */
12030   /* --- end of the SSE3 decoder.                     --- */
12031   /* ---------------------------------------------------- */
12032
12033   /* ---------------------------------------------------- */
12034   /* --- start of the SSSE3 decoder.                  --- */
12035   /* ---------------------------------------------------- */
12036
   /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
      Unsigned Bytes (MMX) */
   if (sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
      IRTemp sV        = newTemp(Ity_I64);
      IRTemp dV        = newTemp(Ity_I64);
      IRTemp sVoddsSX  = newTemp(Ity_I64);
      IRTemp sVevensSX = newTemp(Ity_I64);
      IRTemp dVoddsZX  = newTemp(Ity_I64);
      IRTemp dVevensZX = newTemp(Ity_I64);

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
                                  nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmaddubsw %s,%s\n", dis_buf,
                                  nameMMXReg(gregOfRM(modrm)));
      }

      /* compute dV unsigned x sV signed */
      /* Shift tricks extract the bytes of each 16-bit lane:
         arithmetic >>8 sign-extends the odd (high) byte; <<8 then
         arithmetic >>8 sign-extends the even (low) byte; logical
         shifts do the same but zero-extending, for the unsigned dV. */
      assign( sVoddsSX,
              binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
      assign( sVevensSX,
              binop(Iop_SarN16x4,
                    binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
                    mkU8(8)) );
      assign( dVoddsZX,
              binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
      assign( dVevensZX,
              binop(Iop_ShrN16x4,
                    binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
                    mkU8(8)) );

      /* Per 16-bit lane: saturating add of the odd-byte and even-byte
         products. */
      putMMXReg(
         gregOfRM(modrm),
         binop(Iop_QAdd16Sx4,
               binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
               binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
         )
      );
      goto decode_success;
   }

   /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
      Unsigned Bytes (XMM) */
   if (sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
      IRTemp sV        = newTemp(Ity_V128);
      IRTemp dV        = newTemp(Ity_V128);
      IRTemp sVoddsSX  = newTemp(Ity_V128);
      IRTemp sVevensSX = newTemp(Ity_V128);
      IRTemp dVoddsZX  = newTemp(Ity_V128);
      IRTemp dVevensZX = newTemp(Ity_V128);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmaddubsw %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRM(modrm)));
      }

      /* compute dV unsigned x sV signed */
      /* Same shift-based byte extraction as the MMX case, on 16x8
         lanes. */
      assign( sVoddsSX,
              binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
      assign( sVevensSX,
              binop(Iop_SarN16x8,
                    binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
                    mkU8(8)) );
      assign( dVoddsZX,
              binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
      assign( dVevensZX,
              binop(Iop_ShrN16x8,
                    binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
                    mkU8(8)) );

      putXMMReg(
         gregOfRM(modrm),
         binop(Iop_QAdd16Sx8,
               binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
               binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
         )
      );
      goto decode_success;
   }
12140
   /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
      mmx) and G to G (mmx). */
   /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
      mmx) and G to G (mmx). */
   /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
      to G (mmx). */
   /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
      to G (mmx). */
   /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
      to G (mmx). */
   /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
      to G (mmx). */

   if (sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
      const HChar* str = "???";
      IROp   opV64  = Iop_INVALID;
      /* Defaults suit the 16-bit variants; overridden below for the
         32-bit (PHADDD/PHSUBD) cases. */
      IROp   opCatO = Iop_CatOddLanes16x4;
      IROp   opCatE = Iop_CatEvenLanes16x4;
      IRTemp sV     = newTemp(Ity_I64);
      IRTemp dV     = newTemp(Ity_I64);

      modrm = insn[3];

      switch (insn[2]) {
         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
         default: vassert(0);
      }
      if (insn[2] == 0x02 || insn[2] == 0x06) {
         opCatO = Iop_InterleaveHI32x2;
         opCatE = Iop_InterleaveLO32x2;
      }

      do_MMX_preamble();
      assign( dV, getMMXReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
                                  nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("ph%s %s,%s\n", str, dis_buf,
                                  nameMMXReg(gregOfRM(modrm)));
      }

      /* Horizontal op = vertical op applied to (even lanes, odd
         lanes) of the concatenated sources. */
      putMMXReg(
         gregOfRM(modrm),
         binop(opV64,
               binop(opCatE,mkexpr(sV),mkexpr(dV)),
               binop(opCatO,mkexpr(sV),mkexpr(dV))
         )
      );
      goto decode_success;
   }

   /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
      xmm) and G to G (xmm). */
   /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
      xmm) and G to G (xmm). */
   /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
      G to G (xmm). */
   /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
      G to G (xmm). */
   /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
      G to G (xmm). */
   /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
      G to G (xmm). */

   if (sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
      const HChar* str = "???";
      /* 64-bit-wide ops: the 128-bit computation is done as two
         independent 64-bit halves below. */
      IROp   opV64  = Iop_INVALID;
      IROp   opCatO = Iop_CatOddLanes16x4;
      IROp   opCatE = Iop_CatEvenLanes16x4;
      IRTemp sV     = newTemp(Ity_V128);
      IRTemp dV     = newTemp(Ity_V128);
      IRTemp sHi    = newTemp(Ity_I64);
      IRTemp sLo    = newTemp(Ity_I64);
      IRTemp dHi    = newTemp(Ity_I64);
      IRTemp dLo    = newTemp(Ity_I64);

      modrm = insn[3];

      switch (insn[2]) {
         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
         default: vassert(0);
      }
      if (insn[2] == 0x02 || insn[2] == 0x06) {
         opCatO = Iop_InterleaveHI32x2;
         opCatE = Iop_InterleaveLO32x2;
      }

      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg( eregOfRM(modrm)) );
         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
                                  nameXMMReg(gregOfRM(modrm)));
         delta += 3+1;
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("ph%s %s,%s\n", str, dis_buf,
                             nameXMMReg(gregOfRM(modrm)));
         delta += 3+alen;
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      /* This isn't a particularly efficient way to compute the
         result, but at least it avoids a proliferation of IROps,
         hence avoids complicating all the backends. */
      putXMMReg(
         gregOfRM(modrm),
         binop(Iop_64HLtoV128,
               binop(opV64,
                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
               ),
               binop(opV64,
                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
               )
         )
      );
      goto decode_success;
   }
12291
   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
      (MMX) */
   if (sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
      IRTemp sV = newTemp(Ity_I64);
      IRTemp dV = newTemp(Ity_I64);

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
                                 nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmulhrsw %s,%s\n", dis_buf,
                                 nameMMXReg(gregOfRM(modrm)));
      }

      /* Per-lane round-and-scale multiply is done by the shared
         helper (defined elsewhere in this file). */
      putMMXReg(
         gregOfRM(modrm),
         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
      );
      goto decode_success;
   }

   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
      Scale (XMM) */
   if (sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
      IRTemp sV  = newTemp(Ity_V128);
      IRTemp dV  = newTemp(Ity_V128);
      IRTemp sHi = newTemp(Ity_I64);
      IRTemp sLo = newTemp(Ity_I64);
      IRTemp dHi = newTemp(Ity_I64);
      IRTemp dLo = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
                                 nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmulhrsw %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRM(modrm)));
      }

      /* Apply the 64-bit helper to each half of the 128-bit values. */
      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRM(modrm),
         binop(Iop_64HLtoV128,
               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
         )
      );
      goto decode_success;
   }
12365
   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   if (sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_I64);
      IRTemp dV      = newTemp(Ity_I64);
      const HChar* str = "???";
      Int    laneszB = 0;   /* lane width in bytes, selects the variant */

      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
                                     nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameMMXReg(gregOfRM(modrm)));
      }

      /* Lane-wise sign application is done by the shared helper
         (defined elsewhere in this file). */
      putMMXReg(
         gregOfRM(modrm),
         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
      );
      goto decode_success;
   }

   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   if (sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_V128);
      IRTemp dV      = newTemp(Ity_V128);
      IRTemp sHi     = newTemp(Ity_I64);
      IRTemp sLo     = newTemp(Ity_I64);
      IRTemp dHi     = newTemp(Ity_I64);
      IRTemp dLo     = newTemp(Ity_I64);
      const HChar* str = "???";
      Int    laneszB = 0;

      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
                                     nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameXMMReg(gregOfRM(modrm)));
      }

      /* Apply the 64-bit helper to each half of the 128-bit values. */
      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRM(modrm),
         binop(Iop_64HLtoV128,
               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
         )
      );
      goto decode_success;
   }
12461
   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   if (sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_I64);
      const HChar* str = "???";
      Int    laneszB = 0;   /* lane width in bytes, selects the variant */

      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      /* Unary op: only the source is read; G is overwritten. */
      modrm = insn[3];
      do_MMX_preamble();

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
                                    nameMMXReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameMMXReg(gregOfRM(modrm)));
      }

      putMMXReg(
         gregOfRM(modrm),
         dis_PABS_helper( mkexpr(sV), laneszB )
      );
      goto decode_success;
   }

   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   if (sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_V128);
      IRTemp sHi     = newTemp(Ity_I64);
      IRTemp sLo     = newTemp(Ity_I64);
      const HChar* str = "???";
      Int    laneszB = 0;

      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRM(modrm)) );
         delta += 3+1;
         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
                                    nameXMMReg(gregOfRM(modrm)));
      } else {
         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameXMMReg(gregOfRM(modrm)));
      }

      /* Apply the 64-bit helper to each half of the 128-bit value. */
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRM(modrm),
         binop(Iop_64HLtoV128,
               dis_PABS_helper( mkexpr(sHi), laneszB ),
               dis_PABS_helper( mkexpr(sLo), laneszB )
         )
      );
      goto decode_success;
   }
12549
12550   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
12551   if (sz == 4
12552       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12553      IRTemp sV  = newTemp(Ity_I64);
12554      IRTemp dV  = newTemp(Ity_I64);
12555      IRTemp res = newTemp(Ity_I64);
12556
12557      modrm = insn[3];
12558      do_MMX_preamble();
12559      assign( dV, getMMXReg(gregOfRM(modrm)) );
12560
12561      if (epartIsReg(modrm)) {
12562         assign( sV, getMMXReg(eregOfRM(modrm)) );
12563         d32 = (UInt)insn[3+1];
12564         delta += 3+1+1;
12565         DIP("palignr $%d,%s,%s\n",  (Int)d32,
12566                                     nameMMXReg(eregOfRM(modrm)),
12567                                     nameMMXReg(gregOfRM(modrm)));
12568      } else {
12569         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12570         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12571         d32 = (UInt)insn[3+alen];
12572         delta += 3+alen+1;
12573         DIP("palignr $%d%s,%s\n", (Int)d32,
12574                                   dis_buf,
12575                                   nameMMXReg(gregOfRM(modrm)));
12576      }
12577
12578      if (d32 == 0) {
12579         assign( res, mkexpr(sV) );
12580      }
12581      else if (d32 >= 1 && d32 <= 7) {
12582         assign(res,
12583                binop(Iop_Or64,
12584                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
12585                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
12586                     )));
12587      }
12588      else if (d32 == 8) {
12589        assign( res, mkexpr(dV) );
12590      }
12591      else if (d32 >= 9 && d32 <= 15) {
12592         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
12593      }
12594      else if (d32 >= 16 && d32 <= 255) {
12595         assign( res, mkU64(0) );
12596      }
12597      else
12598         vassert(0);
12599
12600      putMMXReg( gregOfRM(modrm), mkexpr(res) );
12601      goto decode_success;
12602   }
12603
12604   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
12605   if (sz == 2
12606       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12607      IRTemp sV  = newTemp(Ity_V128);
12608      IRTemp dV  = newTemp(Ity_V128);
12609      IRTemp sHi = newTemp(Ity_I64);
12610      IRTemp sLo = newTemp(Ity_I64);
12611      IRTemp dHi = newTemp(Ity_I64);
12612      IRTemp dLo = newTemp(Ity_I64);
12613      IRTemp rHi = newTemp(Ity_I64);
12614      IRTemp rLo = newTemp(Ity_I64);
12615
12616      modrm = insn[3];
12617      assign( dV, getXMMReg(gregOfRM(modrm)) );
12618
12619      if (epartIsReg(modrm)) {
12620         assign( sV, getXMMReg(eregOfRM(modrm)) );
12621         d32 = (UInt)insn[3+1];
12622         delta += 3+1+1;
12623         DIP("palignr $%d,%s,%s\n", (Int)d32,
12624                                    nameXMMReg(eregOfRM(modrm)),
12625                                    nameXMMReg(gregOfRM(modrm)));
12626      } else {
12627         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12628         gen_SEGV_if_not_16_aligned( addr );
12629         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12630         d32 = (UInt)insn[3+alen];
12631         delta += 3+alen+1;
12632         DIP("palignr $%d,%s,%s\n", (Int)d32,
12633                                    dis_buf,
12634                                    nameXMMReg(gregOfRM(modrm)));
12635      }
12636
12637      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12638      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12639      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12640      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12641
12642      if (d32 == 0) {
12643         assign( rHi, mkexpr(sHi) );
12644         assign( rLo, mkexpr(sLo) );
12645      }
12646      else if (d32 >= 1 && d32 <= 7) {
12647         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
12648         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
12649      }
12650      else if (d32 == 8) {
12651         assign( rHi, mkexpr(dLo) );
12652         assign( rLo, mkexpr(sHi) );
12653      }
12654      else if (d32 >= 9 && d32 <= 15) {
12655         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
12656         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
12657      }
12658      else if (d32 == 16) {
12659         assign( rHi, mkexpr(dHi) );
12660         assign( rLo, mkexpr(dLo) );
12661      }
12662      else if (d32 >= 17 && d32 <= 23) {
12663         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
12664         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
12665      }
12666      else if (d32 == 24) {
12667         assign( rHi, mkU64(0) );
12668         assign( rLo, mkexpr(dHi) );
12669      }
12670      else if (d32 >= 25 && d32 <= 31) {
12671         assign( rHi, mkU64(0) );
12672         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
12673      }
12674      else if (d32 >= 32 && d32 <= 255) {
12675         assign( rHi, mkU64(0) );
12676         assign( rLo, mkU64(0) );
12677      }
12678      else
12679         vassert(0);
12680
12681      putXMMReg(
12682         gregOfRM(modrm),
12683         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12684      );
12685      goto decode_success;
12686   }
12687
12688   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
12689   if (sz == 4
12690       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12691      IRTemp sV      = newTemp(Ity_I64);
12692      IRTemp dV      = newTemp(Ity_I64);
12693
12694      modrm = insn[3];
12695      do_MMX_preamble();
12696      assign( dV, getMMXReg(gregOfRM(modrm)) );
12697
12698      if (epartIsReg(modrm)) {
12699         assign( sV, getMMXReg(eregOfRM(modrm)) );
12700         delta += 3+1;
12701         DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12702                               nameMMXReg(gregOfRM(modrm)));
12703      } else {
12704         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12705         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12706         delta += 3+alen;
12707         DIP("pshufb %s,%s\n", dis_buf,
12708                               nameMMXReg(gregOfRM(modrm)));
12709      }
12710
12711      putMMXReg(
12712         gregOfRM(modrm),
12713         binop(
12714            Iop_And64,
12715            /* permute the lanes */
12716            binop(
12717               Iop_Perm8x8,
12718               mkexpr(dV),
12719               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
12720            ),
12721            /* mask off lanes which have (index & 0x80) == 0x80 */
12722            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
12723         )
12724      );
12725      goto decode_success;
12726   }
12727
12728   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
12729   if (sz == 2
12730       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12731      IRTemp sV         = newTemp(Ity_V128);
12732      IRTemp dV         = newTemp(Ity_V128);
12733      IRTemp sHi        = newTemp(Ity_I64);
12734      IRTemp sLo        = newTemp(Ity_I64);
12735      IRTemp dHi        = newTemp(Ity_I64);
12736      IRTemp dLo        = newTemp(Ity_I64);
12737      IRTemp rHi        = newTemp(Ity_I64);
12738      IRTemp rLo        = newTemp(Ity_I64);
12739      IRTemp sevens     = newTemp(Ity_I64);
12740      IRTemp mask0x80hi = newTemp(Ity_I64);
12741      IRTemp mask0x80lo = newTemp(Ity_I64);
12742      IRTemp maskBit3hi = newTemp(Ity_I64);
12743      IRTemp maskBit3lo = newTemp(Ity_I64);
12744      IRTemp sAnd7hi    = newTemp(Ity_I64);
12745      IRTemp sAnd7lo    = newTemp(Ity_I64);
12746      IRTemp permdHi    = newTemp(Ity_I64);
12747      IRTemp permdLo    = newTemp(Ity_I64);
12748
12749      modrm = insn[3];
12750      assign( dV, getXMMReg(gregOfRM(modrm)) );
12751
12752      if (epartIsReg(modrm)) {
12753         assign( sV, getXMMReg(eregOfRM(modrm)) );
12754         delta += 3+1;
12755         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12756                               nameXMMReg(gregOfRM(modrm)));
12757      } else {
12758         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12759         gen_SEGV_if_not_16_aligned( addr );
12760         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12761         delta += 3+alen;
12762         DIP("pshufb %s,%s\n", dis_buf,
12763                               nameXMMReg(gregOfRM(modrm)));
12764      }
12765
12766      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12767      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12768      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12769      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12770
12771      assign( sevens, mkU64(0x0707070707070707ULL) );
12772
12773      /*
12774      mask0x80hi = Not(SarN8x8(sHi,7))
12775      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
12776      sAnd7hi    = And(sHi,sevens)
12777      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
12778                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
12779      rHi        = And(permdHi,mask0x80hi)
12780      */
12781      assign(
12782         mask0x80hi,
12783         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
12784
12785      assign(
12786         maskBit3hi,
12787         binop(Iop_SarN8x8,
12788               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
12789               mkU8(7)));
12790
12791      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
12792
12793      assign(
12794         permdHi,
12795         binop(
12796            Iop_Or64,
12797            binop(Iop_And64,
12798                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
12799                  mkexpr(maskBit3hi)),
12800            binop(Iop_And64,
12801                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
12802                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));
12803
12804      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
12805
12806      /* And the same for the lower half of the result.  What fun. */
12807
12808      assign(
12809         mask0x80lo,
12810         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
12811
12812      assign(
12813         maskBit3lo,
12814         binop(Iop_SarN8x8,
12815               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
12816               mkU8(7)));
12817
12818      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
12819
12820      assign(
12821         permdLo,
12822         binop(
12823            Iop_Or64,
12824            binop(Iop_And64,
12825                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
12826                  mkexpr(maskBit3lo)),
12827            binop(Iop_And64,
12828                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
12829                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));
12830
12831      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
12832
12833      putXMMReg(
12834         gregOfRM(modrm),
12835         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12836      );
12837      goto decode_success;
12838   }
12839
12840   /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
12841   /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
12842   if ((sz == 2 || sz == 4)
12843       && insn[0] == 0x0F && insn[1] == 0x38
12844       && (insn[2] == 0xF0 || insn[2] == 0xF1)
12845       && !epartIsReg(insn[3])) {
12846
12847      modrm = insn[3];
12848      addr = disAMode(&alen, sorb, delta + 3, dis_buf);
12849      delta += 3 + alen;
12850      ty = szToITy(sz);
12851      IRTemp src = newTemp(ty);
12852
12853      if (insn[2] == 0xF0) { /* LOAD */
12854         assign(src, loadLE(ty, mkexpr(addr)));
12855         IRTemp dst = math_BSWAP(src, ty);
12856         putIReg(sz, gregOfRM(modrm), mkexpr(dst));
12857         DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
12858      } else { /* STORE */
12859         assign(src, getIReg(sz, gregOfRM(modrm)));
12860         IRTemp dst = math_BSWAP(src, ty);
12861         storeLE(mkexpr(addr), mkexpr(dst));
12862         DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
12863      }
12864      goto decode_success;
12865   }
12866
12867   /* ---------------------------------------------------- */
12868   /* --- end of the SSSE3 decoder.                    --- */
12869   /* ---------------------------------------------------- */
12870
12871   /* ---------------------------------------------------- */
12872   /* --- start of the SSE4 decoder                    --- */
12873   /* ---------------------------------------------------- */
12874
12875   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
12876      (Partial implementation only -- only deal with cases where
12877      the rounding mode is specified directly by the immediate byte.)
12878      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
12879      (Limitations ditto)
12880   */
12881   if (sz == 2
12882       && insn[0] == 0x0F && insn[1] == 0x3A
12883       && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
12884
12885      Bool   isD = insn[2] == 0x0B;
12886      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
12887      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
12888      Int    imm = 0;
12889
12890      modrm = insn[3];
12891
12892      if (epartIsReg(modrm)) {
12893         assign( src,
12894                 isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
12895                     : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
12896         imm = insn[3+1];
12897         if (imm & ~3) goto decode_failure;
12898         delta += 3+1+1;
12899         DIP( "rounds%c $%d,%s,%s\n",
12900              isD ? 'd' : 's',
12901              imm, nameXMMReg( eregOfRM(modrm) ),
12902                   nameXMMReg( gregOfRM(modrm) ) );
12903      } else {
12904         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12905         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
12906         imm = insn[3+alen];
12907         if (imm & ~3) goto decode_failure;
12908         delta += 3+alen+1;
12909         DIP( "roundsd $%d,%s,%s\n",
12910              imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
12911      }
12912
12913      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
12914         that encoding is the same as the encoding for IRRoundingMode,
12915         we can use that value directly in the IR as a rounding
12916         mode. */
12917      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
12918                  mkU32(imm & 3), mkexpr(src)) );
12919
12920      if (isD)
12921         putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
12922      else
12923         putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
12924
12925      goto decode_success;
12926   }
12927
12928   /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
12929      which we can only decode if we're sure this is an AMD cpu that
12930      supports LZCNT, since otherwise it's BSR, which behaves
12931      differently. */
12932   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
12933       && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
12934      vassert(sz == 2 || sz == 4);
12935      /*IRType*/ ty  = szToITy(sz);
12936      IRTemp     src = newTemp(ty);
12937      modrm = insn[3];
12938      if (epartIsReg(modrm)) {
12939         assign(src, getIReg(sz, eregOfRM(modrm)));
12940         delta += 3+1;
12941         DIP("lzcnt%c %s, %s\n", nameISize(sz),
12942             nameIReg(sz, eregOfRM(modrm)),
12943             nameIReg(sz, gregOfRM(modrm)));
12944      } else {
12945         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12946         assign(src, loadLE(ty, mkexpr(addr)));
12947         delta += 3+alen;
12948         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
12949             nameIReg(sz, gregOfRM(modrm)));
12950      }
12951
12952      IRTemp res = gen_LZCNT(ty, src);
12953      putIReg(sz, gregOfRM(modrm), mkexpr(res));
12954
12955      // Update flags.  This is pretty lame .. perhaps can do better
12956      // if this turns out to be performance critical.
12957      // O S A P are cleared.  Z is set if RESULT == 0.
12958      // C is set if SRC is zero.
12959      IRTemp src32 = newTemp(Ity_I32);
12960      IRTemp res32 = newTemp(Ity_I32);
12961      assign(src32, widenUto32(mkexpr(src)));
12962      assign(res32, widenUto32(mkexpr(res)));
12963
12964      IRTemp oszacp = newTemp(Ity_I32);
12965      assign(
12966         oszacp,
12967         binop(Iop_Or32,
12968               binop(Iop_Shl32,
12969                     unop(Iop_1Uto32,
12970                          binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
12971                     mkU8(X86G_CC_SHIFT_Z)),
12972               binop(Iop_Shl32,
12973                     unop(Iop_1Uto32,
12974                          binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
12975                     mkU8(X86G_CC_SHIFT_C))
12976         )
12977      );
12978
12979      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12980      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12981      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12982      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
12983
12984      goto decode_success;
12985   }
12986
12987   /* ---------------------------------------------------- */
12988   /* --- end of the SSE4 decoder                      --- */
12989   /* ---------------------------------------------------- */
12990
12991   after_sse_decoders:
12992
12993   /* ---------------------------------------------------- */
12994   /* --- deal with misc 0x67 pfxs (addr size override) -- */
12995   /* ---------------------------------------------------- */
12996
12997   /* 67 E3 = JCXZ (for JECXZ see below) */
12998   if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
12999      delta += 2;
13000      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13001      delta ++;
13002      stmt( IRStmt_Exit(
13003               binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
13004               Ijk_Boring,
13005               IRConst_U32(d32),
13006               OFFB_EIP
13007            ));
13008       DIP("jcxz 0x%x\n", d32);
13009       goto decode_success;
13010   }
13011
13012   /* ---------------------------------------------------- */
13013   /* --- start of the baseline insn decoder            -- */
13014   /* ---------------------------------------------------- */
13015
13016   /* Get the primary opcode. */
13017   opc = getIByte(delta); delta++;
13018
13019   /* We get here if the current insn isn't SSE, or this CPU doesn't
13020      support SSE. */
13021
13022   switch (opc) {
13023
13024   /* ------------------------ Control flow --------------- */
13025
13026   case 0xC2: /* RET imm16 */
13027      d32 = getUDisp16(delta);
13028      delta += 2;
13029      dis_ret(&dres, d32);
13030      DIP("ret %d\n", (Int)d32);
13031      break;
13032   case 0xC3: /* RET */
13033      dis_ret(&dres, 0);
13034      DIP("ret\n");
13035      break;
13036
13037   case 0xCF: /* IRET */
13038      /* Note, this is an extremely kludgey and limited implementation
13039         of iret.  All it really does is:
13040            popl %EIP; popl %CS; popl %EFLAGS.
13041         %CS is set but ignored (as it is in (eg) popw %cs). */
13042      t1 = newTemp(Ity_I32); /* ESP */
13043      t2 = newTemp(Ity_I32); /* new EIP */
13044      t3 = newTemp(Ity_I32); /* new CS */
13045      t4 = newTemp(Ity_I32); /* new EFLAGS */
13046      assign(t1, getIReg(4,R_ESP));
13047      assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
13048      assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
13049      assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
13050      /* Get stuff off stack */
13051      putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
13052      /* set %CS (which is ignored anyway) */
13053      putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
13054      /* set %EFLAGS */
13055      set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
13056      /* goto new EIP value */
13057      jmp_treg(&dres, Ijk_Ret, t2);
13058      vassert(dres.whatNext == Dis_StopHere);
13059      DIP("iret (very kludgey)\n");
13060      break;
13061
13062   case 0xE8: /* CALL J4 */
13063      d32 = getUDisp32(delta); delta += 4;
13064      d32 += (guest_EIP_bbstart+delta);
13065      /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
13066      if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
13067                                         && getIByte(delta) <= 0x5F) {
13068         /* Specially treat the position-independent-code idiom
13069                 call X
13070              X: popl %reg
13071            as
13072                 movl %eip, %reg.
13073            since this generates better code, but for no other reason. */
13074         Int archReg = getIByte(delta) - 0x58;
13075         /* vex_printf("-- fPIC thingy\n"); */
13076         putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
13077         delta++; /* Step over the POP */
13078         DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
13079      } else {
13080         /* The normal sequence for a call. */
13081         t1 = newTemp(Ity_I32);
13082         assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
13083         putIReg(4, R_ESP, mkexpr(t1));
13084         storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
13085         if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
13086            /* follow into the call target. */
13087            dres.whatNext   = Dis_ResteerU;
13088            dres.continueAt = (Addr64)(Addr32)d32;
13089         } else {
13090            jmp_lit(&dres, Ijk_Call, d32);
13091            vassert(dres.whatNext == Dis_StopHere);
13092         }
13093         DIP("call 0x%x\n",d32);
13094      }
13095      break;
13096
13097//--    case 0xC8: /* ENTER */
13098//--       d32 = getUDisp16(eip); eip += 2;
13099//--       abyte = getIByte(delta); delta++;
13100//--
13101//--       vg_assert(sz == 4);
13102//--       vg_assert(abyte == 0);
13103//--
13104//--       t1 = newTemp(cb); t2 = newTemp(cb);
13105//--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
13106//--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
13107//--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
13108//--       uLiteral(cb, sz);
13109//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
13110//--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
13111//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
13112//--       if (d32) {
13113//--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
13114//--          uLiteral(cb, d32);
13115//--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
13116//--       }
13117//--       DIP("enter 0x%x, 0x%x", d32, abyte);
13118//--       break;
13119
13120   case 0xC9: /* LEAVE */
13121      vassert(sz == 4);
13122      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
13123      assign(t1, getIReg(4,R_EBP));
13124      /* First PUT ESP looks redundant, but need it because ESP must
13125         always be up-to-date for Memcheck to work... */
13126      putIReg(4, R_ESP, mkexpr(t1));
13127      assign(t2, loadLE(Ity_I32,mkexpr(t1)));
13128      putIReg(4, R_EBP, mkexpr(t2));
13129      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
13130      DIP("leave\n");
13131      break;
13132
13133   /* ---------------- Misc weird-ass insns --------------- */
13134
13135   case 0x27: /* DAA */
13136   case 0x2F: /* DAS */
13137   case 0x37: /* AAA */
13138   case 0x3F: /* AAS */
13139      /* An ugly implementation for some ugly instructions.  Oh
13140         well. */
13141      if (sz != 4) goto decode_failure;
13142      t1 = newTemp(Ity_I32);
13143      t2 = newTemp(Ity_I32);
13144      /* Make up a 32-bit value (t1), with the old value of AX in the
13145         bottom 16 bits, and the old OSZACP bitmask in the upper 16
13146         bits. */
13147      assign(t1,
13148             binop(Iop_16HLto32,
13149                   unop(Iop_32to16,
13150                        mk_x86g_calculate_eflags_all()),
13151                   getIReg(2, R_EAX)
13152            ));
13153      /* Call the helper fn, to get a new AX and OSZACP value, and
13154         poke both back into the guest state.  Also pass the helper
13155         the actual opcode so it knows which of the 4 instructions it
13156         is doing the computation for. */
13157      vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
13158      assign(t2,
13159              mkIRExprCCall(
13160                 Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
13161                 &x86g_calculate_daa_das_aaa_aas,
13162                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
13163            ));
13164     putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
13165
13166     stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13167     stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13168     stmt( IRStmt_Put( OFFB_CC_DEP1,
13169                       binop(Iop_And32,
13170                             binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
13171                             mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
13172                                    | X86G_CC_MASK_A | X86G_CC_MASK_Z
13173                                    | X86G_CC_MASK_S| X86G_CC_MASK_O )
13174                            )
13175                      )
13176         );
13177     /* Set NDEP even though it isn't used.  This makes redundant-PUT
13178        elimination of previous stores to this field work better. */
13179     stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13180     switch (opc) {
13181        case 0x27: DIP("daa\n"); break;
13182        case 0x2F: DIP("das\n"); break;
13183        case 0x37: DIP("aaa\n"); break;
13184        case 0x3F: DIP("aas\n"); break;
13185        default: vassert(0);
13186     }
13187     break;
13188
13189   case 0xD4: /* AAM */
13190   case 0xD5: /* AAD */
13191      d32 = getIByte(delta); delta++;
13192      if (sz != 4 || d32 != 10) goto decode_failure;
13193      t1 = newTemp(Ity_I32);
13194      t2 = newTemp(Ity_I32);
13195      /* Make up a 32-bit value (t1), with the old value of AX in the
13196         bottom 16 bits, and the old OSZACP bitmask in the upper 16
13197         bits. */
13198      assign(t1,
13199             binop(Iop_16HLto32,
13200                   unop(Iop_32to16,
13201                        mk_x86g_calculate_eflags_all()),
13202                   getIReg(2, R_EAX)
13203            ));
13204      /* Call the helper fn, to get a new AX and OSZACP value, and
13205         poke both back into the guest state.  Also pass the helper
13206         the actual opcode so it knows which of the 2 instructions it
13207         is doing the computation for. */
13208      assign(t2,
13209              mkIRExprCCall(
13210                 Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
13211                 &x86g_calculate_aad_aam,
13212                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
13213            ));
13214      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
13215
13216      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13217      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13218      stmt( IRStmt_Put( OFFB_CC_DEP1,
13219                        binop(Iop_And32,
13220                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
13221                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
13222                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
13223                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
13224                             )
13225                       )
13226          );
13227      /* Set NDEP even though it isn't used.  This makes
13228         redundant-PUT elimination of previous stores to this field
13229         work better. */
13230      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13231
13232      DIP(opc == 0xD4 ? "aam\n" : "aad\n");
13233      break;
13234
13235   /* ------------------------ CWD/CDQ -------------------- */
13236
13237   case 0x98: /* CBW */
13238      if (sz == 4) {
13239         putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
13240         DIP("cwde\n");
13241      } else {
13242         vassert(sz == 2);
13243         putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
13244         DIP("cbw\n");
13245      }
13246      break;
13247
13248   case 0x99: /* CWD/CDQ */
13249      ty = szToITy(sz);
13250      putIReg(sz, R_EDX,
13251                  binop(mkSizedOp(ty,Iop_Sar8),
13252                        getIReg(sz, R_EAX),
13253                        mkU8(sz == 2 ? 15 : 31)) );
13254      DIP(sz == 2 ? "cwdq\n" : "cdqq\n");
13255      break;
13256
13257   /* ------------------------ FPU ops -------------------- */
13258
13259   case 0x9E: /* SAHF */
13260      codegen_SAHF();
13261      DIP("sahf\n");
13262      break;
13263
13264   case 0x9F: /* LAHF */
13265      codegen_LAHF();
13266      DIP("lahf\n");
13267      break;
13268
13269   case 0x9B: /* FWAIT */
13270      /* ignore? */
13271      DIP("fwait\n");
13272      break;
13273
13274   case 0xD8:
13275   case 0xD9:
13276   case 0xDA:
13277   case 0xDB:
13278   case 0xDC:
13279   case 0xDD:
13280   case 0xDE:
13281   case 0xDF: {
13282      Int  delta0    = delta;
13283      Bool decode_OK = False;
13284      delta = dis_FPU ( &decode_OK, sorb, delta );
13285      if (!decode_OK) {
13286         delta = delta0;
13287         goto decode_failure;
13288      }
13289      break;
13290   }
13291
13292   /* ------------------------ INC & DEC ------------------ */
13293
13294   case 0x40: /* INC eAX */
13295   case 0x41: /* INC eCX */
13296   case 0x42: /* INC eDX */
13297   case 0x43: /* INC eBX */
13298   case 0x44: /* INC eSP */
13299   case 0x45: /* INC eBP */
13300   case 0x46: /* INC eSI */
13301   case 0x47: /* INC eDI */
13302      vassert(sz == 2 || sz == 4);
13303      ty = szToITy(sz);
13304      t1 = newTemp(ty);
13305      assign( t1, binop(mkSizedOp(ty,Iop_Add8),
13306                        getIReg(sz, (UInt)(opc - 0x40)),
13307                        mkU(ty,1)) );
13308      setFlags_INC_DEC( True, t1, ty );
13309      putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
13310      DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
13311      break;
13312
   case 0x48: /* DEC eAX */
   case 0x49: /* DEC eCX */
   case 0x4A: /* DEC eDX */
   case 0x4B: /* DEC eBX */
   case 0x4C: /* DEC eSP */
   case 0x4D: /* DEC eBP */
   case 0x4E: /* DEC eSI */
   case 0x4F: /* DEC eDI */
      /* Mirror image of the INC cases above; DEC also leaves CF
         unchanged, hence setFlags_INC_DEC with False. */
      vassert(sz == 2 || sz == 4);
      ty = szToITy(sz);
      t1 = newTemp(ty);
      assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
                        getIReg(sz, (UInt)(opc - 0x48)),
                        mkU(ty,1)) );
      setFlags_INC_DEC( False, t1, ty );
      putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
      DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
      break;
13331
13332   /* ------------------------ INT ------------------------ */
13333
   case 0xCC: /* INT 3 -- deliver SIGTRAP with the EIP of the *next*
                 instruction, matching hardware behaviour. */
      jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
      vassert(dres.whatNext == Dis_StopHere);
      DIP("int $0x3\n");
      break;
13339
   case 0xCD: /* INT imm8 */
      d32 = getIByte(delta); delta++;

      /* For any of the cases where we emit a jump (that is, for all
         currently handled cases), it's important that all ArchRegs
         carry their up-to-date value at this point.  So we declare an
         end-of-block here, which forces any TempRegs caching ArchRegs
         to be flushed. */

      /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
         restart of this instruction (hence the "-2" two lines below,
         to get the restart EIP to be this instruction.  This is
         probably Linux-specific and it would be more correct to only
         do this if the VexAbiInfo says that is what we should do.
         This used to handle just 0x40-0x43; Jikes RVM uses a larger
         range (0x3F-0x49), and this allows some slack as well. */
      if (d32 >= 0x3F && d32 <= 0x4F) {
         /* "int imm8" is 2 bytes (CD ib), so delta-2 is this
            instruction's own address. */
         jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
         vassert(dres.whatNext == Dis_StopHere);
         DIP("int $0x%x\n", (Int)d32);
         break;
      }

      /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
         (darwin syscalls).  As part of this, note where we are, so we
         can back up the guest to this point if the syscall needs to
         be restarted. */
      if (d32 == 0x80) {
         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
                           mkU32(guest_EIP_curr_instr) ) );
         jmp_lit(&dres, Ijk_Sys_int128, ((Addr32)guest_EIP_bbstart)+delta);
         vassert(dres.whatNext == Dis_StopHere);
         DIP("int $0x80\n");
         break;
      }
      if (d32 == 0x81) {
         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
                           mkU32(guest_EIP_curr_instr) ) );
         jmp_lit(&dres, Ijk_Sys_int129, ((Addr32)guest_EIP_bbstart)+delta);
         vassert(dres.whatNext == Dis_StopHere);
         DIP("int $0x81\n");
         break;
      }
      if (d32 == 0x82) {
         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
                           mkU32(guest_EIP_curr_instr) ) );
         jmp_lit(&dres, Ijk_Sys_int130, ((Addr32)guest_EIP_bbstart)+delta);
         vassert(dres.whatNext == Dis_StopHere);
         DIP("int $0x82\n");
         break;
      }

      /* none of the above */
      goto decode_failure;
13394
13395   /* ------------------------ Jcond, byte offset --------- */
13396
   case 0xEB: /* Jb (jump, byte offset) */
      /* Target = address of next insn + sign-extended 8-bit disp.
         If the callback approves, chase into the target instead of
         ending the block. */
      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
      delta++;
      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = (Addr64)(Addr32)d32;
      } else {
         jmp_lit(&dres, Ijk_Boring, d32);
         vassert(dres.whatNext == Dis_StopHere);
      }
      DIP("jmp-8 0x%x\n", d32);
      break;
13409
   case 0xE9: /* Jv (jump, 16/32 offset) */
      vassert(sz == 4); /* JRS added 2004 July 11 */
      d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
      delta += sz;
      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = (Addr64)(Addr32)d32;
      } else {
         jmp_lit(&dres, Ijk_Boring, d32);
         vassert(dres.whatNext == Dis_StopHere);
      }
      DIP("jmp 0x%x\n", d32);
      break;
13423
   case 0x70: /* JOb (jump overflow) */
   case 0x71: /* JNOb (jump no overflow) */
   case 0x72: /* JBb/JNAEb (jump below) */
   case 0x73: /* JNBb/JAEb (jump not below) */
   case 0x74: /* JZb/JEb (jump zero) */
   case 0x75: /* JNZb/JNEb (jump not zero) */
   case 0x76: /* JBEb/JNAb (jump below or equal) */
   case 0x77: /* JNBEb/JAb (jump not below or equal) */
   case 0x78: /* JSb (jump negative) */
   case 0x79: /* JNSb (jump not negative) */
   case 0x7A: /* JP (jump parity even) */
   case 0x7B: /* JNP/JPO (jump parity odd) */
   case 0x7C: /* JLb/JNGEb (jump less) */
   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   case 0x7E: /* JLEb/JNGb (jump less or equal) */
   case 0x7F: /* JGb/JNLEb (jump greater) */
    { Int    jmpDelta;
      const HChar* comment  = "";
      /* The condition code is (opc - 0x70); XOR-ing with 1 negates
         it, since X86Condcode pairs each condition with its inverse
         in adjacent values. */
      jmpDelta = (Int)getSDisp8(delta);
      vassert(-128 <= jmpDelta && jmpDelta < 128);
      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
      delta++;
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
          && jmpDelta < 0
          && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
         /* Speculation: assume this backward branch is taken.  So we
            need to emit a side-exit to the insn following this one,
            on the negation of the condition, and continue at the
            branch target address (d32).  If we wind up back at the
            first instruction of the trace, just stop; it's better to
            let the IR loop unroller handle that case. */
         stmt( IRStmt_Exit(
                  mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
                  Ijk_Boring,
                  IRConst_U32(guest_EIP_bbstart+delta),
                  OFFB_EIP ) );
         dres.whatNext   = Dis_ResteerC;
         dres.continueAt = (Addr64)(Addr32)d32;
         comment = "(assumed taken)";
      }
      else
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
          && jmpDelta >= 0
          && resteerOkFn( callback_opaque,
                          (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
         /* Speculation: assume this forward branch is not taken.  So
            we need to emit a side-exit to d32 (the dest) and continue
            disassembling at the insn immediately following this
            one. */
         stmt( IRStmt_Exit(
                  mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
                  Ijk_Boring,
                  IRConst_U32(d32),
                  OFFB_EIP ) );
         dres.whatNext   = Dis_ResteerC;
         dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
         comment = "(assumed not taken)";
      }
      else {
         /* Conservative default translation - end the block at this
            point. */
         jcc_01( &dres, (X86Condcode)(opc - 0x70),
                 (Addr32)(guest_EIP_bbstart+delta), d32);
         vassert(dres.whatNext == Dis_StopHere);
      }
      DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
      break;
    }
13496
   case 0xE3: /* JECXZ (for JCXZ see above) */
      /* Conditional side-exit when %ecx == 0; does not end the
         block, so disassembly continues at the next insn. */
      if (sz != 4) goto decode_failure;
      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
      delta ++;
      stmt( IRStmt_Exit(
               binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
            Ijk_Boring,
            IRConst_U32(d32),
            OFFB_EIP
          ));
      DIP("jecxz 0x%x\n", d32);
      break;
13509
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* Again, the docs say this uses ECX/CX as a count depending on
         the address size override, not the operand one.  Since we
         don't handle address size overrides, I guess that means
         ECX. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      const HChar* xtra = NULL;

      if (sz != 4) goto decode_failure;
      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
      delta++;
      /* Decrement %ecx first; the jump condition below reads the
         post-decrement value. */
      putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));

      count = getIReg(4,R_ECX);
      cond = binop(Iop_CmpNE32, count, mkU32(0));
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            xtra = "e";
            zbit = mk_x86g_calculate_condition( X86CondZ );
	    cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_x86g_calculate_condition( X86CondNZ );
	    cond = mkAnd1(cond, zbit);
            break;
         default:
	    vassert(0);
      }
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );

      DIP("loop%s 0x%x\n", xtra, d32);
      break;
    }
13551
13552   /* ------------------------ IMUL ----------------------- */
13553
   case 0x69: /* IMUL Iv, Ev, Gv -- immediate is full operand size */
      delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
      break;
   case 0x6B: /* IMUL Ib, Ev, Gv -- immediate is 1 byte, sign-extended */
      delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
      break;
13560
13561   /* ------------------------ MOV ------------------------ */
13562
   /* Plain register/memory moves, both directions, byte and
      word/dword sizes; helpers handle the ModRM decode. */
   case 0x88: /* MOV Gb,Eb */
      delta = dis_mov_G_E(sorb, 1, delta);
      break;

   case 0x89: /* MOV Gv,Ev */
      delta = dis_mov_G_E(sorb, sz, delta);
      break;

   case 0x8A: /* MOV Eb,Gb */
      delta = dis_mov_E_G(sorb, 1, delta);
      break;

   case 0x8B: /* MOV Ev,Gv */
      delta = dis_mov_E_G(sorb, sz, delta);
      break;
13578
   case 0x8D: /* LEA M,Gv -- load effective address, reg form illegal */
      if (sz != 4)
         goto decode_failure;
      modrm = getIByte(delta);
      if (epartIsReg(modrm))
         goto decode_failure;
      /* NOTE!  this is the one place where a segment override prefix
         has no effect on the address calculation.  Therefore we pass
         zero instead of sorb here. */
      addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
      delta += alen;
      putIReg(sz, gregOfRM(modrm), mkexpr(addr));
      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
                            nameIReg(sz,gregOfRM(modrm)));
      break;
13594
   case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
      delta = dis_mov_Sw_Ew(sorb, sz, delta);
      break;

   case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
      delta = dis_mov_Ew_Sw(sorb, delta);
      break;
13602
   case 0xA0: /* MOV Ob,AL -- load from absolute 32-bit offset */
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX */
      d32 = getUDisp32(delta); delta += 4;
      ty = szToITy(sz);
      addr = newTemp(Ity_I32);
      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
      putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
      DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
                                d32, nameIReg(sz,R_EAX));
      break;
13615
   case 0xA2: /* MOV AL,Ob -- store AL to absolute 32-bit offset */
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov */
      d32 = getUDisp32(delta); delta += 4;
      ty = szToITy(sz);
      addr = newTemp(Ity_I32);
      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
      storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
      DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
                                sorbTxt(sorb), d32);
      break;
13628
   /* Byte register encoded in low 3 opcode bits (opc - 0xB0). */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      d32 = getIByte(delta); delta += 1;
      putIReg(1, opc-0xB0, mkU8(d32));
      DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
      break;
13641
   /* 16/32-bit register encoded in low 3 opcode bits (opc - 0xB8). */
   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      d32 = getUDisp(sz,delta); delta += sz;
      putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
      DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
      break;
13654
   case 0xC6: /* C6 /0 = MOV Ib,Eb */
      sz = 1;
      goto maybe_do_Mov_I_E;
   case 0xC7: /* C7 /0 = MOV Iv,Ev */
      goto maybe_do_Mov_I_E;

   maybe_do_Mov_I_E:
      /* Only the /0 form (greg field == 0) is a valid MOV; any other
         greg value is an undefined encoding. */
      modrm = getIByte(delta);
      if (gregOfRM(modrm) == 0) {
         if (epartIsReg(modrm)) {
            delta++; /* mod/rm byte */
            d32 = getUDisp(sz,delta); delta += sz;
            putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
            DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
                                     nameIReg(sz,eregOfRM(modrm)));
         } else {
            /* Immediate follows the addressing-mode bytes. */
            addr = disAMode ( &alen, sorb, delta, dis_buf );
            delta += alen;
            d32 = getUDisp(sz,delta); delta += sz;
            storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
            DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
         }
         break;
      }
      goto decode_failure;
13680
13681   /* ------------------------ opl imm, A ----------------- */
13682
   /* dis_op_imm_A args, per the call pattern here:
      (size, carry-in for ADC/SBB, underlying 8-bit-sized op,
      keep-result flag, delta, mnemonic).  CMP and TEST pass
      keep=False: flags are set, destination is not written. */
   case 0x04: /* ADD Ib, AL */
      delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
      break;
   case 0x05: /* ADD Iv, eAX */
      delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
      break;

   case 0x0C: /* OR Ib, AL */
      delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
      break;
   case 0x0D: /* OR Iv, eAX */
      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
      break;

   case 0x14: /* ADC Ib, AL */
      delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
      break;
   case 0x15: /* ADC Iv, eAX */
      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
      break;

   case 0x1C: /* SBB Ib, AL */
      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
      break;
   case 0x1D: /* SBB Iv, eAX */
      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
      break;

   case 0x24: /* AND Ib, AL */
      delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
      break;
   case 0x25: /* AND Iv, eAX */
      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
      break;

   case 0x2C: /* SUB Ib, AL */
      delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
      break;
   case 0x2D: /* SUB Iv, eAX */
      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
      break;

   case 0x34: /* XOR Ib, AL */
      delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
      break;
   case 0x35: /* XOR Iv, eAX */
      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
      break;

   case 0x3C: /* CMP Ib, AL */
      delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
      break;
   case 0x3D: /* CMP Iv, eAX */
      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
      break;

   case 0xA8: /* TEST Ib, AL */
      delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
      break;
   case 0xA9: /* TEST Iv, eAX */
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      break;
13745
13746   /* ------------------------ opl Ev, Gv ----------------- */
13747
   /* reg/mem source -> register destination forms.  Same argument
      pattern as the imm->A group: carry-in is True only for ADC/SBB,
      keep-result is False only for CMP/TEST. */
   case 0x02: /* ADD Eb,Gb */
      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
      break;
   case 0x03: /* ADD Ev,Gv */
      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
      break;

   case 0x0A: /* OR Eb,Gb */
      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
      break;
   case 0x0B: /* OR Ev,Gv */
      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
      break;

   case 0x12: /* ADC Eb,Gb */
      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
      break;
   case 0x13: /* ADC Ev,Gv */
      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
      break;

   case 0x1A: /* SBB Eb,Gb */
      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
      break;
   case 0x1B: /* SBB Ev,Gv */
      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
      break;

   case 0x22: /* AND Eb,Gb */
      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
      break;
   case 0x23: /* AND Ev,Gv */
      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
      break;

   case 0x2A: /* SUB Eb,Gb */
      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
      break;
   case 0x2B: /* SUB Ev,Gv */
      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
      break;

   case 0x32: /* XOR Eb,Gb */
      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
      break;
   case 0x33: /* XOR Ev,Gv */
      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
      break;

   case 0x3A: /* CMP Eb,Gb */
      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
      break;
   case 0x3B: /* CMP Ev,Gv */
      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
      break;

   case 0x84: /* TEST Eb,Gb */
      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
      break;
   case 0x85: /* TEST Ev,Gv */
      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
      break;
13810
13811   /* ------------------------ opl Gv, Ev ----------------- */
13812
   /* register source -> reg/mem destination forms.  These take the
      LOCK-prefix flag since the destination may be memory. */
   case 0x00: /* ADD Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Add8, True, 1, delta, "add" );
      break;
   case 0x01: /* ADD Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Add8, True, sz, delta, "add" );
      break;

   case 0x08: /* OR Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Or8, True, 1, delta, "or" );
      break;
   case 0x09: /* OR Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Or8, True, sz, delta, "or" );
      break;

   case 0x10: /* ADC Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, True,
                            Iop_Add8, True, 1, delta, "adc" );
      break;
   case 0x11: /* ADC Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, True,
                            Iop_Add8, True, sz, delta, "adc" );
      break;

   case 0x18: /* SBB Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, True,
                            Iop_Sub8, True, 1, delta, "sbb" );
      break;
   case 0x19: /* SBB Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, True,
                            Iop_Sub8, True, sz, delta, "sbb" );
      break;

   case 0x20: /* AND Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_And8, True, 1, delta, "and" );
      break;
   case 0x21: /* AND Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_And8, True, sz, delta, "and" );
      break;

   case 0x28: /* SUB Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Sub8, True, 1, delta, "sub" );
      break;
   case 0x29: /* SUB Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Sub8, True, sz, delta, "sub" );
      break;

   case 0x30: /* XOR Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Xor8, True, 1, delta, "xor" );
      break;
   case 0x31: /* XOR Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Xor8, True, sz, delta, "xor" );
      break;

   case 0x38: /* CMP Gb,Eb */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Sub8, False, 1, delta, "cmp" );
      break;
   case 0x39: /* CMP Gv,Ev */
      delta = dis_op2_G_E ( sorb, pfx_lock, False,
                            Iop_Sub8, False, sz, delta, "cmp" );
      break;
13884
13885   /* ------------------------ POP ------------------------ */
13886
   case 0x58: /* POP eAX */
   case 0x59: /* POP eCX */
   case 0x5A: /* POP eDX */
   case 0x5B: /* POP eBX */
   case 0x5D: /* POP eBP */
   case 0x5E: /* POP eSI */
   case 0x5F: /* POP eDI */
   case 0x5C: /* POP eSP */
      /* Load from the old %esp (t2), bump %esp by the operand size,
         then write the register.  Doing the load from t2 makes
         "pop %esp" receive the loaded value, as required. */
      vassert(sz == 2 || sz == 4);
      t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
      assign(t2, getIReg(4, R_ESP));
      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
      putIReg(sz, opc-0x58, mkexpr(t1));
      DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
      break;
13903
   case 0x9D: /* POPF */
      vassert(sz == 2 || sz == 4);
      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
      assign(t2, getIReg(4, R_ESP));
      /* A 16-bit pop is zero-widened to 32 before the flags are
         dismantled. */
      assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));

      /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
         value in t1. */
      set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
                                 ((Addr32)guest_EIP_bbstart)+delta );

      DIP("popf%c\n", nameISize(sz));
      break;
13918
   case 0x61: /* POPA */
      /* This is almost certainly wrong for sz==2.  So ... */
      if (sz != 4) goto decode_failure;

      /* t5 is the old %ESP value. */
      t5 = newTemp(Ity_I32);
      assign( t5, getIReg(4, R_ESP) );

      /* Reload all the registers, except %esp, from the stack frame
         laid out by PUSHA (EDI lowest, EAX highest). */
      putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
      putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
      putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
      putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
      /* ignore saved %ESP (at offset 12) */
      putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
      putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
      putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));

      /* and move %ESP back up */
      putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );

      DIP("popa%c\n", nameISize(sz));
      break;
13942
   case 0x8F: /* POPL/POPW m32 */
     { Int    len;
       UChar  rm = getIByte(delta);

       /* make sure this instruction is correct POP */
       if (epartIsReg(rm) || gregOfRM(rm) != 0)
          goto decode_failure;
       /* and has correct size */
       if (sz != 4 && sz != 2)
          goto decode_failure;
       ty = szToITy(sz);

       t1 = newTemp(Ity_I32); /* stack address */
       t3 = newTemp(ty); /* data */
       /* set t1 to ESP: t1 = ESP */
       assign( t1, getIReg(4, R_ESP) );
       /* load M[ESP] to virtual register t3: t3 = M[t1] */
       assign( t3, loadLE(ty, mkexpr(t1)) );

       /* increase ESP; must be done before the STORE.  Intel manual says:
            If the ESP register is used as a base register for addressing
            a destination operand in memory, the POP instruction computes
            the effective address of the operand after it increments the
            ESP register.
       */
       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );

       /* resolve MODR/M */
       addr = disAMode ( &len, sorb, delta, dis_buf);
       storeLE( mkexpr(addr), mkexpr(t3) );

       DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);

       delta += len;
       break;
     }
13979
   /* Segment-register pops; %cs cannot be popped. */
   case 0x1F: /* POP %DS */
      dis_pop_segreg( R_DS, sz ); break;
   case 0x07: /* POP %ES */
      dis_pop_segreg( R_ES, sz ); break;
   case 0x17: /* POP %SS */
      dis_pop_segreg( R_SS, sz ); break;
13986
13987   /* ------------------------ PUSH ----------------------- */
13988
   case 0x50: /* PUSH eAX */
   case 0x51: /* PUSH eCX */
   case 0x52: /* PUSH eDX */
   case 0x53: /* PUSH eBX */
   case 0x55: /* PUSH eBP */
   case 0x56: /* PUSH eSI */
   case 0x57: /* PUSH eDI */
   case 0x54: /* PUSH eSP */
      /* This is the Right Way, in that the value to be pushed is
         established before %esp is changed, so that pushl %esp
         correctly pushes the old value. */
      vassert(sz == 2 || sz == 4);
      ty = sz==2 ? Ity_I16 : Ity_I32;
      t1 = newTemp(ty); t2 = newTemp(Ity_I32);
      assign(t1, getIReg(sz, opc-0x50));
      assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
      putIReg(4, R_ESP, mkexpr(t2) );
      storeLE(mkexpr(t2),mkexpr(t1));
      DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
      break;
14009
14010
   case 0x68: /* PUSH Iv */
      d32 = getUDisp(sz,delta); delta += sz;
      goto do_push_I;
   case 0x6A: /* PUSH Ib, sign-extended to sz */
      d32 = getSDisp8(delta); delta += 1;
      goto do_push_I;
   do_push_I:
      ty = szToITy(sz);
      t1 = newTemp(Ity_I32); t2 = newTemp(ty);
      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
      putIReg(4, R_ESP, mkexpr(t1) );
      /* stop mkU16 asserting if d32 is a negative 16-bit number
         (bug #132813) */
      if (ty == Ity_I16)
         d32 &= 0xFFFF;
      storeLE( mkexpr(t1), mkU(ty,d32) );
      DIP("push%c $0x%x\n", nameISize(sz), d32);
      break;
14029
   case 0x9C: /* PUSHF */ {
      vassert(sz == 2 || sz == 4);

      t1 = newTemp(Ity_I32);
      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
      putIReg(4, R_ESP, mkexpr(t1) );

      /* Calculate OSZACP, and patch in fixed fields as per
         Intel docs.
         - bit 1 is always 1
         - bit 9 is Interrupt Enable (should always be 1 in user mode?)
      */
      t2 = newTemp(Ity_I32);
      assign( t2, binop(Iop_Or32,
                        mk_x86g_calculate_eflags_all(),
                        mkU32( (1<<1)|(1<<9) ) ));

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]. */
      t3 = newTemp(Ity_I32);
      assign( t3, binop(Iop_Or32,
                        mkexpr(t2),
                        binop(Iop_And32,
                              IRExpr_Get(OFFB_DFLAG,Ity_I32),
                              mkU32(1<<10)))
            );

      /* And patch in the ID flag (guest state stores it as 0/1, so
         shift up to bit 21 then mask). */
      t4 = newTemp(Ity_I32);
      assign( t4, binop(Iop_Or32,
                        mkexpr(t3),
                        binop(Iop_And32,
                              binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
                                               mkU8(21)),
                              mkU32(1<<21)))
            );

      /* And patch in the AC flag (same scheme, bit 18). */
      t5 = newTemp(Ity_I32);
      assign( t5, binop(Iop_Or32,
                        mkexpr(t4),
                        binop(Iop_And32,
                              binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
                                               mkU8(18)),
                              mkU32(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed. */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      break;
   }
14086
   case 0x60: /* PUSHA */
      /* This is almost certainly wrong for sz==2.  So ... */
      if (sz != 4) goto decode_failure;

      /* This is the Right Way, in that the value to be pushed is
         established before %esp is changed, so that pusha
         correctly pushes the old %esp value.  New value of %esp is
         pushed at start. */
      /* t0 is the %ESP value we're going to push. */
      t0 = newTemp(Ity_I32);
      assign( t0, getIReg(4, R_ESP) );

      /* t5 will be the new %ESP value. */
      t5 = newTemp(Ity_I32);
      assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );

      /* Update guest state before prodding memory. */
      putIReg(4, R_ESP, mkexpr(t5));

      /* Dump all the registers; layout matches what POPA reloads. */
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );

      DIP("pusha%c\n", nameISize(sz));
      break;
14118
   /* PUSH of a segment register; dis_push_segreg emits the store. */
   case 0x0E: /* PUSH %CS */
      dis_push_segreg( R_CS, sz ); break;
   case 0x1E: /* PUSH %DS */
      dis_push_segreg( R_DS, sz ); break;
   case 0x06: /* PUSH %ES */
      dis_push_segreg( R_ES, sz ); break;
   case 0x16: /* PUSH %SS */
      dis_push_segreg( R_SS, sz ); break;

   /* ------------------------ SCAS et al ----------------- */

   /* Un-REP'd string ops.  The even opcode of each pair is the byte
      variant (size 1); the odd opcode uses the current operand size
      sz.  Segment overrides are rejected up front in each case. */

   case 0xA4: /* MOVS, no REP prefix */
   case 0xA5:
      if (sorb != 0)
         goto decode_failure; /* else dis_string_op asserts */
      dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
      break;

   case 0xA6: /* CMPSb, no REP prefix */
   case 0xA7:
      if (sorb != 0)
         goto decode_failure; /* else dis_string_op asserts */
      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
      break;

   case 0xAA: /* STOS, no REP prefix */
   case 0xAB:
      if (sorb != 0)
         goto decode_failure; /* else dis_string_op asserts */
      dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
      break;

   case 0xAC: /* LODS, no REP prefix */
   case 0xAD:
      if (sorb != 0)
         goto decode_failure; /* else dis_string_op asserts */
      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
      break;

   case 0xAE: /* SCAS, no REP prefix */
   case 0xAF:
      if (sorb != 0)
         goto decode_failure; /* else dis_string_op asserts */
      dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
      break;
14164
14165
   case 0xFC: /* CLD */
      /* DFLAG holds the string-op increment: +1 = forward. */
      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
      DIP("cld\n");
      break;

   case 0xFD: /* STD */
      /* -1 (0xFFFFFFFF) = backward. */
      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
      DIP("std\n");
      break;

   case 0xF8: /* CLC */
   case 0xF9: /* STC */
   case 0xF5: /* CMC */
      /* All three: force all eflags to be computed (t0), tweak the C
         bit (t1), then install t1 as a COPY-thunk. */
      t0 = newTemp(Ity_I32);
      t1 = newTemp(Ity_I32);
      assign( t0, mk_x86g_calculate_eflags_all() );
      switch (opc) {
         case 0xF8:
            assign( t1, binop(Iop_And32, mkexpr(t0),
                                         mkU32(~X86G_CC_MASK_C)));
            DIP("clc\n");
            break;
         case 0xF9:
            assign( t1, binop(Iop_Or32, mkexpr(t0),
                                        mkU32(X86G_CC_MASK_C)));
            DIP("stc\n");
            break;
         case 0xF5:
            assign( t1, binop(Iop_Xor32, mkexpr(t0),
                                         mkU32(X86G_CC_MASK_C)));
            DIP("cmc\n");
            break;
         default:
            vpanic("disInstr(x86)(clc/stc/cmc)");
      }
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
      /* Set NDEP even though it isn't used.  This makes redundant-PUT
         elimination of previous stores to this field work better. */
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
      break;

   case 0xD6: /* SALC */
      /* AL := CF ? 0xFF : 0x00, done by putting the carry bit in bit
         31 (shl 31) and arithmetically shifting it back (sar 31). */
      t0 = newTemp(Ity_I32);
      t1 = newTemp(Ity_I32);
      assign( t0,  binop(Iop_And32,
                         mk_x86g_calculate_eflags_c(),
                         mkU32(1)) );
      assign( t1, binop(Iop_Sar32,
                        binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
                        mkU8(31)) );
      putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
      DIP("salc\n");
      break;
14221
14222   /* REPNE prefix insn */
   case 0xF2: {
      /* eip_orig: address of this insn, so the generated IR can loop
         back to it on each REP iteration. */
      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
      if (sorb != 0) goto decode_failure;
      abyte = getIByte(delta); delta++;

      /* A following 0x66 (operand-size) prefix selects 16-bit ops. */
      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }

      switch (abyte) {
      /* According to the Intel manual, "repne movs" should never occur, but
       * in practice it has happened, so allow for it here... */
      case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
         /* fallthrough */
      case 0xA5:
         dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "repne movs" );
         break;

      case 0xA6: sz = 1;   /* REPNE CMP<sz> */
         /* fallthrough */
      case 0xA7:
         dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "repne cmps" );
         break;

      case 0xAA: sz = 1;   /* REPNE STOS<sz> */
         /* fallthrough */
      case 0xAB:
         dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "repne stos" );
         break;

      case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
         /* fallthrough */
      case 0xAF:
         dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "repne scas" );
         break;

      default:
         goto decode_failure;
      }
      break;
   }
14262
14263   /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
14264      for the rest, it means REP) */
   case 0xF3: {
      /* eip_orig: address of this insn, so the generated IR can loop
         back to it on each REP iteration. */
      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
      abyte = getIByte(delta); delta++;

      /* A following 0x66 (operand-size) prefix selects 16-bit ops. */
      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }

      /* Segment overrides are only tolerated for the 0x0F escape,
         where F3 acts as the TZCNT/LZCNT mandatory prefix rather
         than as REP. */
      if (sorb != 0 && abyte != 0x0F) goto decode_failure;

      switch (abyte) {
      case 0x0F:
         switch (getIByte(delta)) {
         /* On older CPUs, TZCNT behaves the same as BSF.  */
         case 0xBC: /* REP BSF Gv,Ev */
            delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
            break;
         /* On older CPUs, LZCNT behaves the same as BSR.  */
         case 0xBD: /* REP BSR Gv,Ev */
            delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
            break;
         default:
            goto decode_failure;
         }
         break;

      case 0xA4: sz = 1;   /* REP MOVS<sz> */
         /* fallthrough */
      case 0xA5:
         dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "rep movs" );
         break;

      case 0xA6: sz = 1;   /* REPE CMP<sz> */
         /* fallthrough */
      case 0xA7:
         dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "repe cmps" );
         break;

      case 0xAA: sz = 1;   /* REP STOS<sz> */
         /* fallthrough */
      case 0xAB:
         dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "rep stos" );
         break;

      case 0xAC: sz = 1;   /* REP LODS<sz> */
         /* fallthrough */
      case 0xAD:
         dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "rep lods" );
         break;

      case 0xAE: sz = 1;   /* REPE SCAS<sz> */
         /* fallthrough */
      case 0xAF:
         dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
                             guest_EIP_bbstart+delta, "repe scas" );
         break;

      case 0x90:           /* REP NOP (PAUSE) */
         /* a hint to the P4 re spin-wait loop */
         DIP("rep nop (P4 pause)\n");
         /* "observe" the hint.  The Vex client needs to be careful not
            to cause very long delays as a result, though. */
         jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
         vassert(dres.whatNext == Dis_StopHere);
         break;

      case 0xC3:           /* REP RET -- same as normal ret? */
         dis_ret(&dres, 0);
         DIP("rep ret\n");
         break;

      default:
         goto decode_failure;
      }
      break;
   }
14338
14339   /* ------------------------ XCHG ----------------------- */
14340
14341   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
14342      prefix; hence it must be translated with an IRCAS (at least, the
14343      memory variant). */
   case 0x86: /* XCHG Gb,Eb */
      sz = 1;
      /* Fall through ... */
   case 0x87: /* XCHG Gv,Ev */
      modrm = getIByte(delta);
      ty = szToITy(sz);
      t1 = newTemp(ty); t2 = newTemp(ty);
      if (epartIsReg(modrm)) {
         /* reg-reg form: plain swap, no atomicity concern. */
         assign(t1, getIReg(sz, eregOfRM(modrm)));
         assign(t2, getIReg(sz, gregOfRM(modrm)));
         putIReg(sz, gregOfRM(modrm), mkexpr(t1));
         putIReg(sz, eregOfRM(modrm), mkexpr(t2));
         delta++;
         DIP("xchg%c %s, %s\n",
             nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
                            nameIReg(sz,eregOfRM(modrm)));
      } else {
         /* reg-mem form: implicitly LOCKed on real hardware (see the
            comment above this case), hence the CAS-based translation. */
         *expect_CAS = True;
         addr = disAMode ( &alen, sorb, delta, dis_buf );
         assign( t1, loadLE(ty,mkexpr(addr)) );
         assign( t2, getIReg(sz,gregOfRM(modrm)) );
         casLE( mkexpr(addr),
                mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
         putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
         delta += alen;
         DIP("xchg%c %s, %s\n", nameISize(sz),
                                nameIReg(sz,gregOfRM(modrm)), dis_buf);
      }
      break;

   case 0x90: /* XCHG eAX,eAX */
      /* Canonical NOP encoding. */
      DIP("nop\n");
      break;
   case 0x91: /* XCHG eAX,eCX */
   case 0x92: /* XCHG eAX,eDX */
   case 0x93: /* XCHG eAX,eBX */
   case 0x94: /* XCHG eAX,eSP */
   case 0x95: /* XCHG eAX,eBP */
   case 0x96: /* XCHG eAX,eSI */
   case 0x97: /* XCHG eAX,eDI */
      /* Register index is encoded in the low 3 opcode bits. */
      codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
      break;
14386
14387   /* ------------------------ XLAT ----------------------- */
14388
   case 0xD7: /* XLAT */
      if (sz != 4) goto decode_failure; /* sz == 2 is also allowed (0x66) */
      /* AL := mem8[seg:EBX + zero-extend(AL)] */
      putIReg(
         1,
         R_EAX/*AL*/,
         loadLE(Ity_I8,
                handleSegOverride(
                   sorb,
                   binop(Iop_Add32,
                         getIReg(4, R_EBX),
                         unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));

      DIP("xlat%c [ebx]\n", nameISize(sz));
      break;
14403
14404   /* ------------------------ IN / OUT ----------------------- */
14405
   /* The four IN forms all set up t1 = zero-extended 32-bit port
      number and sz = transfer width, then share do_IN below. */
   case 0xE4: /* IN imm8, AL */
      sz = 1;
      t1 = newTemp(Ity_I32);
      abyte = getIByte(delta); delta++;
      assign(t1, mkU32( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
      goto do_IN;
   case 0xE5: /* IN imm8, eAX */
      vassert(sz == 2 || sz == 4);
      t1 = newTemp(Ity_I32);
      abyte = getIByte(delta); delta++;
      assign(t1, mkU32( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
      goto do_IN;
   case 0xEC: /* IN %DX, AL */
      sz = 1;
      t1 = newTemp(Ity_I32);
      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
                                         nameIReg(sz,R_EAX));
      goto do_IN;
   case 0xED: /* IN %DX, eAX */
      vassert(sz == 2 || sz == 4);
      t1 = newTemp(Ity_I32);
      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
                                         nameIReg(sz,R_EAX));
      goto do_IN;
   do_IN: {
      /* At this point, sz indicates the width, and t1 is a 32-bit
         value giving port number.  Port I/O cannot be expressed in
         pure IR, so delegate to a dirty helper. */
      IRDirty* d;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      t2 = newTemp(Ity_I32);
      d = unsafeIRDirty_1_N(
             t2,
             0/*regparms*/,
             "x86g_dirtyhelper_IN",
             &x86g_dirtyhelper_IN,
             mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
          );
      /* do the call, dumping the result in t2. */
      stmt( IRStmt_Dirty(d) );
      putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
      break;
   }
14453
   /* The four OUT forms mirror IN: t1 = 32-bit port number, sz =
      transfer width, shared do_OUT tail. */
   case 0xE6: /* OUT AL, imm8 */
      sz = 1;
      t1 = newTemp(Ity_I32);
      abyte = getIByte(delta); delta++;
      assign( t1, mkU32( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
      goto do_OUT;
   case 0xE7: /* OUT eAX, imm8 */
      vassert(sz == 2 || sz == 4);
      t1 = newTemp(Ity_I32);
      abyte = getIByte(delta); delta++;
      assign( t1, mkU32( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
      goto do_OUT;
   case 0xEE: /* OUT AL, %DX */
      sz = 1;
      t1 = newTemp(Ity_I32);
      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
                                          nameIReg(2,R_EDX));
      goto do_OUT;
   case 0xEF: /* OUT eAX, %DX */
      vassert(sz == 2 || sz == 4);
      t1 = newTemp(Ity_I32);
      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
                                          nameIReg(2,R_EDX));
      goto do_OUT;
   do_OUT: {
      /* At this point, sz indicates the width, and t1 is a 32-bit
         value giving port number.  Like IN, done via a dirty helper;
         no result to collect, hence unsafeIRDirty_0_N. */
      IRDirty* d;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      d = unsafeIRDirty_0_N(
             0/*regparms*/,
             "x86g_dirtyhelper_OUT",
             &x86g_dirtyhelper_OUT,
             mkIRExprVec_3( mkexpr(t1),
                            widenUto32( getIReg(sz, R_EAX) ),
                            mkU32(sz) )
          );
      stmt( IRStmt_Dirty(d) );
      break;
   }
14499
14500   /* ------------------------ (Grp1 extensions) ---------- */
14501
   case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
                 case 0x80, but only in 32-bit mode. */
      /* fallthru */
   case 0x80: /* Grp1 Ib,Eb */
      /* d_sz = immediate width in bytes, sz = operand size. */
      modrm = getIByte(delta);
      am_sz = lengthAMode(delta);
      sz    = 1;
      d_sz  = 1;
      d32   = getUChar(delta + am_sz);
      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
      break;

   case 0x81: /* Grp1 Iv,Ev */
      /* Full-width immediate (2 or 4 bytes, per sz). */
      modrm = getIByte(delta);
      am_sz = lengthAMode(delta);
      d_sz  = sz;
      d32   = getUDisp(d_sz, delta + am_sz);
      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
      break;

   case 0x83: /* Grp1 Ib,Ev */
      /* Sign-extended byte immediate applied at operand size sz. */
      modrm = getIByte(delta);
      am_sz = lengthAMode(delta);
      d_sz  = 1;
      d32   = getSDisp8(delta + am_sz);
      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
      break;
14529
14530   /* ------------------------ (Grp2 extensions) ---------- */
14531
   /* Grp2 = shifts/rotates.  Variants differ in shift-amount source
      (imm8, the constant 1, or %cl) and operand size (Eb vs Ev).
      d_sz = immediate length in bytes (0 when the amount is implicit
      or comes from %cl). */
   case 0xC0: { /* Grp2 Ib,Eb */
      Bool decode_OK = True;
      modrm = getIByte(delta);
      am_sz = lengthAMode(delta);
      d_sz  = 1;
      d32   = getUChar(delta + am_sz);
      sz    = 1;
      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d32 & 0xFF), NULL, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
   case 0xC1: { /* Grp2 Ib,Ev */
      Bool decode_OK = True;
      modrm = getIByte(delta);
      am_sz = lengthAMode(delta);
      d_sz  = 1;
      d32   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d32 & 0xFF), NULL, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
   case 0xD0: { /* Grp2 1,Eb */
      Bool decode_OK = True;
      modrm = getIByte(delta);
      am_sz = lengthAMode(delta);
      d_sz  = 0;
      d32   = 1;
      sz    = 1;
      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d32), NULL, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
   case 0xD1: { /* Grp2 1,Ev */
      Bool decode_OK = True;
      /* NOTE(review): getUChar vs getIByte appear to be used
         interchangeably for the modrm fetch in these cases. */
      modrm = getUChar(delta);
      am_sz = lengthAMode(delta);
      d_sz  = 0;
      d32   = 1;
      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d32), NULL, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
   case 0xD2: { /* Grp2 CL,Eb */
      Bool decode_OK = True;
      modrm = getUChar(delta);
      am_sz = lengthAMode(delta);
      d_sz  = 0;
      sz    = 1;
      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
                         getIReg(1,R_ECX), "%cl", &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
   case 0xD3: { /* Grp2 CL,Ev */
      Bool decode_OK = True;
      modrm = getIByte(delta);
      am_sz = lengthAMode(delta);
      d_sz  = 0;
      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
                         getIReg(1,R_ECX), "%cl", &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
14605
14606   /* ------------------------ (Grp3 extensions) ---------- */
14607
   case 0xF6: { /* Grp3 Eb */
      /* TEST/NOT/NEG/MUL/IMUL/DIV/IDIV on a byte operand. */
      Bool decode_OK = True;
      delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
   case 0xF7: { /* Grp3 Ev */
      /* Same group at operand size sz (2 or 4). */
      Bool decode_OK = True;
      delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }

   /* ------------------------ (Grp4 extensions) ---------- */

   case 0xFE: { /* Grp4 Eb */
      /* INC/DEC on a byte operand. */
      Bool decode_OK = True;
      delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }

   /* ------------------------ (Grp5 extensions) ---------- */

   case 0xFF: { /* Grp5 Ev */
      /* INC/DEC/CALL/JMP/PUSH; may end the block, hence &dres. */
      Bool decode_OK = True;
      delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
      if (!decode_OK)
         goto decode_failure;
      break;
   }
14642
14643   /* ------------------------ Escapes to 2-byte opcodes -- */
14644
14645   case 0x0F: {
14646      opc = getIByte(delta); delta++;
14647      switch (opc) {
14648
14649      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
14650
      case 0xBA: { /* Grp8 Ib,Ev */
         /* BT/BTS/BTR/BTC with an immediate bit index. */
         Bool decode_OK = False;
         modrm = getUChar(delta);
         am_sz = lengthAMode(delta);
         d32   = getSDisp8(delta + am_sz);
         delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
                                am_sz, sz, d32, &decode_OK );
         if (!decode_OK)
            goto decode_failure;
         break;
      }
14662
14663      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
14664
      case 0xBC: /* BSF Gv,Ev */
         delta = dis_bs_E_G ( sorb, sz, delta, True );
         break;
      case 0xBD: /* BSR Gv,Ev */
         delta = dis_bs_E_G ( sorb, sz, delta, False );
         break;

      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */

      case 0xC8: /* BSWAP %eax */
      case 0xC9:
      case 0xCA:
      case 0xCB:
      case 0xCC:
      case 0xCD:
      case 0xCE:
      case 0xCF: /* BSWAP %edi */
         /* AFAICS from the Intel docs, this only exists at size 4. */
         if (sz != 4) goto decode_failure;

         /* Register index is in the low 3 opcode bits (opc - 0xC8). */
         t1 = newTemp(Ity_I32);
         assign( t1, getIReg(4, opc-0xC8) );
         t2 = math_BSWAP(t1, Ity_I32);

         putIReg(4, opc-0xC8, mkexpr(t2));
         DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
         break;

      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */

      /* Bit-test family with a register bit index; the BtOp* tag
         selects test-only / reset / set / complement. */
      case 0xA3: /* BT Gv,Ev */
         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
         break;
      case 0xB3: /* BTR Gv,Ev */
         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
         break;
      case 0xAB: /* BTS Gv,Ev */
         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
         break;
      case 0xBB: /* BTC Gv,Ev */
         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
         break;
14707
14708      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
14709
      /* Condition code is encoded in the low nibble (opc - 0x40),
         matching the X86Condcode enumeration. */
      case 0x40:
      case 0x41:
      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
      case 0x48: /* CMOVSb (cmov negative) */
      case 0x49: /* CMOVSb (cmov not negative) */
      case 0x4A: /* CMOVP (cmov parity even) */
      case 0x4B: /* CMOVNP (cmov parity odd) */
      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
         delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
         break;

      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */

      case 0xB0: /* CMPXCHG Gb,Eb */
         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
         break;
      case 0xB1: /* CMPXCHG Gv,Ev */
         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
         break;
14737
      case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
         IRTemp expdHi    = newTemp(Ity_I32);
         IRTemp expdLo    = newTemp(Ity_I32);
         IRTemp dataHi    = newTemp(Ity_I32);
         IRTemp dataLo    = newTemp(Ity_I32);
         IRTemp oldHi     = newTemp(Ity_I32);
         IRTemp oldLo     = newTemp(Ity_I32);
         IRTemp flags_old = newTemp(Ity_I32);
         IRTemp flags_new = newTemp(Ity_I32);
         IRTemp success   = newTemp(Ity_I1);

         /* Translate this using a DCAS, even if there is no LOCK
            prefix.  Life is too short to bother with generating two
            different translations for the with/without-LOCK-prefix
            cases. */
         *expect_CAS = True;

         /* Decode, and generate address.  Reject register operands
            and require the /1 opcode extension in the reg field. */
         if (sz != 4) goto decode_failure;
         modrm = getIByte(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         if (gregOfRM(modrm) != 1) goto decode_failure;
         addr = disAMode ( &alen, sorb, delta, dis_buf );
         delta += alen;

         /* Get the expected (EDX:EAX) and new (ECX:EBX) values. */
         assign( expdHi, getIReg(4,R_EDX) );
         assign( expdLo, getIReg(4,R_EAX) );
         assign( dataHi, getIReg(4,R_ECX) );
         assign( dataLo, getIReg(4,R_EBX) );

         /* Do the DCAS */
         stmt( IRStmt_CAS(
                  mkIRCAS( oldHi, oldLo,
                           Iend_LE, mkexpr(addr),
                           mkexpr(expdHi), mkexpr(expdLo),
                           mkexpr(dataHi), mkexpr(dataLo)
               )));

         /* success when oldHi:oldLo == expdHi:expdLo */
         assign( success,
                 binop(Iop_CasCmpEQ32,
                       binop(Iop_Or32,
                             binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
                             binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
                       ),
                       mkU32(0)
                 ));

         /* If the DCAS is successful, that is to say oldHi:oldLo ==
            expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
            which is where they came from originally.  Both the actual
            contents of these two regs, and any shadow values, are
            unchanged.  If the DCAS fails then we're putting into
            EDX:EAX the value seen in memory. */
         putIReg(4, R_EDX,
                    IRExpr_ITE( mkexpr(success),
                                mkexpr(expdHi), mkexpr(oldHi)
                ));
         putIReg(4, R_EAX,
                    IRExpr_ITE( mkexpr(success),
                                mkexpr(expdLo), mkexpr(oldLo)
                ));

         /* Copy the success bit into the Z flag and leave the others
            unchanged */
         assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
         assign(
            flags_new,
            binop(Iop_Or32,
                  binop(Iop_And32, mkexpr(flags_old),
                                   mkU32(~X86G_CC_MASK_Z)),
                  binop(Iop_Shl32,
                        binop(Iop_And32,
                              unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
                        mkU8(X86G_CC_SHIFT_Z)) ));

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
         /* Set NDEP even though it isn't used.  This makes
            redundant-PUT elimination of previous stores to this field
            work better. */
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));

         /* Sheesh.  Aren't you glad it was me and not you that had to
            write and validate all this grunge? */

         DIP("cmpxchg8b %s\n", dis_buf);
         break;
      }
14829
14830      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
14831
      case 0xA2: { /* CPUID */
         /* Uses dirty helper:
               void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
            declared to mod eax, wr ebx, ecx, edx
            Helper is chosen by descending capability: SSE2, then
            SSE1, then MMXEXT, then baseline (no SSE). */
         IRDirty* d     = NULL;
         void*    fAddr = NULL;
         const HChar* fName = NULL;
         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
            fName = "x86g_dirtyhelper_CPUID_sse2";
            fAddr = &x86g_dirtyhelper_CPUID_sse2;
         }
         else
         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
            fName = "x86g_dirtyhelper_CPUID_sse1";
            fAddr = &x86g_dirtyhelper_CPUID_sse1;
         }
         else
         if (archinfo->hwcaps & VEX_HWCAPS_X86_MMXEXT) {
            fName = "x86g_dirtyhelper_CPUID_mmxext";
            fAddr = &x86g_dirtyhelper_CPUID_mmxext;
         }
         else
         if (archinfo->hwcaps == 0/*no SSE*/) {
            fName = "x86g_dirtyhelper_CPUID_sse0";
            fAddr = &x86g_dirtyhelper_CPUID_sse0;
         } else
            vpanic("disInstr(x86)(cpuid)");

         vassert(fName); vassert(fAddr);
         d = unsafeIRDirty_0_N ( 0/*regparms*/,
                                 fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
         /* declare guest state effects: EAX/ECX are read-modify
            (leaf/subleaf inputs), EBX/EDX are write-only. */
         d->nFxState = 4;
         vex_bzero(&d->fxState, sizeof(d->fxState));
         d->fxState[0].fx     = Ifx_Modify;
         d->fxState[0].offset = OFFB_EAX;
         d->fxState[0].size   = 4;
         d->fxState[1].fx     = Ifx_Write;
         d->fxState[1].offset = OFFB_EBX;
         d->fxState[1].size   = 4;
         d->fxState[2].fx     = Ifx_Modify;
         d->fxState[2].offset = OFFB_ECX;
         d->fxState[2].size   = 4;
         d->fxState[3].fx     = Ifx_Write;
         d->fxState[3].offset = OFFB_EDX;
         d->fxState[3].size   = 4;
         /* execute the dirty call, side-effecting guest state */
         stmt( IRStmt_Dirty(d) );
         /* CPUID is a serialising insn.  So, just in case someone is
            using it as a memory fence ... */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("cpuid\n");
         break;
      }
14887
14888//--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
14889//--             goto decode_failure;
14890//--
14891//--          t1 = newTemp(cb);
14892//--          t2 = newTemp(cb);
14893//--          t3 = newTemp(cb);
14894//--          t4 = newTemp(cb);
14895//--          uInstr0(cb, CALLM_S, 0);
14896//--
14897//--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
14898//--          uInstr1(cb, PUSH,  4, TempReg, t1);
14899//--
14900//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
14901//--          uLiteral(cb, 0);
14902//--          uInstr1(cb, PUSH,  4, TempReg, t2);
14903//--
14904//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
14905//--          uLiteral(cb, 0);
14906//--          uInstr1(cb, PUSH,  4, TempReg, t3);
14907//--
14908//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
14909//--          uLiteral(cb, 0);
14910//--          uInstr1(cb, PUSH,  4, TempReg, t4);
14911//--
14912//--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
14913//--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
14914//--
14915//--          uInstr1(cb, POP,   4, TempReg, t4);
14916//--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
14917//--
14918//--          uInstr1(cb, POP,   4, TempReg, t3);
14919//--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
14920//--
14921//--          uInstr1(cb, POP,   4, TempReg, t2);
14922//--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
14923//--
14924//--          uInstr1(cb, POP,   4, TempReg, t1);
14925//--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
14926//--
14927//--          uInstr0(cb, CALLM_E, 0);
14928//--          DIP("cpuid\n");
14929//--          break;
14930//--
14931      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
14932
14933      case 0xB6: /* MOVZXb Eb,Gv */
14934         if (sz != 2 && sz != 4)
14935            goto decode_failure;
14936         delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
14937         break;
14938
14939      case 0xB7: /* MOVZXw Ew,Gv */
14940         if (sz != 4)
14941            goto decode_failure;
14942         delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
14943         break;
14944
14945      case 0xBE: /* MOVSXb Eb,Gv */
14946         if (sz != 2 && sz != 4)
14947            goto decode_failure;
14948         delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
14949         break;
14950
14951      case 0xBF: /* MOVSXw Ew,Gv */
14952         if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
14953            goto decode_failure;
14954         delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
14955         break;
14956
14957//--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
14958//--
14959//--       case 0xC3: /* MOVNTI Gv,Ev */
14960//--          vg_assert(sz == 4);
14961//--          modrm = getUChar(eip);
14962//--          vg_assert(!epartIsReg(modrm));
14963//--          t1 = newTemp(cb);
14964//--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
14965//--          pair = disAMode ( cb, sorb, eip, dis_buf );
14966//--          t2 = LOW24(pair);
14967//--          eip += HI8(pair);
14968//--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
14969//--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
14970//--          break;
14971
14972      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
14973
14974      case 0xAF: /* IMUL Ev, Gv */
14975         delta = dis_mul_E_G ( sorb, sz, delta );
14976         break;
14977
14978      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
14979
14980      case 0x1F:
14981         modrm = getUChar(delta);
14982         if (epartIsReg(modrm)) goto decode_failure;
14983         addr = disAMode ( &alen, sorb, delta, dis_buf );
14984         delta += alen;
14985         DIP("nop%c %s\n", nameISize(sz), dis_buf);
14986         break;
14987
14988      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
14989      case 0x80:
14990      case 0x81:
14991      case 0x82: /* JBb/JNAEb (jump below) */
14992      case 0x83: /* JNBb/JAEb (jump not below) */
14993      case 0x84: /* JZb/JEb (jump zero) */
14994      case 0x85: /* JNZb/JNEb (jump not zero) */
14995      case 0x86: /* JBEb/JNAb (jump below or equal) */
14996      case 0x87: /* JNBEb/JAb (jump not below or equal) */
14997      case 0x88: /* JSb (jump negative) */
14998      case 0x89: /* JSb (jump not negative) */
14999      case 0x8A: /* JP (jump parity even) */
15000      case 0x8B: /* JNP/JPO (jump parity odd) */
15001      case 0x8C: /* JLb/JNGEb (jump less) */
15002      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
15003      case 0x8E: /* JLEb/JNGb (jump less or equal) */
15004      case 0x8F: /* JGb/JNLEb (jump greater) */
15005       { Int    jmpDelta;
15006         const HChar* comment  = "";
15007         jmpDelta = (Int)getUDisp32(delta);
15008         d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
15009         delta += 4;
15010         if (resteerCisOk
15011             && vex_control.guest_chase_cond
15012             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
15013             && jmpDelta < 0
15014             && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
15015            /* Speculation: assume this backward branch is taken.  So
15016               we need to emit a side-exit to the insn following this
15017               one, on the negation of the condition, and continue at
15018               the branch target address (d32).  If we wind up back at
15019               the first instruction of the trace, just stop; it's
15020               better to let the IR loop unroller handle that case.*/
15021            stmt( IRStmt_Exit(
15022                     mk_x86g_calculate_condition((X86Condcode)
15023                                                 (1 ^ (opc - 0x80))),
15024                     Ijk_Boring,
15025                     IRConst_U32(guest_EIP_bbstart+delta),
15026                     OFFB_EIP ) );
15027            dres.whatNext   = Dis_ResteerC;
15028            dres.continueAt = (Addr64)(Addr32)d32;
15029            comment = "(assumed taken)";
15030         }
15031         else
15032         if (resteerCisOk
15033             && vex_control.guest_chase_cond
15034             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
15035             && jmpDelta >= 0
15036             && resteerOkFn( callback_opaque,
15037                             (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
15038            /* Speculation: assume this forward branch is not taken.
15039               So we need to emit a side-exit to d32 (the dest) and
15040               continue disassembling at the insn immediately
15041               following this one. */
15042            stmt( IRStmt_Exit(
15043                     mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
15044                     Ijk_Boring,
15045                     IRConst_U32(d32),
15046                     OFFB_EIP ) );
15047            dres.whatNext   = Dis_ResteerC;
15048            dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
15049            comment = "(assumed not taken)";
15050         }
15051         else {
15052            /* Conservative default translation - end the block at
15053               this point. */
15054            jcc_01( &dres, (X86Condcode)(opc - 0x80),
15055                    (Addr32)(guest_EIP_bbstart+delta), d32);
15056            vassert(dres.whatNext == Dis_StopHere);
15057         }
15058         DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
15059         break;
15060       }
15061
15062      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
15063      case 0x31: { /* RDTSC */
15064         IRTemp   val  = newTemp(Ity_I64);
15065         IRExpr** args = mkIRExprVec_0();
15066         IRDirty* d    = unsafeIRDirty_1_N (
15067                            val,
15068                            0/*regparms*/,
15069                            "x86g_dirtyhelper_RDTSC",
15070                            &x86g_dirtyhelper_RDTSC,
15071                            args
15072                         );
15073         /* execute the dirty call, dumping the result in val. */
15074         stmt( IRStmt_Dirty(d) );
15075         putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
15076         putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
15077         DIP("rdtsc\n");
15078         break;
15079      }
15080
15081      /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
15082
15083      case 0xA1: /* POP %FS */
15084         dis_pop_segreg( R_FS, sz ); break;
15085      case 0xA9: /* POP %GS */
15086         dis_pop_segreg( R_GS, sz ); break;
15087
15088      case 0xA0: /* PUSH %FS */
15089         dis_push_segreg( R_FS, sz ); break;
15090      case 0xA8: /* PUSH %GS */
15091         dis_push_segreg( R_GS, sz ); break;
15092
15093      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
15094      case 0x90:
15095      case 0x91:
15096      case 0x92: /* set-Bb/set-NAEb (jump below) */
15097      case 0x93: /* set-NBb/set-AEb (jump not below) */
15098      case 0x94: /* set-Zb/set-Eb (jump zero) */
15099      case 0x95: /* set-NZb/set-NEb (jump not zero) */
15100      case 0x96: /* set-BEb/set-NAb (jump below or equal) */
15101      case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
15102      case 0x98: /* set-Sb (jump negative) */
      case 0x99: /* set-NSb (jump not negative) */
15104      case 0x9A: /* set-P (jump parity even) */
15105      case 0x9B: /* set-NP (jump parity odd) */
15106      case 0x9C: /* set-Lb/set-NGEb (jump less) */
15107      case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
15108      case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
15109      case 0x9F: /* set-Gb/set-NLEb (jump greater) */
15110         t1 = newTemp(Ity_I8);
15111         assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
15112         modrm = getIByte(delta);
15113         if (epartIsReg(modrm)) {
15114            delta++;
15115            putIReg(1, eregOfRM(modrm), mkexpr(t1));
15116            DIP("set%s %s\n", name_X86Condcode(opc-0x90),
15117                              nameIReg(1,eregOfRM(modrm)));
15118         } else {
15119           addr = disAMode ( &alen, sorb, delta, dis_buf );
15120           delta += alen;
15121           storeLE( mkexpr(addr), mkexpr(t1) );
15122           DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
15123         }
15124         break;
15125
15126      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
15127
15128      case 0xA4: /* SHLDv imm8,Gv,Ev */
15129         modrm = getIByte(delta);
15130         d32   = delta + lengthAMode(delta);
15131         vex_sprintf(dis_buf, "$%d", getIByte(d32));
15132         delta = dis_SHLRD_Gv_Ev (
15133                  sorb, delta, modrm, sz,
15134                  mkU8(getIByte(d32)), True, /* literal */
15135                  dis_buf, True );
15136         break;
15137      case 0xA5: /* SHLDv %cl,Gv,Ev */
15138         modrm = getIByte(delta);
15139         delta = dis_SHLRD_Gv_Ev (
15140                    sorb, delta, modrm, sz,
15141                    getIReg(1,R_ECX), False, /* not literal */
15142                    "%cl", True );
15143         break;
15144
15145      case 0xAC: /* SHRDv imm8,Gv,Ev */
15146         modrm = getIByte(delta);
15147         d32   = delta + lengthAMode(delta);
15148         vex_sprintf(dis_buf, "$%d", getIByte(d32));
15149         delta = dis_SHLRD_Gv_Ev (
15150                    sorb, delta, modrm, sz,
15151                    mkU8(getIByte(d32)), True, /* literal */
15152                    dis_buf, False );
15153         break;
15154      case 0xAD: /* SHRDv %cl,Gv,Ev */
15155         modrm = getIByte(delta);
15156         delta = dis_SHLRD_Gv_Ev (
15157                    sorb, delta, modrm, sz,
15158                    getIReg(1,R_ECX), False, /* not literal */
15159                    "%cl", False );
15160         break;
15161
15162      /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
15163
15164      case 0x34:
         /* Simple implementation needing a long explanation.
15166
15167            sysenter is a kind of syscall entry.  The key thing here
15168            is that the return address is not known -- that is
15169            something that is beyond Vex's knowledge.  So this IR
15170            forces a return to the scheduler, which can do what it
            likes to simulate the sysenter, but it MUST set this
15172            thread's guest_EIP field with the continuation address
15173            before resuming execution.  If that doesn't happen, the
15174            thread will jump to address zero, which is probably
15175            fatal.
15176         */
15177
15178         /* Note where we are, so we can back up the guest to this
15179            point if the syscall needs to be restarted. */
15180         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
15181                           mkU32(guest_EIP_curr_instr) ) );
15182         jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
15183         vassert(dres.whatNext == Dis_StopHere);
15184         DIP("sysenter");
15185         break;
15186
15187      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
15188
15189      case 0xC0: { /* XADD Gb,Eb */
15190         Bool decodeOK;
15191         delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
15192         if (!decodeOK) goto decode_failure;
15193         break;
15194      }
15195      case 0xC1: { /* XADD Gv,Ev */
15196         Bool decodeOK;
15197         delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
15198         if (!decodeOK) goto decode_failure;
15199         break;
15200      }
15201
15202      /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
15203
15204      case 0x71:
15205      case 0x72:
15206      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
15207
15208      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
15209      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
15210      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
15211      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
15212
15213      case 0xFC:
15214      case 0xFD:
15215      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
15216
15217      case 0xEC:
15218      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
15219
15220      case 0xDC:
15221      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15222
15223      case 0xF8:
15224      case 0xF9:
15225      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
15226
15227      case 0xE8:
15228      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
15229
15230      case 0xD8:
15231      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15232
15233      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
15234      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
15235
15236      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
15237
15238      case 0x74:
15239      case 0x75:
15240      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
15241
15242      case 0x64:
15243      case 0x65:
15244      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
15245
15246      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
15247      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
15248      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
15249
15250      case 0x68:
15251      case 0x69:
15252      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
15253
15254      case 0x60:
15255      case 0x61:
15256      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
15257
15258      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
15259      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
15260      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
15261      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
15262
15263      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
15264      case 0xF2:
15265      case 0xF3:
15266
15267      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
15268      case 0xD2:
15269      case 0xD3:
15270
15271      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
15272      case 0xE2:
15273      {
15274         Int  delta0    = delta-1;
15275         Bool decode_OK = False;
15276
15277         /* If sz==2 this is SSE, and we assume sse idec has
15278            already spotted those cases by now. */
15279         if (sz != 4)
15280            goto decode_failure;
15281
15282         delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
15283         if (!decode_OK) {
15284            delta = delta0;
15285            goto decode_failure;
15286         }
15287         break;
15288      }
15289
15290      case 0x0E: /* FEMMS */
15291      case 0x77: /* EMMS */
15292         if (sz != 4)
15293            goto decode_failure;
15294         do_EMMS_preamble();
15295         DIP("{f}emms\n");
15296         break;
15297
15298      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
15299      case 0x01: /* 0F 01 /0 -- SGDT */
15300                 /* 0F 01 /1 -- SIDT */
15301      {
15302          /* This is really revolting, but ... since each processor
15303             (core) only has one IDT and one GDT, just let the guest
15304             see it (pass-through semantics).  I can't see any way to
15305             construct a faked-up value, so don't bother to try. */
15306         modrm = getUChar(delta);
15307         addr = disAMode ( &alen, sorb, delta, dis_buf );
15308         delta += alen;
15309         if (epartIsReg(modrm)) goto decode_failure;
15310         if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
15311            goto decode_failure;
15312         switch (gregOfRM(modrm)) {
15313            case 0: DIP("sgdt %s\n", dis_buf); break;
15314            case 1: DIP("sidt %s\n", dis_buf); break;
15315            default: vassert(0); /*NOTREACHED*/
15316         }
15317
15318         IRDirty* d = unsafeIRDirty_0_N (
15319                          0/*regparms*/,
15320                          "x86g_dirtyhelper_SxDT",
15321                          &x86g_dirtyhelper_SxDT,
15322                          mkIRExprVec_2( mkexpr(addr),
15323                                         mkU32(gregOfRM(modrm)) )
15324                      );
15325         /* declare we're writing memory */
15326         d->mFx   = Ifx_Write;
15327         d->mAddr = mkexpr(addr);
15328         d->mSize = 6;
15329         stmt( IRStmt_Dirty(d) );
15330         break;
15331      }
15332
15333      case 0x05: /* AMD's syscall */
15334         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
15335              mkU32(guest_EIP_curr_instr) ) );
15336         jmp_lit(&dres, Ijk_Sys_syscall, ((Addr32)guest_EIP_bbstart)+delta);
15337         vassert(dres.whatNext == Dis_StopHere);
15338         DIP("syscall\n");
15339         break;
15340
15341      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
15342
15343      default:
15344         goto decode_failure;
15345   } /* switch (opc) for the 2-byte opcodes */
15346   goto decode_success;
15347   } /* case 0x0F: of primary opcode */
15348
15349   /* ------------------------ ??? ------------------------ */
15350
15351  default:
15352  decode_failure:
15353   /* All decode failures end up here. */
15354   if (sigill_diag) {
15355      vex_printf("vex x86->IR: unhandled instruction bytes: "
15356                 "0x%x 0x%x 0x%x 0x%x\n",
15357                 (Int)getIByte(delta_start+0),
15358                 (Int)getIByte(delta_start+1),
15359                 (Int)getIByte(delta_start+2),
15360                 (Int)getIByte(delta_start+3) );
15361   }
15362
15363   /* Tell the dispatcher that this insn cannot be decoded, and so has
15364      not been executed, and (is currently) the next to be executed.
15365      EIP should be up-to-date since it made so at the start of each
15366      insn, but nevertheless be paranoid and update it again right
15367      now. */
15368   stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
15369   jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
15370   vassert(dres.whatNext == Dis_StopHere);
15371   dres.len = 0;
15372   /* We also need to say that a CAS is not expected now, regardless
15373      of what it might have been set to at the start of the function,
      since the IR that we've emitted just above (to synthesise a
15375      SIGILL) does not involve any CAS, and presumably no other IR has
15376      been emitted for this (non-decoded) insn. */
15377   *expect_CAS = False;
15378   return dres;
15379
15380   } /* switch (opc) for the main (primary) opcode switch. */
15381
15382  decode_success:
15383   /* All decode successes end up here. */
15384   switch (dres.whatNext) {
15385      case Dis_Continue:
15386         stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
15387         break;
15388      case Dis_ResteerU:
15389      case Dis_ResteerC:
15390         stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
15391         break;
15392      case Dis_StopHere:
15393         break;
15394      default:
15395         vassert(0);
15396   }
15397
15398   DIP("\n");
15399   dres.len = delta - delta_start;
15400   return dres;
15401}
15402
15403#undef DIP
15404#undef DIS
15405
15406
15407/*------------------------------------------------------------*/
15408/*--- Top-level fn                                         ---*/
15409/*------------------------------------------------------------*/
15410
15411/* Disassemble a single instruction into IR.  The instruction
15412   is located in host memory at &guest_code[delta]. */
15413
15414DisResult disInstr_X86 ( IRSB*        irsb_IN,
15415                         Bool         (*resteerOkFn) ( void*, Addr64 ),
15416                         Bool         resteerCisOk,
15417                         void*        callback_opaque,
15418                         UChar*       guest_code_IN,
15419                         Long         delta,
15420                         Addr64       guest_IP,
15421                         VexArch      guest_arch,
15422                         VexArchInfo* archinfo,
15423                         VexAbiInfo*  abiinfo,
15424                         Bool         host_bigendian_IN,
15425                         Bool         sigill_diag_IN )
15426{
15427   Int       i, x1, x2;
15428   Bool      expect_CAS, has_CAS;
15429   DisResult dres;
15430
15431   /* Set globals (see top of this file) */
15432   vassert(guest_arch == VexArchX86);
15433   guest_code           = guest_code_IN;
15434   irsb                 = irsb_IN;
15435   host_is_bigendian    = host_bigendian_IN;
15436   guest_EIP_curr_instr = (Addr32)guest_IP;
15437   guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
15438
15439   x1 = irsb_IN->stmts_used;
15440   expect_CAS = False;
15441   dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
15442                             resteerCisOk,
15443                             callback_opaque,
15444                             delta, archinfo, abiinfo, sigill_diag_IN );
15445   x2 = irsb_IN->stmts_used;
15446   vassert(x2 >= x1);
15447
15448   /* See comment at the top of disInstr_X86_WRK for meaning of
15449      expect_CAS.  Here, we (sanity-)check for the presence/absence of
15450      IRCAS as directed by the returned expect_CAS value. */
15451   has_CAS = False;
15452   for (i = x1; i < x2; i++) {
15453      if (irsb_IN->stmts[i]->tag == Ist_CAS)
15454         has_CAS = True;
15455   }
15456
15457   if (expect_CAS != has_CAS) {
15458      /* inconsistency detected.  re-disassemble the instruction so as
15459         to generate a useful error message; then assert. */
15460      vex_traceflags |= VEX_TRACE_FE;
15461      dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
15462                                resteerCisOk,
15463                                callback_opaque,
15464                                delta, archinfo, abiinfo, sigill_diag_IN );
15465      for (i = x1; i < x2; i++) {
15466         vex_printf("\t\t");
15467         ppIRStmt(irsb_IN->stmts[i]);
15468         vex_printf("\n");
15469      }
15470      /* Failure of this assertion is serious and denotes a bug in
15471         disInstr. */
15472      vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
15473   }
15474
15475   return dres;
15476}
15477
15478
15479/*--------------------------------------------------------------------*/
15480/*--- end                                         guest_x86_toIR.c ---*/
15481/*--------------------------------------------------------------------*/
15482