host_x86_isel.c revision 9bea4c13fca0e3bb4b719dcb3ed63d47d479294e

/*---------------------------------------------------------------*/
/*--- begin                                   host_x86_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2010 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_x86_defs.h"

/* TODO 21 Apr 2005:

   -- (Really an assembler issue) don't emit CMov32 as a cmov
      insn, since that's expensive on P4 and conditional branch
      is cheaper if (as we expect) the condition is highly predictable

   -- preserve xmm registers across function calls (by declaring them
      as trashed by call insns)

   -- preserve x87 ST stack discipline across function calls.  Sigh.

   -- Check doHelperCall: if a call is conditional, we cannot safely
      compute any regparm args directly to registers.  Hence, the
      fast-regparm marshalling should be restricted to unconditional
      calls only.
*/

/*---------------------------------------------------------*/
/*--- x87 control word stuff                            ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to a FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be unchanged
   at exit.
*/

#define DEFAULT_FPUCW 0x027F
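
/* For reference, a sketch of how these two values decode, per the
   IA-32 control-register layouts:

      0x027F (%fpucw): bits 5..0 set   = all x87 exceptions masked
                       PC (bits 9:8)   = 10b, 53-bit precision
                       RC (bits 11:10) = 00b, round to nearest

      0x1F80 (%mxcsr): bits 12..7 set  = all SSE exceptions masked
                       RC (bits 14:13) = 00b, round to nearest
*/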

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */


/*---------------------------------------------------------*/
/*--- misc helpers                                      ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-x86/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

static Bool isZeroU8 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}

static Bool isZeroU32 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}

static Bool isZeroU64 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}

/*---------------------------------------------------------*/
/*--- ISelEnv                                           ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register(s) are associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

        - vregmap   holds the primary register for the IRTemp.
        - vregmapHI is only used for 64-bit integer-typed
             IRTemps.  It holds the identity of a second
             32-bit virtual HReg, which holds the high half
             of the value.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   Note, this is all host-independent.  */

typedef
   struct {
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      HInstrArray* code;

      Int          vreg_ctr;

      UInt         hwcaps;
   }
   ISelEnv;


static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   vassert(env->vregmapHI[tmp] != INVALID_HREG);
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, X86Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppX86Instr(instr, False);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcInt32, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegF ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                        ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk functions do the real work, but must not be called
   directly.  For each XXX, call iselXXX instead: it invokes its
   iselXXX_wrk counterpart and then checks that all returned
   registers are virtual.
*/
static X86RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
static X86RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );

static X86RI*      iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
static X86RI*      iselIntExpr_RI     ( ISelEnv* env, IRExpr* e );

static X86RM*      iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
static X86RM*      iselIntExpr_RM     ( ISelEnv* env, IRExpr* e );

static HReg        iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselIntExpr_R     ( ISelEnv* env, IRExpr* e );

static X86AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
static X86AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );

static void        iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );
static void        iselInt64Expr     ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );

static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
static X86CondCode iselCondCode     ( ISelEnv* env, IRExpr* e );

static HReg        iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselDblExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselFltExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselVecExpr     ( ISelEnv* env, IRExpr* e );


/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                ---*/
/*---------------------------------------------------------*/

/* Make an int reg-reg move. */

static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt32);
   vassert(hregClass(dst) == HRcInt32);
   return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
}


/* Make a vector reg-reg move. */

static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return X86Instr_SseReRg(Xsse_MOV, src, dst);
}

/* Advance/retreat %esp by n. */

static void add_to_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
}

static void sub_from_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
}


/* Given an amode, return one which references 4 bytes further
   along. */

static X86AMode* advance4 ( X86AMode* am )
{
   X86AMode* am4 = dopyX86AMode(am);
   switch (am4->tag) {
      case Xam_IRRS:
         am4->Xam.IRRS.imm += 4; break;
      case Xam_IR:
         am4->Xam.IR.imm += 4; break;
      default:
         vpanic("advance4(x86,host)");
   }
   return am4;
}


/* Push an arg onto the host stack, in preparation for a call to a
   helper function of some kind.  Returns the number of 32-bit words
   pushed. */

static Int pushArg ( ISelEnv* env, IRExpr* arg )
{
   IRType arg_ty = typeOfIRExpr(env->type_env, arg);
   if (arg_ty == Ity_I32) {
      addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
      return 1;
   } else
   if (arg_ty == Ity_I64) {
      HReg rHi, rLo;
      iselInt64Expr(&rHi, &rLo, env, arg);
      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
      return 2;
   }
   ppIRExpr(arg);
   vpanic("pushArg(x86): can't handle arg of this type");
}


/* Complete the call to a helper function, by calling the
   helper and clearing the args off the stack. */

static
void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
                              IRCallee* cee, Int n_arg_ws )
{
   /* Complication.  Need to decide which reg to use as the fn address
      pointer, in a way that doesn't trash regparm-passed
      parameters. */
   vassert(sizeof(void*) == 4);

   addInstr(env, X86Instr_Call( cc, toUInt(Ptr_to_ULong(cee->addr)),
                                    cee->regparms));
   if (n_arg_ws > 0)
      add_to_esp(env, 4*n_arg_ws);
}


/* Used only in doHelperCall.  See big comment in doHelperCall re
   handling of regparm args.  This function figures out whether
   evaluation of an expression might require use of a fixed register.
   If in doubt return True (safe but suboptimal).
*/
static
Bool mightRequireFixedRegs ( IRExpr* e )
{
   switch (e->tag) {
      case Iex_RdTmp: case Iex_Const: case Iex_Get:
         return False;
      default:
         return True;
   }
}
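
/* Illustrative note: a nested Iex_CCall, or an integer divide, is
   compiled via fixed registers (calls clobber %eax/%edx/%ecx; div
   needs %edx:%eax), so anything outside the three whitelisted tags
   above conservatively answers True. */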


/* Do a complete function call.  guard is an Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional. */

static
void doHelperCall ( ISelEnv* env,
                    Bool passBBP,
                    IRExpr* guard, IRCallee* cee, IRExpr** args )
{
   X86CondCode cc;
   HReg        argregs[3];
   HReg        tmpregs[3];
   Bool        danger;
   Int         not_done_yet, n_args, n_arg_ws, stack_limit,
               i, argreg, argregX;

   /* Marshal args for a call, do the call, and clear the stack.
      Complexities to consider:

      * if passBBP is True, %ebp (the baseblock pointer) is to be
        passed as the first arg.

      * If the callee claims regparmness of 1, 2 or 3, we must pass the
        first 1, 2 or 3 args in registers (EAX, EDX, and ECX
        respectively).  To keep things relatively simple, only args of
        type I32 may be passed as regparms -- just bomb out if anything
        else turns up.  Clearly this depends on the front ends not
        trying to pass any other types as regparms.
   */

   /* 16 Nov 2004: the regparm handling is complicated by the
      following problem.

      Consider a call to a function with two regparm parameters:
      f(e1,e2).  We need to compute e1 into %eax and e2 into %edx.
      Suppose code is first generated to compute e1 into %eax.  Then,
      code is generated to compute e2 into %edx.  Unfortunately, if
      the latter code sequence uses %eax, it will trash the value of
      e1 computed by the former sequence.  This could happen if (for
      example) e2 itself involved a function call.  In the code below,
      args are evaluated right-to-left, not left-to-right, but the
      principle and the problem are the same.

      One solution is to compute all regparm-bound args into vregs
      first, and once they are all done, move them to the relevant
      real regs.  This always gives correct code, but it also gives
      a bunch of vreg-to-rreg moves which are usually redundant but
      are hard for the register allocator to get rid of.

      A compromise is to first examine all regparm'd argument
      expressions.  If they are all so simple that it is clear
      they will be evaluated without use of any fixed registers,
      use the old compute-directly-to-fixed-target scheme.  If not,
      be safe and use the via-vregs scheme.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this
      insn selector works.  Currently just the following 3 are
      regarded as safe -- hopefully they cover the majority of
      arguments in practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
   */
   vassert(cee->regparms >= 0 && cee->regparms <= 3);

   n_args = n_arg_ws = 0;
   while (args[n_args]) n_args++;

   not_done_yet = n_args;
   if (passBBP)
      not_done_yet++;

   stack_limit = cee->regparms;
   if (cee->regparms > 0 && passBBP) stack_limit--;

   /* ------ BEGIN marshall all arguments ------ */

   /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
   for (i = n_args-1; i >= stack_limit; i--) {
      n_arg_ws += pushArg(env, args[i]);
      not_done_yet--;
   }

   /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
      registers. */

   if (cee->regparms > 0) {

      /* ------ BEGIN deal with regparms ------ */

      /* deal with regparms, not forgetting %ebp if needed. */
      argregs[0] = hregX86_EAX();
      argregs[1] = hregX86_EDX();
      argregs[2] = hregX86_ECX();
      tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;

      argreg = cee->regparms;

      /* In keeping with big comment above, detect potential danger
         and use the via-vregs scheme if needed. */
      danger = False;
      for (i = stack_limit-1; i >= 0; i--) {
         if (mightRequireFixedRegs(args[i])) {
            danger = True;
            break;
         }
      }

      if (danger) {

         /* Move via temporaries */
         argregX = argreg;
         for (i = stack_limit-1; i >= 0; i--) {

            if (0) {
               vex_printf("x86 host: register param is complex: ");
               ppIRExpr(args[i]);
               vex_printf("\n");
            }

            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            tmpregs[argreg] = iselIntExpr_R(env, args[i]);
            not_done_yet--;
         }
         for (i = stack_limit-1; i >= 0; i--) {
            argregX--;
            vassert(argregX >= 0);
            addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
         }

      } else {
         /* It's safe to compute all regparm args directly into their
            target registers. */
         for (i = stack_limit-1; i >= 0; i--) {
            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          iselIntExpr_RMI(env, args[i]),
                                          argregs[argreg]));
            not_done_yet--;
         }

      }

      /* Not forgetting %ebp if needed. */
      if (passBBP) {
         vassert(argreg == 1);
         addInstr(env, mk_iMOVsd_RR( hregX86_EBP(), argregs[0]));
         not_done_yet--;
      }

      /* ------ END deal with regparms ------ */

   } else {

      /* No regparms.  Heave %ebp on the stack if needed. */
      if (passBBP) {
         addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
         n_arg_ws++;
         not_done_yet--;
      }

   }

   vassert(not_done_yet == 0);

   /* ------ END marshall all arguments ------ */

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Xcc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode( env, guard );
      }
   }

   /* call the helper, and get the args off the stack afterwards. */
   callHelperAndClearArgs( env, cc, cee, n_arg_ws );
}


/* Given a guest-state array descriptor, an index expression and a
   bias, generate an X86AMode holding the relevant guest state
   offset. */

static
X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
                                IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;
   Int  shift  = 0;

   /* throw out any cases not generated by an x86 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-x86-guest on x86 host. */

   if (nElems != 8)
      vpanic("genGuestArrayOffset(x86 host)(1)");

   switch (elemSz) {
      case 1:  shift = 0; break;
      case 4:  shift = 2; break;
      case 8:  shift = 3; break;
      default: vpanic("genGuestArrayOffset(x86 host)(2)");
   }

   /* Compute off into a reg, %off.  Then return:

         movl %off, %tmp
         addl $bias, %tmp  (if bias != 0)
         andl $7, %tmp
         ... base(%ebp, %tmp, shift) ...
   */
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      addInstr(env,
               X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
   }
   addInstr(env,
            X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
   return
      X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
}
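
/* Illustrative example (not from this file): for a descriptor
   describing an 8-entry array of F64s -- e.g. the guest x87 register
   file -- elemSz is 8, so shift is 3, and the amode produced is
   base(%ebp, %tmp, 8), where %tmp holds (ix + bias) & 7. */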


/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* pushl $DEFAULT_FPUCW
      fldcw 0(%esp)
      addl $4, %esp
   */
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}


/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

   /* movl  %rrm, %rrm2
      andl  $3, %rrm2   -- shouldn't be needed; paranoia
      shll  $10, %rrm2
      orl   $DEFAULT_FPUCW, %rrm2
      pushl %rrm2
      fldcw 0(%esp)
      addl  $4, %esp
   */
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
   addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}
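
/* The single shll $10 works because the IRRoundingMode encoding
   (0 = nearest, 1 = -infinity, 2 = +infinity, 3 = toward zero)
   coincides with the x87 RC field encoding, which occupies bits
   11:10 of %fpucw.  DEFAULT_FPUCW has RC = 00, so the subsequent
   orl cannot disturb the field. */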


/* Generate !src into a new vector register, and be sure that the code
   is SSE1 compatible.  Amazing that Intel doesn't offer a less crappy
   way to do this.
*/
static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
{
   HReg dst = newVRegV(env);
   /* Set dst to zero.  If dst contains a NaN then all hell might
      break loose after the comparison.  So, first zero it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
   /* And now make it all 1s ... */
   addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
   /* Finally, xor 'src' into it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
   /* Doesn't that just totally suck? */
   return dst;
}


/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
   after most non-simple FPU operations (simple = +, -, *, / and
   sqrt).

   This could be done a lot more efficiently if needed, by loading
   zero and adding it to the value to be rounded (fldz ; faddp?).
*/
static void roundToF64 ( ISelEnv* env, HReg reg )
{
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   sub_from_esp(env, 8);
   addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
   addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
   add_to_esp(env, 8);
}

/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (32/16/8 bit)           ---*/
/*---------------------------------------------------------*/

/* Select insns for an integer-typed expression, and add them to the
   code list.  Return a reg holding the result.  This reg will be a
   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
   want to modify it, ask for a new vreg, copy it in there, and modify
   the copy.  The register allocator will do its best to map both
   vregs to the same real register, so the copies will often disappear
   later in the game.

   This should handle expressions of 32, 16 and 8-bit type.  All
   results are returned in a 32-bit register.  For 16- and 8-bit
   expressions, the upper 16/24 bits are arbitrary, so you should mask
   or sign extend partial values if necessary.
*/

static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
{
   HReg r = iselIntExpr_R_wrk(env, e);
   /* sanity checks ... */
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcInt32);
   vassert(hregIsVirtual(r));
   return r;
}

/* DO NOT CALL THIS DIRECTLY ! */
static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;

   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   switch (e->tag) {

   /* --------- TEMP --------- */
   case Iex_RdTmp: {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   /* --------- LOAD --------- */
   case Iex_Load: {
      HReg dst = newVRegI(env);
      X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );

      /* We can't handle big-endian loads, nor load-linked. */
      if (e->Iex.Load.end != Iend_LE)
         goto irreducible;

      if (ty == Ity_I32) {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                       X86RMI_Mem(amode), dst) );
         return dst;
      }
      if (ty == Ity_I16) {
         addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
         return dst;
      }
      if (ty == Ity_I8) {
         addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
         return dst;
      }
      break;
   }

   /* --------- TERNARY OP --------- */
   case Iex_Triop: {
      /* C3210 flags following FPU partial remainder (fprem), both
         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
      if (e->Iex.Triop.op == Iop_PRemC3210F64
          || e->Iex.Triop.op == Iop_PRem1C3210F64) {
         HReg junk = newVRegF(env);
         HReg dst  = newVRegI(env);
         HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
         HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, X86Instr_FpBinary(
                           e->Iex.Triop.op==Iop_PRemC3210F64
                              ? Xfp_PREM : Xfp_PREM1,
                           srcL,srcR,junk
                 ));
         /* The previous pseudo-insn will have left the FPU's C3210
            flags set correctly.  So bag them. */
         addInstr(env, X86Instr_FpStSW_AX());
         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
         return dst;
      }

      break;
   }

   /* --------- BINARY OP --------- */
   case Iex_Binop: {
      X86AluOp   aluOp;
      X86ShiftOp shOp;

      /* Pattern: Sub32(0,x) */
      if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
         HReg dst = newVRegI(env);
         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
         return dst;
      }

      /* Is it an addition or logical style op? */
      switch (e->Iex.Binop.op) {
         case Iop_Add8: case Iop_Add16: case Iop_Add32:
            aluOp = Xalu_ADD; break;
         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
            aluOp = Xalu_SUB; break;
         case Iop_And8: case Iop_And16: case Iop_And32:
            aluOp = Xalu_AND; break;
         case Iop_Or8: case Iop_Or16: case Iop_Or32:
            aluOp = Xalu_OR; break;
         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
            aluOp = Xalu_XOR; break;
         case Iop_Mul16: case Iop_Mul32:
            aluOp = Xalu_MUL; break;
         default:
            aluOp = Xalu_INVALID; break;
      }
      /* For commutative ops we assume any literal
         values are on the second operand. */
      if (aluOp != Xalu_INVALID) {
         HReg dst    = newVRegI(env);
         HReg reg    = iselIntExpr_R(env, e->Iex.Binop.arg1);
         X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
         return dst;
      }
      /* Could do better here; forcing the first arg into a reg
         isn't always clever.
         -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
                        LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
                        t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
            movl 0xFFFFFFA0(%vr41),%vr107
            movl 0xFFFFFFA4(%vr41),%vr108
            movl %vr107,%vr106
            xorl %vr108,%vr106
            movl 0xFFFFFFA8(%vr41),%vr109
            movl %vr106,%vr105
            andl %vr109,%vr105
            movl 0xFFFFFFA0(%vr41),%vr110
            movl %vr105,%vr104
            xorl %vr110,%vr104
            movl %vr104,%vr70
      */

      /* Perhaps a shift op? */
      switch (e->Iex.Binop.op) {
         case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
            shOp = Xsh_SHL; break;
         case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
            shOp = Xsh_SHR; break;
         case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
            shOp = Xsh_SAR; break;
         default:
            shOp = Xsh_INVALID; break;
      }
      if (shOp != Xsh_INVALID) {
         HReg dst = newVRegI(env);

         /* regL = the value to be shifted */
         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, mk_iMOVsd_RR(regL,dst));

         /* Do any necessary widening for 16/8 bit operands */
         switch (e->Iex.Binop.op) {
            case Iop_Shr8:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFF), dst));
               break;
            case Iop_Shr16:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFFFF), dst));
               break;
            case Iop_Sar8:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
               break;
            case Iop_Sar16:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
               break;
            default: break;
         }

         /* Now consider the shift amount.  If it's a literal, we
            can do a much better job than the general case. */
         if (e->Iex.Binop.arg2->tag == Iex_Const) {
            /* assert that the IR is well-typed */
            Int nshift;
            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
            vassert(nshift >= 0);
            if (nshift > 0)
               /* Can't allow nshift==0 since that means %cl */
               addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
         } else {
            /* General case; we have to force the amount into %cl. */
            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
            addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
         }
         return dst;
      }

      /* Handle misc other ops. */

      if (e->Iex.Binop.op == Iop_Max32U) {
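         /* Unsigned max: dst = src1; if (dst < src2, unsigned) then
            dst = src2, via a compare and an unsigned-below (Xcc_B)
            conditional move. */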
         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg dst  = newVRegI(env);
         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(src1,dst));
         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_8HLto16) {
         HReg hi8  = newVRegI(env);
         HReg lo8  = newVRegI(env);
         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
         return hi8;
      }

      if (e->Iex.Binop.op == Iop_16HLto32) {
         HReg hi16  = newVRegI(env);
         HReg lo16  = newVRegI(env);
         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
         return hi16;
      }

      if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
          || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
         HReg a16   = newVRegI(env);
         HReg b16   = newVRegI(env);
         HReg a16s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg b16s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
         Int  shift = (e->Iex.Binop.op == Iop_MullS8
                       || e->Iex.Binop.op == Iop_MullU8)
                         ? 24 : 16;
         X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
                              || e->Iex.Binop.op == Iop_MullS16)
                                ? Xsh_SAR : Xsh_SHR;
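
         /* Widen both 8/16-bit operands to 32 bits -- shl then sar
            for the signed variants, shl then shr for the unsigned
            ones -- then do a full 32-bit multiply.  The double-width
            16/32-bit product appears in the low bits of the result. */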

         addInstr(env, mk_iMOVsd_RR(a16s, a16));
         addInstr(env, mk_iMOVsd_RR(b16s, b16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, a16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, b16));
         addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
         return b16;
      }

      if (e->Iex.Binop.op == Iop_CmpF64) {
         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_FpCmp(fL,fR,dst));
         /* shift this right 8 bits so as to conform to CmpF64
            definition. */
         addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_F64toI32S
          || e->Iex.Binop.op == Iop_F64toI16S) {
         Int  sz  = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);

         /* Used several times ... */
         X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

         /* rf now holds the value to be converted, and arg1 holds
            the rounding mode value, encoded as per the IRRoundingMode
            enum.  The first thing to do is set the FPU's rounding
            mode accordingly. */

         /* Create a space for the format conversion. */
         /* subl $4, %esp */
         sub_from_esp(env, 4);

         /* Set host rounding mode */
         set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

         /* gistw/l %rf, 0(%esp) */
         addInstr(env, X86Instr_FpLdStI(False/*store*/,
                                        toUChar(sz), rf, zero_esp));

         if (sz == 2) {
            /* movzwl 0(%esp), %dst */
            addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
         } else {
            /* movl 0(%esp), %dst */
            vassert(sz == 4);
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(zero_esp), dst));
         }

         /* Restore default FPU rounding. */
         set_FPU_rounding_default( env );

         /* addl $4, %esp */
         add_to_esp(env, 4);
         return dst;
      }

      break;
   }

   /* --------- UNARY OP --------- */
   case Iex_Unop: {

      /* 1Uto8(32to1(expr32)) */
      if (e->Iex.Unop.op == Iop_1Uto8) {
         DECLARE_PATTERN(p_32to1_then_1Uto8);
         DEFINE_PATTERN(p_32to1_then_1Uto8,
                        unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
         if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
            IRExpr* expr32 = mi.bindee[0];
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, expr32);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(1), dst));
            return dst;
         }
      }

      /* 8Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         DECLARE_PATTERN(p_LDle8_then_8Uto32);
         DEFINE_PATTERN(p_LDle8_then_8Uto32,
                        unop(Iop_8Uto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }

      /* 8Sto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Sto32) {
         DECLARE_PATTERN(p_LDle8_then_8Sto32);
         DEFINE_PATTERN(p_LDle8_then_8Sto32,
                        unop(Iop_8Sto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
            return dst;
         }
      }

      /* 16Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         DECLARE_PATTERN(p_LDle16_then_16Uto32);
         DEFINE_PATTERN(p_LDle16_then_16Uto32,
                        unop(Iop_16Uto32,
                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
         if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      /* 8Uto32(GET:I8) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }

      /* 16Uto32(GET:I16) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      switch (e->Iex.Unop.op) {
         case Iop_8Uto16:
         case Iop_8Uto32:
         case Iop_16Uto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(mask), dst));
            return dst;
         }
         case Iop_8Sto16:
         case Iop_8Sto32:
         case Iop_16Sto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
            return dst;
         }
         case Iop_Not8:
         case Iop_Not16:
         case Iop_Not32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
            return dst;
         }
         case Iop_64HIto32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rHi; /* and abandon rLo .. poor wee thing :-) */
         }
         case Iop_64to32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rLo; /* similar stupid comment to the above ... */
         }
         case Iop_16HIto8:
         case Iop_32HIto16: {
            HReg dst  = newVRegI(env);
            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
            Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
            return dst;
         }
         case Iop_1Uto32:
         case Iop_1Uto8: {
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            return dst;
         }
         case Iop_1Sto8:
         case Iop_1Sto16:
         case Iop_1Sto32: {
            /* could do better than this, but for now ... */
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
            return dst;
         }
         case Iop_Ctz32: {
            /* Count trailing zeroes, implemented by x86 'bsfl' */
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(True,src,dst));
            return dst;
         }
         case Iop_Clz32: {
            /* Count leading zeroes.  Do 'bsrl' to establish the index
               of the highest set bit, and subtract that value from
               31. */
            HReg tmp = newVRegI(env);
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(False,src,tmp));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          X86RMI_Imm(31), dst));
            addInstr(env, X86Instr_Alu32R(Xalu_SUB,
                                          X86RMI_Reg(tmp), dst));
            return dst;
         }

         case Iop_CmpwNEZ32: {
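            /* dst = (src == 0) ? 0 : ~0, computed branchlessly as
               (-src | src) >>s 31: the sign bit of (-src | src) is
               set iff src is nonzero, and the arithmetic shift
               smears it across all 32 bits. */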
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst));
            addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(src), dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
            return dst;
         }
         case Iop_Left8:
         case Iop_Left16:
         case Iop_Left32: {
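            /* Left(src) is src | -src; compute dst = -src, then
               or src back into it. */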
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src, dst));
            addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
            return dst;
         }

         case Iop_V128to32: {
            HReg      dst  = newVRegI(env);
            HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
            X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
            sub_from_esp(env, 16);
            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
            addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
            add_to_esp(env, 16);
            return dst;
         }

         /* ReinterpF32asI32(e) */
         /* Given an IEEE754 single, produce an I32 with the same bit
            pattern.  Keep stack 8-aligned even though only using 4
            bytes. */
         case Iop_ReinterpF32asI32: {
            HReg rf   = iselFltExpr(env, e->Iex.Unop.arg);
            HReg dst  = newVRegI(env);
            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
            /* paranoia */
            set_FPU_rounding_default(env);
            /* subl $8, %esp */
            sub_from_esp(env, 8);
            /* gstF %rf, 0(%esp) */
            addInstr(env,
                     X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
            /* movl 0(%esp), %dst */
            addInstr(env,
                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
            /* addl $8, %esp */
            add_to_esp(env, 8);
            return dst;
         }

         case Iop_16to8:
         case Iop_32to8:
         case Iop_32to16:
            /* These are no-ops. */
            return iselIntExpr_R(env, e->Iex.Unop.arg);

         default:
            break;
      }
      break;
   }

   /* --------- GET --------- */
   case Iex_Get: {
      if (ty == Ity_I32) {
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_Alu32R(
                          Xalu_MOV,
                          X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
                                                 hregX86_EBP())),
                          dst));
         return dst;
      }
      if (ty == Ity_I8 || ty == Ity_I16) {
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_LoadEX(
                          toUChar(ty==Ity_I8 ? 1 : 2),
                          False,
                          X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
                          dst));
         return dst;
      }
      break;
   }

   case Iex_GetI: {
      X86AMode* am
         = genGuestArrayOffset(
              env, e->Iex.GetI.descr,
                   e->Iex.GetI.ix, e->Iex.GetI.bias );
      HReg dst = newVRegI(env);
      if (ty == Ity_I8) {
         addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
         return dst;
      }
      if (ty == Ity_I32) {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
         return dst;
      }
      break;
   }

   /* --------- CCALL --------- */
   case Iex_CCall: {
      HReg    dst = newVRegI(env);
      vassert(ty == e->Iex.CCall.retty);

      /* be very restrictive for now.  Only 32/64-bit ints allowed
         for args, and 32 bits for return type. */
      if (e->Iex.CCall.retty != Ity_I32)
         goto irreducible;

      /* Marshal args, do the call, clear stack. */
      doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );

      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
      return dst;
   }

   /* --------- LITERAL --------- */
   /* 32/16/8-bit literals */
   case Iex_Const: {
      X86RMI* rmi = iselIntExpr_RMI ( env, e );
      HReg    r   = newVRegI(env);
      addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
      return r;
   }

   /* --------- MULTIPLEX --------- */
   case Iex_Mux0X: {
      if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
          && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
         X86RM* r8;
         HReg   rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
         X86RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
         HReg   dst = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(rX,dst));
         r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
         addInstr(env, X86Instr_Test32(0xFF, r8));
         addInstr(env, X86Instr_CMov32(Xcc_Z,r0,dst));
         return dst;
      }
      break;
   }

   default:
      break;
   } /* switch (e->tag) */

   /* We get here if no pattern matched. */
  irreducible:
   ppIRExpr(e);
   vpanic("iselIntExpr_R: cannot reduce tree");
}


/*---------------------------------------------------------*/
/*--- ISEL: Integer expression auxiliaries              ---*/
/*---------------------------------------------------------*/

/* --------------------- AMODEs --------------------- */

/* Return an AMode which computes the value of the specified
   expression, possibly also adding insns to the code list as a
   result.  The expression may only be a 32-bit one.
*/

static Bool sane_AMode ( X86AMode* am )
{
   switch (am->tag) {
      case Xam_IR:
         return
            toBool( hregClass(am->Xam.IR.reg) == HRcInt32
                    && (hregIsVirtual(am->Xam.IR.reg)
                        || am->Xam.IR.reg == hregX86_EBP()) );
      case Xam_IRRS:
         return
            toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
                    && hregIsVirtual(am->Xam.IRRS.base)
                    && hregClass(am->Xam.IRRS.index) == HRcInt32
                    && hregIsVirtual(am->Xam.IRRS.index) );
      default:
         vpanic("sane_AMode: unknown x86 amode tag");
   }
}

static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
{
   X86AMode* am = iselIntExpr_AMode_wrk(env, e);
   vassert(sane_AMode(am));
   return am;
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32);

   /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
       && e->Iex.Binop.arg1->tag == Iex_Binop
       && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
       && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
       && e->Iex.Binop.arg1
           ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg1
           ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
      UInt shift = e->Iex.Binop.arg1
                    ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
      UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
      if (shift == 1 || shift == 2 || shift == 3) {
         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
                                       ->Iex.Binop.arg2->Iex.Binop.arg1 );
         return X86AMode_IRRS(imm32, r1, r2, shift);
      }
   }

   /* Add32(expr1, Shl32(expr2, imm)) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Binop
       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
      if (shift == 1 || shift == 2 || shift == 3) {
         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
         return X86AMode_IRRS(0, r1, r2, shift);
      }
   }

   /* Add32(expr,i) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
      HReg r1 = iselIntExpr_R(env,  e->Iex.Binop.arg1);
      return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
   }

   /* Doesn't match anything in particular.  Generate it into
      a register and use that. */
   {
      HReg r1 = iselIntExpr_R(env, e);
      return X86AMode_IR(0, r1);
   }
}
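
/* Worked example: an address expression of the form
      Add32(Add32(t1, Shl32(t2, 2:I8)), 0x40:I32)
   matches the first pattern above with shift == 2, and so folds into
   the single amode 0x40(%t1,%t2,4), avoiding separate add/shift
   insns. */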


/* --------------------- RMIs --------------------- */

/* Similarly, calculate an expression into an X86RMI operand.  As with
   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */

static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
{
   X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   /* sanity checks ... */
   switch (rmi->tag) {
      case Xrmi_Imm:
         return rmi;
      case Xrmi_Reg:
         vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
         vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
         return rmi;
      case Xrmi_Mem:
         vassert(sane_AMode(rmi->Xrmi.Mem.am));
         return rmi;
      default:
         vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
   }
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: immediate */
   if (e->tag == Iex_Const) {
      UInt u;
      switch (e->Iex.Const.con->tag) {
         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
      }
      return X86RMI_Imm(u);
   }

   /* special case: 32-bit GET */
   if (e->tag == Iex_Get && ty == Ity_I32) {
      return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
                                    hregX86_EBP()));
   }

   /* special case: 32-bit load from memory */
   if (e->tag == Iex_Load && ty == Ity_I32
       && e->Iex.Load.end == Iend_LE) {
      X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
      return X86RMI_Mem(am);
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return X86RMI_Reg(r);
   }
}
1538
1539
1540/* --------------------- RIs --------------------- */
1541
1542/* Calculate an expression into an X86RI operand.  As with
1543   iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
1544
1545static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
1546{
1547   X86RI* ri = iselIntExpr_RI_wrk(env, e);
1548   /* sanity checks ... */
1549   switch (ri->tag) {
1550      case Xri_Imm:
1551         return ri;
1552      case Xri_Reg:
1553         vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
1554         vassert(hregIsVirtual(ri->Xri.Reg.reg));
1555         return ri;
1556      default:
1557         vpanic("iselIntExpr_RI: unknown x86 RI tag");
1558   }
1559}
1560
1561/* DO NOT CALL THIS DIRECTLY ! */
1562static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
1563{
1564   IRType ty = typeOfIRExpr(env->type_env,e);
1565   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1566
1567   /* special case: immediate */
1568   if (e->tag == Iex_Const) {
1569      UInt u;
1570      switch (e->Iex.Const.con->tag) {
1571         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1572         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1573         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
1574         default: vpanic("iselIntExpr_RI.Iex_Const(x86h)");
1575      }
1576      return X86RI_Imm(u);
1577   }
1578
1579   /* default case: calculate into a register and return that */
1580   {
1581      HReg r = iselIntExpr_R ( env, e );
1582      return X86RI_Reg(r);
1583   }
1584}
1585
1586
1587/* --------------------- RMs --------------------- */
1588
1589/* Similarly, calculate an expression into an X86RM operand.  As with
1590   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1591
1592static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
1593{
1594   X86RM* rm = iselIntExpr_RM_wrk(env, e);
1595   /* sanity checks ... */
1596   switch (rm->tag) {
1597      case Xrm_Reg:
1598         vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
1599         vassert(hregIsVirtual(rm->Xrm.Reg.reg));
1600         return rm;
1601      case Xrm_Mem:
1602         vassert(sane_AMode(rm->Xrm.Mem.am));
1603         return rm;
1604      default:
1605         vpanic("iselIntExpr_RM: unknown x86 RM tag");
1606   }
1607}
1608
1609/* DO NOT CALL THIS DIRECTLY ! */
1610static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
1611{
1612   IRType ty = typeOfIRExpr(env->type_env,e);
1613   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1614
1615   /* special case: 32-bit GET */
1616   if (e->tag == Iex_Get && ty == Ity_I32) {
1617      return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
1618                                   hregX86_EBP()));
1619   }
1620
1621   /* special case: load from memory -- not implemented; such loads
1622      simply fall through to the default case below. */
1623   /* default case: calculate into a register and return that */
1624   {
1625      HReg r = iselIntExpr_R ( env, e );
1626      return X86RM_Reg(r);
1627   }
1628}
1629
1630
1631/* --------------------- CONDCODE --------------------- */
1632
1633 /* Generate code to evaluate a bit-typed expression, returning the
1634    condition code which would be set if the expression had
1635    notionally returned 1. */
1636
1637static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
1638{
1639   /* Uh, there's nothing we can sanity check here, unfortunately. */
1640   return iselCondCode_wrk(env,e);
1641}
1642
1643/* DO NOT CALL THIS DIRECTLY ! */
1644static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
1645{
1646   MatchInfo mi;
1647
1648   vassert(e);
1649   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
1650
1651   /* var */
1652   if (e->tag == Iex_RdTmp) {
1653      HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
1654      /* Test32 doesn't modify r32, so this is OK. */
1655      addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
1656      return Xcc_NZ;
1657   }
1658
1659   /* Constant 1:Bit */
1660   if (e->tag == Iex_Const) {
1661      HReg r;
1662      vassert(e->Iex.Const.con->tag == Ico_U1);
1663      vassert(e->Iex.Const.con->Ico.U1 == True
1664              || e->Iex.Const.con->Ico.U1 == False);
1665      r = newVRegI(env);
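          /* Force the Z flag to be set: XOR r,r always yields zero
             and hence sets ZF.  (The MOV gives r an initial
             definition before the XOR reads it, presumably to keep
             the register allocator happy.)  So Xcc_Z here means
             "always true" and Xcc_NZ "always false". */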
1666      addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
1667      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
1668      return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
1669   }
1670
1671   /* Not1(e) */
1672   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
1673      /* Generate code for the arg, and negate the test condition */
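          /* X86CondCode values mirror the x86 condition-code
             encoding, in which complementary conditions differ only
             in the bottom bit; XORing with 1 therefore inverts the
             test. */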
1674      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
1675   }
1676
1677   /* --- patterns rooted at: 32to1 --- */
1678
1679   if (e->tag == Iex_Unop
1680       && e->Iex.Unop.op == Iop_32to1) {
1681      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1682      addInstr(env, X86Instr_Test32(1,rm));
1683      return Xcc_NZ;
1684   }
1685
1686   /* --- patterns rooted at: CmpNEZ8 --- */
1687
1688   /* CmpNEZ8(x) */
1689   if (e->tag == Iex_Unop
1690       && e->Iex.Unop.op == Iop_CmpNEZ8) {
1691      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1692      addInstr(env, X86Instr_Test32(0xFF,rm));
1693      return Xcc_NZ;
1694   }
1695
1696   /* --- patterns rooted at: CmpNEZ16 --- */
1697
1698   /* CmpNEZ16(x) */
1699   if (e->tag == Iex_Unop
1700       && e->Iex.Unop.op == Iop_CmpNEZ16) {
1701      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1702      addInstr(env, X86Instr_Test32(0xFFFF,rm));
1703      return Xcc_NZ;
1704   }
1705
1706   /* --- patterns rooted at: CmpNEZ32 --- */
1707
1708   /* CmpNEZ32(And32(x,y)) */
1709   {
1710      DECLARE_PATTERN(p_CmpNEZ32_And32);
1711      DEFINE_PATTERN(p_CmpNEZ32_And32,
1712                     unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
1713      if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
1714         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1715         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1716         HReg    tmp  = newVRegI(env);
1717         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1718         addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
1719         return Xcc_NZ;
1720      }
1721   }
1722
1723   /* CmpNEZ32(Or32(x,y)) */
1724   {
1725      DECLARE_PATTERN(p_CmpNEZ32_Or32);
1726      DEFINE_PATTERN(p_CmpNEZ32_Or32,
1727                     unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
1728      if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
1729         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1730         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1731         HReg    tmp  = newVRegI(env);
1732         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1733         addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
1734         return Xcc_NZ;
1735      }
1736   }
1737
1738   /* CmpNEZ32(GET(..):I32) */
1739   if (e->tag == Iex_Unop
1740       && e->Iex.Unop.op == Iop_CmpNEZ32
1741       && e->Iex.Unop.arg->tag == Iex_Get) {
1742      X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1743                                 hregX86_EBP());
1744      addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
1745      return Xcc_NZ;
1746   }
1747
1748   /* CmpNEZ32(x) */
1749   if (e->tag == Iex_Unop
1750       && e->Iex.Unop.op == Iop_CmpNEZ32) {
1751      HReg    r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
1752      X86RMI* rmi2 = X86RMI_Imm(0);
1753      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
1754      return Xcc_NZ;
1755   }
1756
1757   /* --- patterns rooted at: CmpNEZ64 --- */
1758
1759   /* CmpNEZ64(Or64(x,y)) */
1760   {
1761      DECLARE_PATTERN(p_CmpNEZ64_Or64);
1762      DEFINE_PATTERN(p_CmpNEZ64_Or64,
1763                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
1764      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
1765         HReg    hi1, lo1, hi2, lo2;
1766         HReg    tmp  = newVRegI(env);
1767         iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
1768         addInstr(env, mk_iMOVsd_RR(hi1, tmp));
1769         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
1770         iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
1771         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
1772         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
1773         return Xcc_NZ;
1774      }
1775   }
1776
1777   /* CmpNEZ64(x) */
1778   if (e->tag == Iex_Unop
1779       && e->Iex.Unop.op == Iop_CmpNEZ64) {
1780      HReg hi, lo;
1781      HReg tmp = newVRegI(env);
1782      iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
1783      addInstr(env, mk_iMOVsd_RR(hi, tmp));
1784      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
1785      return Xcc_NZ;
1786   }
1787
1788   /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */
1789
1790   /* CmpEQ8 / CmpNE8 */
1791   if (e->tag == Iex_Binop
1792       && (e->Iex.Binop.op == Iop_CmpEQ8
1793           || e->Iex.Binop.op == Iop_CmpNE8
1794           || e->Iex.Binop.op == Iop_CasCmpEQ8
1795           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
1796      if (isZeroU8(e->Iex.Binop.arg2)) {
1797         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1798         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
1799         switch (e->Iex.Binop.op) {
1800            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1801            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1802            default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
1803         }
1804      } else {
1805         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1806         X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1807         HReg    r    = newVRegI(env);
1808         addInstr(env, mk_iMOVsd_RR(r1,r));
1809         addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
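             /* r = arg1 ^ arg2; the low 8 bits of r are zero iff the
                low bytes of the args were equal. */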
1810         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
1811         switch (e->Iex.Binop.op) {
1812            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1813            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1814            default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
1815         }
1816      }
1817   }
1818
1819   /* CmpEQ16 / CmpNE16 */
1820   if (e->tag == Iex_Binop
1821       && (e->Iex.Binop.op == Iop_CmpEQ16
1822           || e->Iex.Binop.op == Iop_CmpNE16
1823           || e->Iex.Binop.op == Iop_CasCmpEQ16
1824           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
1825      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1826      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1827      HReg    r    = newVRegI(env);
1828      addInstr(env, mk_iMOVsd_RR(r1,r));
1829      addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1830      addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
1831      switch (e->Iex.Binop.op) {
1832         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Xcc_Z;
1833         case Iop_CmpNE16: case Iop_CasCmpNE16: return Xcc_NZ;
1834         default: vpanic("iselCondCode(x86): CmpXX16");
1835      }
1836   }
1837
1838   /* Cmp*32*(x,y) */
1839   if (e->tag == Iex_Binop
1840       && (e->Iex.Binop.op == Iop_CmpEQ32
1841           || e->Iex.Binop.op == Iop_CmpNE32
1842           || e->Iex.Binop.op == Iop_CmpLT32S
1843           || e->Iex.Binop.op == Iop_CmpLT32U
1844           || e->Iex.Binop.op == Iop_CmpLE32S
1845           || e->Iex.Binop.op == Iop_CmpLE32U
1846           || e->Iex.Binop.op == Iop_CasCmpEQ32
1847           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
1848      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1849      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1850      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
1851      switch (e->Iex.Binop.op) {
1852         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
1853         case Iop_CmpNE32: case Iop_CasCmpNE32: return Xcc_NZ;
1854         case Iop_CmpLT32S: return Xcc_L;
1855         case Iop_CmpLT32U: return Xcc_B;
1856         case Iop_CmpLE32S: return Xcc_LE;
1857         case Iop_CmpLE32U: return Xcc_BE;
1858         default: vpanic("iselCondCode(x86): CmpXX32");
1859      }
1860   }
1861
1862   /* CmpNE64 */
1863   if (e->tag == Iex_Binop
1864       && (e->Iex.Binop.op == Iop_CmpNE64
1865           || e->Iex.Binop.op == Iop_CmpEQ64)) {
1866      HReg hi1, hi2, lo1, lo2;
1867      HReg tHi = newVRegI(env);
1868      HReg tLo = newVRegI(env);
1869      iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
1870      iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
1871      addInstr(env, mk_iMOVsd_RR(hi1, tHi));
1872      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
1873      addInstr(env, mk_iMOVsd_RR(lo1, tLo));
1874      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
1875      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
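          /* tLo now holds (hi1^hi2) | (lo1^lo2), which is zero iff
             the two 64-bit values are equal. */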
1876      switch (e->Iex.Binop.op) {
1877         case Iop_CmpNE64: return Xcc_NZ;
1878         case Iop_CmpEQ64: return Xcc_Z;
1879         default: vpanic("iselCondCode(x86): CmpXX64");
1880      }
1881   }
1882
1883   ppIRExpr(e);
1884   vpanic("iselCondCode");
1885}
1886
1887
1888/*---------------------------------------------------------*/
1889/*--- ISEL: Integer expressions (64 bit)                ---*/
1890/*---------------------------------------------------------*/
1891
1892/* Compute a 64-bit value into a register pair, which is returned as
1893    the first two parameters.  As with iselIntExpr_R, these are
1894    virtual regs, and they must not be changed by subsequent code
1895    emitted by the caller.  */
1896
1897static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
1898{
1899   iselInt64Expr_wrk(rHi, rLo, env, e);
1900#  if 0
1901   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
1902#  endif
1903   vassert(hregClass(*rHi) == HRcInt32);
1904   vassert(hregIsVirtual(*rHi));
1905   vassert(hregClass(*rLo) == HRcInt32);
1906   vassert(hregIsVirtual(*rLo));
1907}
1908
1909/* DO NOT CALL THIS DIRECTLY ! */
1910static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
1911{
1912   MatchInfo mi;
1913   HWord fn = 0; /* helper fn for most SIMD64 stuff */
1914   vassert(e);
1915   vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
1916
1917   /* 64-bit literal */
1918   if (e->tag == Iex_Const) {
1919      ULong w64 = e->Iex.Const.con->Ico.U64;
1920      UInt  wHi = toUInt(w64 >> 32);
1921      UInt  wLo = toUInt(w64);
1922      HReg  tLo = newVRegI(env);
1923      HReg  tHi = newVRegI(env);
1924      vassert(e->Iex.Const.con->tag == Ico_U64);
1925      if (wLo == wHi) {
1926         /* Save a precious Int register in this special case. */
1927         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
1928         *rHi = tLo;
1929         *rLo = tLo;
1930      } else {
1931         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
1932         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
1933         *rHi = tHi;
1934         *rLo = tLo;
1935      }
1936      return;
1937   }
1938
1939   /* read 64-bit IRTemp */
1940   if (e->tag == Iex_RdTmp) {
1941      lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
1942      return;
1943   }
1944
1945   /* 64-bit load */
1946   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
1947      HReg     tLo, tHi;
1948      X86AMode *am0, *am4;
1949      vassert(e->Iex.Load.ty == Ity_I64);
1950      tLo = newVRegI(env);
1951      tHi = newVRegI(env);
1952      am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
1953      am4 = advance4(am0);
1954      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
1955      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
1956      *rHi = tHi;
1957      *rLo = tLo;
1958      return;
1959   }
1960
1961   /* 64-bit GET */
1962   if (e->tag == Iex_Get) {
1963      X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
1964      X86AMode* am4 = advance4(am);
1965      HReg tLo = newVRegI(env);
1966      HReg tHi = newVRegI(env);
1967      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
1968      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
1969      *rHi = tHi;
1970      *rLo = tLo;
1971      return;
1972   }
1973
1974   /* 64-bit GETI */
1975   if (e->tag == Iex_GetI) {
1976      X86AMode* am
1977         = genGuestArrayOffset( env, e->Iex.GetI.descr,
1978                                     e->Iex.GetI.ix, e->Iex.GetI.bias );
1979      X86AMode* am4 = advance4(am);
1980      HReg tLo = newVRegI(env);
1981      HReg tHi = newVRegI(env);
1982      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
1983      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
1984      *rHi = tHi;
1985      *rLo = tLo;
1986      return;
1987   }
1988
1989   /* 64-bit Mux0X: Mux0X(g, expr, 0:I64) */
1990   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.exprX)) {
1991      X86RM* r8;
1992      HReg e0Lo, e0Hi;
1993      HReg tLo = newVRegI(env);
1994      HReg tHi = newVRegI(env);
1995      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1996      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
1997      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
1998      addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
1999      addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
2000      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
2001      addInstr(env, X86Instr_Test32(0xFF, r8));
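          /* The pushed zero provides a 32-bit zero at 0(%esp) for
             both cmovs to read, since an X86RM source cannot be an
             immediate. */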
2002      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tHi));
2003      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tLo));
2004      add_to_esp(env, 4);
2005      *rHi = tHi;
2006      *rLo = tLo;
2007      return;
2008   }
2009   /* 64-bit Mux0X: Mux0X(g, 0:I64, expr) */
2010   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.expr0)) {
2011      X86RM* r8;
2012      HReg e0Lo, e0Hi;
2013      HReg tLo = newVRegI(env);
2014      HReg tHi = newVRegI(env);
2015      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2016      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.exprX);
2017      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
2018      addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
2019      addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
2020      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
2021      addInstr(env, X86Instr_Test32(0xFF, r8));
2022      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tHi));
2023      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tLo));
2024      add_to_esp(env, 4);
2025      *rHi = tHi;
2026      *rLo = tLo;
2027      return;
2028   }
2029
2030   /* 64-bit Mux0X: Mux0X(g, expr, expr) */
2031   if (e->tag == Iex_Mux0X) {
2032      X86RM* r8;
2033      HReg e0Lo, e0Hi, eXLo, eXHi;
2034      HReg tLo = newVRegI(env);
2035      HReg tHi = newVRegI(env);
2036      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
2037      iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
2038      addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
2039      addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
2040      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
2041      addInstr(env, X86Instr_Test32(0xFF, r8));
2042      /* This assumes the first cmov32 doesn't trash the condition
2043         codes, so they are still available for the second cmov32 */
2044      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
2045      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
2046      *rHi = tHi;
2047      *rLo = tLo;
2048      return;
2049   }
2050
2051   /* --------- BINARY ops --------- */
2052   if (e->tag == Iex_Binop) {
2053      switch (e->Iex.Binop.op) {
2054         /* 32 x 32 -> 64 multiply */
2055         case Iop_MullU32:
2056         case Iop_MullS32: {
2057            /* get one operand into %eax, and the other into an R/M.
2058               Need to make an educated guess about which operand is
2059               better in which position. */
2060            HReg   tLo    = newVRegI(env);
2061            HReg   tHi    = newVRegI(env);
2062            Bool   syned  = toBool(e->Iex.Binop.op == Iop_MullS32);
2063            X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2064            HReg   rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2065            addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
2066            addInstr(env, X86Instr_MulL(syned, rmLeft));
2067            /* Result is now in EDX:EAX.  Tell the caller. */
2068            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2069            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2070            *rHi = tHi;
2071            *rLo = tLo;
2072            return;
2073         }
2074
2075         /* 64 x 32 -> (32(rem),32(div)) division */
2076         case Iop_DivModU64to32:
2077         case Iop_DivModS64to32: {
2078            /* Get the 64-bit operand into edx:eax, and the other into
2079               any old R/M. */
2080            HReg sHi, sLo;
2081            HReg   tLo     = newVRegI(env);
2082            HReg   tHi     = newVRegI(env);
2083            Bool   syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
2084            X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2085            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2086            addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
2087            addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
2088            addInstr(env, X86Instr_Div(syned, rmRight));
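                /* div leaves the quotient in %eax and the remainder
                   in %edx -- exactly the (rem,div) = hi:lo pairing
                   this IROp requires. */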
2089            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2090            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2091            *rHi = tHi;
2092            *rLo = tLo;
2093            return;
2094         }
2095
2096         /* Or64/And64/Xor64 */
2097         case Iop_Or64:
2098         case Iop_And64:
2099         case Iop_Xor64: {
2100            HReg xLo, xHi, yLo, yHi;
2101            HReg tLo = newVRegI(env);
2102            HReg tHi = newVRegI(env);
2103            X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
2104                          : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
2105                          : Xalu_XOR;
2106            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2107            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2108            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2109            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
2110            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2111            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
2112            *rHi = tHi;
2113            *rLo = tLo;
2114            return;
2115         }
2116
2117         /* Add64/Sub64 */
2118         case Iop_Add64:
2119            if (e->Iex.Binop.arg2->tag == Iex_Const) {
2120               /* special case Add64(e, const) */
2121               ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
2122               UInt  wHi = toUInt(w64 >> 32);
2123               UInt  wLo = toUInt(w64);
2124               HReg  tLo = newVRegI(env);
2125               HReg  tHi = newVRegI(env);
2126               HReg  xLo, xHi;
2127               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
2128               iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2129               addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2130               addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2131               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
2132               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
2133               *rHi = tHi;
2134               *rLo = tLo;
2135               return;
2136            }
2137            /* else fall through to the generic case */
2138         case Iop_Sub64: {
2139            HReg xLo, xHi, yLo, yHi;
2140            HReg tLo = newVRegI(env);
2141            HReg tHi = newVRegI(env);
2142            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2143            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2144            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2145            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
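                /* Add/subtract the low halves first; ADC/SBB then
                   folds the resulting carry/borrow into the high
                   halves. */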
2146            if (e->Iex.Binop.op==Iop_Add64) {
2147               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
2148               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
2149            } else {
2150               addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2151               addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2152            }
2153            *rHi = tHi;
2154            *rLo = tLo;
2155            return;
2156         }
2157
2158         /* 32HLto64(e1,e2) */
2159         case Iop_32HLto64:
2160            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2161            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2162            return;
2163
2164         /* 64-bit shifts */
2165         case Iop_Shl64: {
2166            /* We use the same ingenious scheme as gcc.  Put the value
2167               to be shifted into %hi:%lo, and the shift amount into
2168               %cl.  Then (dsts on right, a la ATT syntax):
2169
2170               shldl %cl, %lo, %hi   -- make %hi be right for the
2171                                     -- shift amt %cl % 32
2172               shll  %cl, %lo        -- make %lo be right for the
2173                                     -- shift amt %cl % 32
2174
2175               Now, if (shift amount % 64) is in the range 32 .. 63,
2176               we have to do a fixup, which puts the result low half
2177               into the result high half, and zeroes the low half:
2178
2179               testl $32, %ecx
2180
2181               cmovnz %lo, %hi
2182               movl $0, %tmp         -- sigh; need yet another reg
2183               cmovnz %tmp, %lo
2184            */
2185            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2186            tLo = newVRegI(env);
2187            tHi = newVRegI(env);
2188            tTemp = newVRegI(env);
2189            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2190            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2191            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2192            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2193            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2194            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2195               and those regs are legitimately modifiable. */
2196            addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
2197            addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
2198            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2199            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
2200            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2201            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
2202            *rHi = tHi;
2203            *rLo = tLo;
2204            return;
2205         }
2206
2207         case Iop_Shr64: {
2208            /* We use the same ingenious scheme as gcc.  Put the value
2209               to be shifted into %hi:%lo, and the shift amount into
2210               %cl.  Then:
2211
2212               shrdl %cl, %hi, %lo   -- make %lo be right for the
2213                                     -- shift amt %cl % 32
2214               shrl  %cl, %hi        -- make %hi be right for the
2215                                     -- shift amt %cl % 32
2216
2217               Now, if (shift amount % 64) is in the range 32 .. 63,
2218               we have to do a fixup, which puts the result high half
2219               into the result low half, and zeroes the high half:
2220
2221               testl $32, %ecx
2222
2223               cmovnz %hi, %lo
2224               movl $0, %tmp         -- sigh; need yet another reg
2225               cmovnz %tmp, %hi
2226            */
2227            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2228            tLo = newVRegI(env);
2229            tHi = newVRegI(env);
2230            tTemp = newVRegI(env);
2231            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2232            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2233            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2234            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2235            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2236            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2237               and those regs are legitimately modifiable. */
2238            addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
2239            addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
2240            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2241            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
2242            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2243            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
2244            *rHi = tHi;
2245            *rLo = tLo;
2246            return;
2247         }
2248
2249         /* F64 -> I64 */
2250         /* Sigh, this is an almost exact copy of the F64 -> I32/I16
2251            case.  Unfortunately I see no easy way to avoid the
2252            duplication. */
2253         case Iop_F64toI64S: {
2254            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
2255            HReg tLo = newVRegI(env);
2256            HReg tHi = newVRegI(env);
2257
2258            /* Used several times ... */
2259            /* Careful ... this sharing is only safe because
2260               zero_esp/four_esp do not hold any registers which the
2261               register allocator could attempt to swizzle later. */
2262            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2263            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2264
2265            /* rf now holds the value to be converted; the rounding
2266               mode, encoded as per the IRRoundingMode enum, comes
2267               from e->Iex.Binop.arg1.  The first thing to do is set
2268               the FPU's rounding mode accordingly. */
2269
2270            /* Create a space for the format conversion. */
2271            /* subl $8, %esp */
2272            sub_from_esp(env, 8);
2273
2274            /* Set host rounding mode */
2275            set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2276
2277            /* gistll %rf, 0(%esp) */
2278            addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
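                /* The 8-byte integer store (fistpll) converts using
                   the FPU's current rounding mode, which has just
                   been set. */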
2279
2280            /* movl 0(%esp), %dstLo */
2281            /* movl 4(%esp), %dstHi */
2282            addInstr(env, X86Instr_Alu32R(
2283                             Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2284            addInstr(env, X86Instr_Alu32R(
2285                             Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2286
2287            /* Restore default FPU rounding. */
2288            set_FPU_rounding_default( env );
2289
2290            /* addl $8, %esp */
2291            add_to_esp(env, 8);
2292
2293            *rHi = tHi;
2294            *rLo = tLo;
2295            return;
2296         }
2297
2298         case Iop_Add8x8:
2299            fn = (HWord)h_generic_calc_Add8x8; goto binnish;
2300         case Iop_Add16x4:
2301            fn = (HWord)h_generic_calc_Add16x4; goto binnish;
2302         case Iop_Add32x2:
2303            fn = (HWord)h_generic_calc_Add32x2; goto binnish;
2304
2305         case Iop_Avg8Ux8:
2306            fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
2307         case Iop_Avg16Ux4:
2308            fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;
2309
2310         case Iop_CmpEQ8x8:
2311            fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
2312         case Iop_CmpEQ16x4:
2313            fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
2314         case Iop_CmpEQ32x2:
2315            fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;
2316
2317         case Iop_CmpGT8Sx8:
2318            fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
2319         case Iop_CmpGT16Sx4:
2320            fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
2321         case Iop_CmpGT32Sx2:
2322            fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;
2323
2324         case Iop_InterleaveHI8x8:
2325            fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
2326         case Iop_InterleaveLO8x8:
2327            fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
2328         case Iop_InterleaveHI16x4:
2329            fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
2330         case Iop_InterleaveLO16x4:
2331            fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
2332         case Iop_InterleaveHI32x2:
2333            fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
2334         case Iop_InterleaveLO32x2:
2335            fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
2336         case Iop_CatOddLanes16x4:
2337            fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
2338         case Iop_CatEvenLanes16x4:
2339            fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
2340         case Iop_Perm8x8:
2341            fn = (HWord)h_generic_calc_Perm8x8; goto binnish;
2342
2343         case Iop_Max8Ux8:
2344            fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
2345         case Iop_Max16Sx4:
2346            fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
2347         case Iop_Min8Ux8:
2348            fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
2349         case Iop_Min16Sx4:
2350            fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;
2351
2352         case Iop_Mul16x4:
2353            fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
2354         case Iop_Mul32x2:
2355            fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
2356         case Iop_MulHi16Sx4:
2357            fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
2358         case Iop_MulHi16Ux4:
2359            fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;
2360
2361         case Iop_QAdd8Sx8:
2362            fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
2363         case Iop_QAdd16Sx4:
2364            fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
2365         case Iop_QAdd8Ux8:
2366            fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
2367         case Iop_QAdd16Ux4:
2368            fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;
2369
2370         case Iop_QNarrow32Sx2:
2371            fn = (HWord)h_generic_calc_QNarrow32Sx2; goto binnish;
2372         case Iop_QNarrow16Sx4:
2373            fn = (HWord)h_generic_calc_QNarrow16Sx4; goto binnish;
2374         case Iop_QNarrow16Ux4:
2375            fn = (HWord)h_generic_calc_QNarrow16Ux4; goto binnish;
2376
2377         case Iop_QSub8Sx8:
2378            fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
2379         case Iop_QSub16Sx4:
2380            fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
2381         case Iop_QSub8Ux8:
2382            fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
2383         case Iop_QSub16Ux4:
2384            fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;
2385
2386         case Iop_Sub8x8:
2387            fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
2388         case Iop_Sub16x4:
2389            fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
2390         case Iop_Sub32x2:
2391            fn = (HWord)h_generic_calc_Sub32x2; goto binnish;
2392
2393         binnish: {
2394            /* Note: the following assumes all helpers are of
2395               signature
2396                  ULong fn ( ULong, ULong ), and they are
2397               not marked as regparm functions.
2398            */
2399            HReg xLo, xHi, yLo, yHi;
2400            HReg tLo = newVRegI(env);
2401            HReg tHi = newVRegI(env);
2402            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2403            addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
2404            addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
2405            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2406            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2407            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2408            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
2409            add_to_esp(env, 4*4);
2410            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2411            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2412            *rHi = tHi;
2413            *rLo = tLo;
2414            return;
2415         }
2416
2417         case Iop_ShlN32x2:
2418            fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
2419         case Iop_ShlN16x4:
2420            fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
2421         case Iop_ShlN8x8:
2422            fn = (HWord)h_generic_calc_ShlN8x8;  goto shifty;
2423         case Iop_ShrN32x2:
2424            fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
2425         case Iop_ShrN16x4:
2426            fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
2427         case Iop_SarN32x2:
2428            fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
2429         case Iop_SarN16x4:
2430            fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
2431         case Iop_SarN8x8:
2432            fn = (HWord)h_generic_calc_SarN8x8;  goto shifty;
2433         shifty: {
2434            /* Note: the following assumes all helpers are of
2435               signature
2436                  ULong fn ( ULong, UInt ), and they are
2437               not marked as regparm functions.
2438            */
2439            HReg xLo, xHi;
2440            HReg tLo = newVRegI(env);
2441            HReg tHi = newVRegI(env);
2442            X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2443            addInstr(env, X86Instr_Push(y));
2444            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2445            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2446            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2447            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
2448            add_to_esp(env, 3*4);
2449            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2450            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2451            *rHi = tHi;
2452            *rLo = tLo;
2453            return;
2454         }
2455
2456         default:
2457            break;
2458      }
2459   } /* if (e->tag == Iex_Binop) */
2460
2461
2462   /* --------- UNARY ops --------- */
2463   if (e->tag == Iex_Unop) {
2464      switch (e->Iex.Unop.op) {
2465
2466         /* 32Sto64(e) */
2467         case Iop_32Sto64: {
2468            HReg tLo = newVRegI(env);
2469            HReg tHi = newVRegI(env);
2470            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2471            addInstr(env, mk_iMOVsd_RR(src,tHi));
2472            addInstr(env, mk_iMOVsd_RR(src,tLo));
2473            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
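                /* SAR by 31 replicates the sign bit of src across
                   all of tHi. */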
2474            *rHi = tHi;
2475            *rLo = tLo;
2476            return;
2477         }
2478
2479         /* 32Uto64(e) */
2480         case Iop_32Uto64: {
2481            HReg tLo = newVRegI(env);
2482            HReg tHi = newVRegI(env);
2483            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2484            addInstr(env, mk_iMOVsd_RR(src,tLo));
2485            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2486            *rHi = tHi;
2487            *rLo = tLo;
2488            return;
2489         }
2490
2491         /* 16Uto64(e) */
2492         case Iop_16Uto64: {
2493            HReg tLo = newVRegI(env);
2494            HReg tHi = newVRegI(env);
2495            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2496            addInstr(env, mk_iMOVsd_RR(src,tLo));
2497            addInstr(env, X86Instr_Alu32R(Xalu_AND,
2498                                          X86RMI_Imm(0xFFFF), tLo));
2499            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2500            *rHi = tHi;
2501            *rLo = tLo;
2502            return;
2503         }
2504
2505         /* V128{HI}to64 */
2506         case Iop_V128HIto64:
2507         case Iop_V128to64: {
2508            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
2509            HReg tLo = newVRegI(env);
2510            HReg tHi = newVRegI(env);
2511            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
2512            X86AMode* esp0  = X86AMode_IR(0,     hregX86_ESP());
2513            X86AMode* espLO = X86AMode_IR(off,   hregX86_ESP());
2514            X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
2515            sub_from_esp(env, 16);
2516            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
2517            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2518                                           X86RMI_Mem(espLO), tLo ));
2519            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2520                                           X86RMI_Mem(espHI), tHi ));
2521            add_to_esp(env, 16);
2522            *rHi = tHi;
2523            *rLo = tLo;
2524            return;
2525         }
2526
2527         /* could do better than this, but for now ... */
2528         case Iop_1Sto64: {
2529            HReg tLo = newVRegI(env);
2530            HReg tHi = newVRegI(env);
2531            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
2532            addInstr(env, X86Instr_Set32(cond,tLo));
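                /* Set32 leaves 0 or 1 in tLo; SHL then SAR by 31
                   smears bit 0 across the whole word, giving 0 or
                   0xFFFFFFFF. */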
2533            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
2534            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
2535            addInstr(env, mk_iMOVsd_RR(tLo, tHi));
2536            *rHi = tHi;
2537            *rLo = tLo;
2538            return;
2539         }
2540
2541         /* Not64(e) */
2542         case Iop_Not64: {
2543            HReg tLo = newVRegI(env);
2544            HReg tHi = newVRegI(env);
2545            HReg sHi, sLo;
2546            iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
2547            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2548            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2549            addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
2550            addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
2551            *rHi = tHi;
2552            *rLo = tLo;
2553            return;
2554         }
2555
2556         /* Left64(e) */
2557         case Iop_Left64: {
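                /* Left64(x) == x | -x: every bit at and above the
                   least significant 1 bit of x becomes set. */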
2558            HReg yLo, yHi;
2559            HReg tLo = newVRegI(env);
2560            HReg tHi = newVRegI(env);
2561            /* yHi:yLo = arg */
2562            iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
2563            /* tLo = 0 - yLo, and set carry */
2564            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
2565            addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2566            /* tHi = 0 - yHi - carry */
2567            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2568            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2569            /* So now we have tHi:tLo = -arg.  To finish off, or 'arg'
2570               back in, so as to give the final result
2571               tHi:tLo = arg | -arg. */
2572            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
2573            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
2574            *rHi = tHi;
2575            *rLo = tLo;
2576            return;
2577         }
2578
2579         /* --- patterns rooted at: CmpwNEZ64 --- */
2580
2581         /* CmpwNEZ64(e) */
2582         case Iop_CmpwNEZ64: {
2583
2584         DECLARE_PATTERN(p_CmpwNEZ64_Or64);
2585         DEFINE_PATTERN(p_CmpwNEZ64_Or64,
2586                        unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
2587         if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
2588            /* CmpwNEZ64(Or64(x,y)) */
2589            HReg xHi,xLo,yHi,yLo;
2590            HReg xBoth = newVRegI(env);
2591            HReg merged = newVRegI(env);
2592            HReg tmp2 = newVRegI(env);
2593
2594            iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
2595            addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
2596            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2597                                          X86RMI_Reg(xLo),xBoth));
2598
2599            iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
2600            addInstr(env, mk_iMOVsd_RR(yHi,merged));
2601            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2602                                          X86RMI_Reg(yLo),merged));
2603            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2604                                          X86RMI_Reg(xBoth),merged));
2605
2606            /* tmp2 = (merged | -merged) >>s 31 */
2607            addInstr(env, mk_iMOVsd_RR(merged,tmp2));
2608            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2609            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2610                                          X86RMI_Reg(merged), tmp2));
2611            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
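                /* tmp2 is 0 if the original 64-bit value was zero,
                   and all-ones otherwise; using it for both halves
                   gives the widened result CmpwNEZ64 requires. */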
2612            *rHi = tmp2;
2613            *rLo = tmp2;
2614            return;
2615         } else {
2616            /* CmpwNEZ64(e) */
2617            HReg srcLo, srcHi;
2618            HReg tmp1  = newVRegI(env);
2619            HReg tmp2  = newVRegI(env);
2620            /* srcHi:srcLo = arg */
2621            iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
2622            /* tmp1 = srcHi | srcLo */
2623            addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
2624            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2625                                          X86RMI_Reg(srcLo), tmp1));
2626            /* tmp2 = (tmp1 | -tmp1) >>s 31 */
2627            addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
2628            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2629            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2630                                          X86RMI_Reg(tmp1), tmp2));
2631            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2632            *rHi = tmp2;
2633            *rLo = tmp2;
2634            return;
2635         }
2636         }
2637
2638         /* ReinterpF64asI64(e) */
2639         /* Given an IEEE754 double, produce an I64 with the same bit
2640            pattern. */
2641         case Iop_ReinterpF64asI64: {
2642            HReg rf   = iselDblExpr(env, e->Iex.Unop.arg);
2643            HReg tLo  = newVRegI(env);
2644            HReg tHi  = newVRegI(env);
2645            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2646            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2647            /* paranoia */
2648            set_FPU_rounding_default(env);
2649            /* subl $8, %esp */
2650            sub_from_esp(env, 8);
2651            /* gstD %rf, 0(%esp) */
2652            addInstr(env,
2653                     X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
2654            /* movl 0(%esp), %tLo */
2655            addInstr(env,
2656                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2657            /* movl 4(%esp), %tHi */
2658            addInstr(env,
2659                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2660            /* addl $8, %esp */
2661            add_to_esp(env, 8);
2662            *rHi = tHi;
2663            *rLo = tLo;
2664            return;
2665         }
2666
2667         case Iop_CmpNEZ32x2:
2668            fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
2669         case Iop_CmpNEZ16x4:
2670            fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
2671         case Iop_CmpNEZ8x8:
2672            fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
2673         unish: {
2674            /* Note: the following assumes all helpers are of
2675               signature
2676                  ULong fn ( ULong ), and they are
2677               not marked as regparm functions.
2678            */
2679            HReg xLo, xHi;
2680            HReg tLo = newVRegI(env);
2681            HReg tHi = newVRegI(env);
2682            iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
2683            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2684            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2685            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
2686            add_to_esp(env, 2*4);
2687            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2688            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2689            *rHi = tHi;
2690            *rLo = tLo;
2691            return;
2692         }
2693
2694         default:
2695            break;
2696      }
2697   } /* if (e->tag == Iex_Unop) */
2698
2699
2700   /* --------- CCALL --------- */
2701   if (e->tag == Iex_CCall) {
2702      HReg tLo = newVRegI(env);
2703      HReg tHi = newVRegI(env);
2704
2705      /* Marshal args, do the call, clear stack. */
2706      doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
2707
2708      addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2709      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2710      *rHi = tHi;
2711      *rLo = tLo;
2712      return;
2713   }
2714
2715   ppIRExpr(e);
2716   vpanic("iselInt64Expr");
2717}
2718
2719
2720/*---------------------------------------------------------*/
2721/*--- ISEL: Floating point expressions (32 bit)         ---*/
2722/*---------------------------------------------------------*/
2723
2724/* Nothing interesting here; really just wrappers for
2725   64-bit stuff. */
2726
2727static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
2728{
2729   HReg r = iselFltExpr_wrk( env, e );
2730#  if 0
2731   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2732#  endif
2733   vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
2734   vassert(hregIsVirtual(r));
2735   return r;
2736}
2737
2738/* DO NOT CALL THIS DIRECTLY */
2739static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
2740{
2741   IRType ty = typeOfIRExpr(env->type_env,e);
2742   vassert(ty == Ity_F32);
2743
2744   if (e->tag == Iex_RdTmp) {
2745      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2746   }
2747
2748   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2749      X86AMode* am;
2750      HReg res = newVRegF(env);
2751      vassert(e->Iex.Load.ty == Ity_F32);
2752      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2753      addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
2754      return res;
2755   }
2756
2757   if (e->tag == Iex_Binop
2758       && e->Iex.Binop.op == Iop_F64toF32) {
2759      /* Although the result is still held in a standard FPU register,
2760         we need to round it to reflect the loss of accuracy/range
2761         entailed in casting it to a 32-bit float. */
2762      HReg dst = newVRegF(env);
2763      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2764      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2765      addInstr(env, X86Instr_Fp64to32(src,dst));
2766      set_FPU_rounding_default( env );
2767      return dst;
2768   }
2769
2770   if (e->tag == Iex_Get) {
2771      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
2772                                  hregX86_EBP() );
2773      HReg res = newVRegF(env);
2774      addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
2775      return res;
2776   }
2777
2778   if (e->tag == Iex_Unop
2779       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2780       /* Given an I32, produce an IEEE754 float with the same bit
2781          pattern. */
2782      HReg    dst = newVRegF(env);
2783      X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
2784      /* paranoia */
2785      addInstr(env, X86Instr_Push(rmi));
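          /* The 4-byte FP load reinterprets the pushed word as a
             single-precision value, so the bit pattern is carried
             over unchanged. */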
2786      addInstr(env, X86Instr_FpLdSt(
2787                       True/*load*/, 4, dst,
2788                       X86AMode_IR(0, hregX86_ESP())));
2789      add_to_esp(env, 4);
2790      return dst;
2791   }
2792
2793   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2794      HReg rf  = iselFltExpr(env, e->Iex.Binop.arg2);
2795      HReg dst = newVRegF(env);
2796
2797      /* rf now holds the value to be rounded.  The first thing to do
2798         is set the FPU's rounding mode accordingly. */
2799
2800      /* Set host rounding mode */
2801      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2802
2803      /* grndint %rf, %dst */
2804      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
2805
2806      /* Restore default FPU rounding. */
2807      set_FPU_rounding_default( env );
2808
2809      return dst;
2810   }
2811
2812   ppIRExpr(e);
2813   vpanic("iselFltExpr_wrk");
2814}
2815
2816
2817/*---------------------------------------------------------*/
2818/*--- ISEL: Floating point expressions (64 bit)         ---*/
2819/*---------------------------------------------------------*/
2820
2821/* Compute a 64-bit floating point value into a register, the identity
2822    of which is returned.  As with iselIntExpr_R, the reg is
2823    virtual, and it must not be changed by subsequent code emitted
2824    by the caller.  */
2825
2826/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2827
2828    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2829    ----                  ---------   -----------   -----------
2830    signalling NaN        u           2047 (max)    .0uuuuu---u
2831                                                    (with at least
2832                                                     one 1 bit)
2833    quiet NaN             u           2047 (max)    .1uuuuu---u
2834
2835    negative infinity     1           2047 (max)    .000000---0
2836
2837    positive infinity     0           2047 (max)    .000000---0
2838
2839    negative zero         1           0             .000000---0
2840
2841    positive zero         0           0             .000000---0
2842*/
2843
2844static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
2845{
2846   HReg r = iselDblExpr_wrk( env, e );
2847#  if 0
2848   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2849#  endif
2850   vassert(hregClass(r) == HRcFlt64);
2851   vassert(hregIsVirtual(r));
2852   return r;
2853}
2854
2855/* DO NOT CALL THIS DIRECTLY */
2856static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
2857{
2858   IRType ty = typeOfIRExpr(env->type_env,e);
2859   vassert(e);
2860   vassert(ty == Ity_F64);
2861
2862   if (e->tag == Iex_RdTmp) {
2863      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2864   }
2865
2866   if (e->tag == Iex_Const) {
2867      union { UInt u32x2[2]; ULong u64; Double f64; } u;
2868      HReg freg = newVRegF(env);
2869      vassert(sizeof(u) == 8);
2870      vassert(sizeof(u.u64) == 8);
2871      vassert(sizeof(u.f64) == 8);
2872      vassert(sizeof(u.u32x2) == 8);
2873
2874      if (e->Iex.Const.con->tag == Ico_F64) {
2875         u.f64 = e->Iex.Const.con->Ico.F64;
2876      }
2877      else if (e->Iex.Const.con->tag == Ico_F64i) {
2878         u.u64 = e->Iex.Const.con->Ico.F64i;
2879      }
2880      else
2881         vpanic("iselDblExpr(x86): const");
2882
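          /* Push the high word, then the low word, so that the low
             word ends up at 0(%esp): a little-endian 8-byte image of
             the constant, ready for the FP load. */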
      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
                                    X86AMode_IR(0, hregX86_ESP())));
      add_to_esp(env, 8);
      return freg;
   }

   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
      X86AMode* am;
      HReg res = newVRegF(env);
      vassert(e->Iex.Load.ty == Ity_F64);
      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
      return res;
   }

   if (e->tag == Iex_Get) {
      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
                                  hregX86_EBP() );
      HReg res = newVRegF(env);
      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
      return res;
   }

   if (e->tag == Iex_GetI) {
      X86AMode* am
         = genGuestArrayOffset(
              env, e->Iex.GetI.descr,
                   e->Iex.GetI.ix, e->Iex.GetI.bias );
      HReg res = newVRegF(env);
      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
      return res;
   }

   if (e->tag == Iex_Triop) {
      X86FpOp fpop = Xfp_INVALID;
      switch (e->Iex.Triop.op) {
         case Iop_AddF64:    fpop = Xfp_ADD; break;
         case Iop_SubF64:    fpop = Xfp_SUB; break;
         case Iop_MulF64:    fpop = Xfp_MUL; break;
         case Iop_DivF64:    fpop = Xfp_DIV; break;
         case Iop_ScaleF64:  fpop = Xfp_SCALE; break;
         case Iop_Yl2xF64:   fpop = Xfp_YL2X; break;
         case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
         case Iop_AtanF64:   fpop = Xfp_ATAN; break;
         case Iop_PRemF64:   fpop = Xfp_PREM; break;
         case Iop_PRem1F64:  fpop = Xfp_PREM1; break;
         default: break;
      }
      if (fpop != Xfp_INVALID) {
         HReg res  = newVRegF(env);
         HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
         HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
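         /* With the FPU control word requesting 53-bit precision
            (DEFAULT_FPUCW), the four basic ops already round their
            results to F64; the remaining ops can leave a result at
            extended precision, so force those through roundToF64 to
            get a genuine F64. */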
         if (fpop != Xfp_ADD && fpop != Xfp_SUB
             && fpop != Xfp_MUL && fpop != Xfp_DIV)
            roundToF64(env, res);
         return res;
      }
   }

   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
      HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
      HReg dst = newVRegF(env);

      /* rf now holds the value to be rounded.  The first thing to do
         is set the FPU's rounding mode accordingly. */

      /* Set host rounding mode */
      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

      /* frndint %rf, %dst */
      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));

      /* Restore default FPU rounding. */
      set_FPU_rounding_default( env );

      return dst;
   }

   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
      HReg dst = newVRegF(env);
      HReg rHi,rLo;
      iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));

      /* Set host rounding mode */
      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

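      /* fildll 0(%esp): load the 64-bit signed integer just pushed
         and convert it to F64, rounding (for values with more than
         53 significant bits) in the mode established above. */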
      addInstr(env, X86Instr_FpLdStI(
                       True/*load*/, 8, dst,
                       X86AMode_IR(0, hregX86_ESP())));

      /* Restore default FPU rounding. */
      set_FPU_rounding_default( env );

      add_to_esp(env, 8);
      return dst;
   }

   if (e->tag == Iex_Binop) {
      X86FpOp fpop = Xfp_INVALID;
      switch (e->Iex.Binop.op) {
         case Iop_SinF64:  fpop = Xfp_SIN; break;
         case Iop_CosF64:  fpop = Xfp_COS; break;
         case Iop_TanF64:  fpop = Xfp_TAN; break;
         case Iop_2xm1F64: fpop = Xfp_2XM1; break;
         case Iop_SqrtF64: fpop = Xfp_SQRT; break;
         default: break;
      }
      if (fpop != Xfp_INVALID) {
         HReg res = newVRegF(env);
         HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, X86Instr_FpUnary(fpop,src,res));
         if (fpop != Xfp_SQRT
             && fpop != Xfp_NEG && fpop != Xfp_ABS)
            roundToF64(env, res);
         return res;
      }
   }

   if (e->tag == Iex_Unop) {
      X86FpOp fpop = Xfp_INVALID;
      switch (e->Iex.Unop.op) {
         case Iop_NegF64:  fpop = Xfp_NEG; break;
         case Iop_AbsF64:  fpop = Xfp_ABS; break;
         default: break;
      }
      if (fpop != Xfp_INVALID) {
         HReg res = newVRegF(env);
         HReg src = iselDblExpr(env, e->Iex.Unop.arg);
         addInstr(env, X86Instr_FpUnary(fpop,src,res));
         if (fpop != Xfp_NEG && fpop != Xfp_ABS)
            roundToF64(env, res);
         return res;
      }
   }

   if (e->tag == Iex_Unop) {
      switch (e->Iex.Unop.op) {
         case Iop_I32StoF64: {
            HReg dst = newVRegF(env);
            HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
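            /* The conversion is exact -- F64's 53-bit mantissa can
               represent any I32 -- so the rounding mode cannot
               matter; establish the default anyway, for safety. */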
            set_FPU_rounding_default(env);
            addInstr(env, X86Instr_FpLdStI(
                             True/*load*/, 4, dst,
                             X86AMode_IR(0, hregX86_ESP())));
            add_to_esp(env, 4);
            return dst;
         }
         case Iop_ReinterpI64asF64: {
            /* Given an I64, produce an IEEE754 double with the same
               bit pattern. */
            HReg dst = newVRegF(env);
            HReg rHi, rLo;
            iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
            /* paranoia */
            set_FPU_rounding_default(env);
            addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
            addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
            addInstr(env, X86Instr_FpLdSt(
                             True/*load*/, 8, dst,
                             X86AMode_IR(0, hregX86_ESP())));
            add_to_esp(env, 8);
            return dst;
         }
         case Iop_F32toF64: {
            /* This is a no-op: on this host, F32 values are carried
               in x87 registers at full precision, so no code is
               needed to widen to F64. */
            HReg res = iselFltExpr(env, e->Iex.Unop.arg);
            return res;
         }
         default:
            break;
      }
   }

   /* --------- MULTIPLEX --------- */
   if (e->tag == Iex_Mux0X) {
     if (ty == Ity_F64
         && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
        X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
        HReg rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
        HReg r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
        HReg dst = newVRegF(env);
        addInstr(env, X86Instr_FpUnary(Xfp_MOV,rX,dst));
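        /* Test the low 8 bits of the condition; if they are zero,
           overwrite dst with expr0, otherwise keep exprX. */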
        addInstr(env, X86Instr_Test32(0xFF, r8));
        addInstr(env, X86Instr_FpCMov(Xcc_Z,r0,dst));
        return dst;
      }
   }

   ppIRExpr(e);
   vpanic("iselDblExpr_wrk");
}


/*---------------------------------------------------------*/
/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
/*---------------------------------------------------------*/

static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
{
   HReg r = iselVecExpr_wrk( env, e );
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcVec128);
   vassert(hregIsVirtual(r));
   return r;
}


/* DO NOT CALL THIS DIRECTLY */
static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
{

#  define REQUIRE_SSE1                                    \
      do { if (env->hwcaps == 0/*baseline, no sse*/)      \
              goto vec_fail;                              \
      } while (0)

#  define REQUIRE_SSE2                                    \
      do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2))  \
              goto vec_fail;                              \
      } while (0)

#  define SSE2_OR_ABOVE                                   \
       (env->hwcaps & VEX_HWCAPS_X86_SSE2)

   MatchInfo mi;
   Bool      arg1isEReg = False;
   X86SseOp  op = Xsse_INVALID;
   IRType    ty = typeOfIRExpr(env->type_env,e);
   vassert(e);
   vassert(ty == Ity_V128);

   REQUIRE_SSE1;

   if (e->tag == Iex_RdTmp) {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   if (e->tag == Iex_Get) {
      HReg dst = newVRegV(env);
      addInstr(env, X86Instr_SseLdSt(
                       True/*load*/,
                       dst,
                       X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
                    )
              );
      return dst;
   }

   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
      HReg      dst = newVRegV(env);
      X86AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
      addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
      return dst;
   }

   if (e->tag == Iex_Const) {
      HReg dst = newVRegV(env);
      vassert(e->Iex.Const.con->tag == Ico_V128);
      addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
      return dst;
   }

   if (e->tag == Iex_Unop) {

   if (SSE2_OR_ABOVE) {
      /* 64UtoV128(LDle:I64(addr)) */
      DECLARE_PATTERN(p_zwiden_load64);
      DEFINE_PATTERN(p_zwiden_load64,
                     unop(Iop_64UtoV128,
                          IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
      if (matchIRExpr(&mi, p_zwiden_load64, e)) {
         X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
         HReg dst = newVRegV(env);
         addInstr(env, X86Instr_SseLdzLO(8, dst, am));
         return dst;
      }
   }

   switch (e->Iex.Unop.op) {

      case Iop_NotV128: {
         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
         return do_sse_Not128(env, arg);
      }

      case Iop_CmpNEZ64x2: {
         /* We can use SSE2 instructions for this. */
         /* Ideally, we want to do a 64Ix2 comparison against zero of
            the operand.  Problem is no such insn exists.  Solution
            therefore is to do a 32Ix4 comparison instead, and bitwise-
            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
            let the not'd result of this initial comparison be a:b:c:d.
            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
            giving the required result.

            The required selection sequence is 2,3,0,1, which
            according to Intel's documentation means the pshufd
            literal value is 0xB1, that is,
            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
         */
         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
         HReg tmp  = newVRegV(env);
         HReg dst  = newVRegV(env);
         REQUIRE_SSE2;
         addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
         addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
         tmp = do_sse_Not128(env, tmp);
         addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
         addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
         return dst;
      }

      case Iop_CmpNEZ32x4: {
         /* Sigh, we have to generate lousy code since this has to
            work on SSE1 hosts */
         /* basically, the idea is: for each lane:
               movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
               sbbl %r, %r               (now %r = 1Sto32(CF))
               movl %r, lane
         */
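         /* Concretely: if a lane is zero, NEG leaves CF = 0 and SBB
            computes r - r - 0 = 0; if it is nonzero, NEG sets CF = 1
            and SBB computes r - r - 1 = 0xFFFFFFFF.  Each lane thus
            becomes all-zeroes or all-ones according to lane != 0. */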
         Int       i;
         X86AMode* am;
         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
         HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
         HReg      dst  = newVRegV(env);
         HReg      r32  = newVRegI(env);
         sub_from_esp(env, 16);
         addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
         for (i = 0; i < 4; i++) {
            am = X86AMode_IR(i*4, hregX86_ESP());
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
            addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
            addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
         }
         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
         add_to_esp(env, 16);
         return dst;
      }

      case Iop_CmpNEZ8x16:
      case Iop_CmpNEZ16x8: {
         /* We can use SSE2 instructions for this. */
         HReg arg;
         HReg vec0 = newVRegV(env);
         HReg vec1 = newVRegV(env);
         HReg dst  = newVRegV(env);
         X86SseOp cmpOp
            = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
                                             : Xsse_CMPEQ8;
         REQUIRE_SSE2;
         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
         addInstr(env, mk_vMOVsd_RR(vec0, vec1));
         addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
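         /* cmpeqps of an all-zeroes register against itself compares
            equal in every lane, so this sets vec1 to all ones. */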
         /* defer arg computation to here so as to give CMPEQF as long
            as possible to complete */
         arg = iselVecExpr(env, e->Iex.Unop.arg);
         /* vec0 is all 0s; vec1 is all 1s */
         addInstr(env, mk_vMOVsd_RR(arg, dst));
         /* 16x8 or 8x16 comparison == */
         addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
         /* invert result */
         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
         return dst;
      }

      case Iop_Recip32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
      case Iop_RSqrt32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
      case Iop_Sqrt32Fx4:  op = Xsse_SQRTF;  goto do_32Fx4_unary;
      do_32Fx4_unary:
      {
         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
         HReg dst = newVRegV(env);
         addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
         return dst;
      }

      case Iop_Recip64Fx2: op = Xsse_RCPF;   goto do_64Fx2_unary;
      case Iop_RSqrt64Fx2: op = Xsse_RSQRTF; goto do_64Fx2_unary;
      case Iop_Sqrt64Fx2:  op = Xsse_SQRTF;  goto do_64Fx2_unary;
      do_64Fx2_unary:
      {
         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
         HReg dst = newVRegV(env);
         REQUIRE_SSE2;
         addInstr(env, X86Instr_Sse64Fx2(op, arg, dst));
         return dst;
      }

      case Iop_Recip32F0x4: op = Xsse_RCPF;   goto do_32F0x4_unary;
      case Iop_RSqrt32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
      case Iop_Sqrt32F0x4:  op = Xsse_SQRTF;  goto do_32F0x4_unary;
      do_32F0x4_unary:
      {
         /* A bit subtle.  We have to copy the arg to the result
            register first, because actually doing the SSE scalar insn
            leaves the upper 3/4 of the destination register
            unchanged.  Whereas the required semantics of these
            primops is that the upper 3/4 is simply copied in from the
            argument. */
         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
         HReg dst = newVRegV(env);
         addInstr(env, mk_vMOVsd_RR(arg, dst));
         addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
         return dst;
      }

      case Iop_Recip64F0x2: op = Xsse_RCPF;   goto do_64F0x2_unary;
      case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
      case Iop_Sqrt64F0x2:  op = Xsse_SQRTF;  goto do_64F0x2_unary;
      do_64F0x2_unary:
      {
         /* A bit subtle.  We have to copy the arg to the result
            register first, because actually doing the SSE scalar insn
            leaves the upper half of the destination register
            unchanged.  Whereas the required semantics of these
            primops is that the upper half is simply copied in from the
            argument. */
         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
         HReg dst = newVRegV(env);
         REQUIRE_SSE2;
         addInstr(env, mk_vMOVsd_RR(arg, dst));
         addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
         return dst;
      }

      case Iop_32UtoV128: {
         HReg      dst  = newVRegV(env);
         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
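         /* Push the 32-bit value and reload it with a zero-extending
            low load, which clears the upper 96 bits of dst. */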
         addInstr(env, X86Instr_Push(rmi));
         addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
         add_to_esp(env, 4);
         return dst;
      }

      case Iop_64UtoV128: {
         HReg      rHi, rLo;
         HReg      dst  = newVRegV(env);
         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
         iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
         addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
         addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
         addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
         add_to_esp(env, 8);
         return dst;
      }

      default:
         break;
   } /* switch (e->Iex.Unop.op) */
   } /* if (e->tag == Iex_Unop) */

   if (e->tag == Iex_Binop) {
   switch (e->Iex.Binop.op) {

      case Iop_SetV128lo32: {
         HReg dst = newVRegV(env);
         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
         sub_from_esp(env, 16);
         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
         add_to_esp(env, 16);
         return dst;
      }

      case Iop_SetV128lo64: {
         HReg dst = newVRegV(env);
         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg srcIhi, srcIlo;
         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
         X86AMode* esp4 = advance4(esp0);
         iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
         sub_from_esp(env, 16);
         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
         add_to_esp(env, 16);
         return dst;
      }

      case Iop_64HLtoV128: {
         HReg r3, r2, r1, r0;
         X86AMode* esp0  = X86AMode_IR(0, hregX86_ESP());
         X86AMode* esp4  = advance4(esp0);
         X86AMode* esp8  = advance4(esp4);
         X86AMode* esp12 = advance4(esp8);
         HReg dst = newVRegV(env);
         /* do this via the stack (easy, convenient, etc) */
         sub_from_esp(env, 16);
         /* Do the less significant 64 bits */
         iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
         /* Do the more significant 64 bits */
         iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
         /* Fetch result back from stack. */
         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
         add_to_esp(env, 16);
         return dst;
      }

      case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
      case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
      case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
      case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
      case Iop_Add32Fx4:   op = Xsse_ADDF;   goto do_32Fx4;
      case Iop_Div32Fx4:   op = Xsse_DIVF;   goto do_32Fx4;
      case Iop_Max32Fx4:   op = Xsse_MAXF;   goto do_32Fx4;
      case Iop_Min32Fx4:   op = Xsse_MINF;   goto do_32Fx4;
      case Iop_Mul32Fx4:   op = Xsse_MULF;   goto do_32Fx4;
      case Iop_Sub32Fx4:   op = Xsse_SUBF;   goto do_32Fx4;
      do_32Fx4:
      {
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
         return dst;
      }

      case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
      case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
      case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
      case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
      case Iop_Add64Fx2:   op = Xsse_ADDF;   goto do_64Fx2;
      case Iop_Div64Fx2:   op = Xsse_DIVF;   goto do_64Fx2;
      case Iop_Max64Fx2:   op = Xsse_MAXF;   goto do_64Fx2;
      case Iop_Min64Fx2:   op = Xsse_MINF;   goto do_64Fx2;
      case Iop_Mul64Fx2:   op = Xsse_MULF;   goto do_64Fx2;
      case Iop_Sub64Fx2:   op = Xsse_SUBF;   goto do_64Fx2;
      do_64Fx2:
      {
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         REQUIRE_SSE2;
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
         return dst;
      }

      case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
      case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
      case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
      case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
      case Iop_Add32F0x4:   op = Xsse_ADDF;   goto do_32F0x4;
      case Iop_Div32F0x4:   op = Xsse_DIVF;   goto do_32F0x4;
      case Iop_Max32F0x4:   op = Xsse_MAXF;   goto do_32F0x4;
      case Iop_Min32F0x4:   op = Xsse_MINF;   goto do_32F0x4;
      case Iop_Mul32F0x4:   op = Xsse_MULF;   goto do_32F0x4;
      case Iop_Sub32F0x4:   op = Xsse_SUBF;   goto do_32F0x4;
      do_32F0x4: {
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
         return dst;
      }

      case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
      case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
      case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
      case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
      case Iop_Add64F0x2:   op = Xsse_ADDF;   goto do_64F0x2;
      case Iop_Div64F0x2:   op = Xsse_DIVF;   goto do_64F0x2;
      case Iop_Max64F0x2:   op = Xsse_MAXF;   goto do_64F0x2;
      case Iop_Min64F0x2:   op = Xsse_MINF;   goto do_64F0x2;
      case Iop_Mul64F0x2:   op = Xsse_MULF;   goto do_64F0x2;
      case Iop_Sub64F0x2:   op = Xsse_SUBF;   goto do_64F0x2;
      do_64F0x2: {
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         REQUIRE_SSE2;
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
         return dst;
      }

      case Iop_QNarrow32Sx4:
         op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
      case Iop_QNarrow16Sx8:
         op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
      case Iop_QNarrow16Ux8:
         op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;

      case Iop_InterleaveHI8x16:
         op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveHI16x8:
         op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveHI32x4:
         op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveHI64x2:
         op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;

      case Iop_InterleaveLO8x16:
         op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveLO16x8:
         op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveLO32x4:
         op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveLO64x2:
         op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;

      case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
      case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
      case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
      case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
      case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
      case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
      case Iop_Add64x2:    op = Xsse_ADD64;    goto do_SseReRg;
      case Iop_QAdd8Sx16:  op = Xsse_QADD8S;   goto do_SseReRg;
      case Iop_QAdd16Sx8:  op = Xsse_QADD16S;  goto do_SseReRg;
      case Iop_QAdd8Ux16:  op = Xsse_QADD8U;   goto do_SseReRg;
      case Iop_QAdd16Ux8:  op = Xsse_QADD16U;  goto do_SseReRg;
      case Iop_Avg8Ux16:   op = Xsse_AVG8U;    goto do_SseReRg;
      case Iop_Avg16Ux8:   op = Xsse_AVG16U;   goto do_SseReRg;
      case Iop_CmpEQ8x16:  op = Xsse_CMPEQ8;   goto do_SseReRg;
      case Iop_CmpEQ16x8:  op = Xsse_CMPEQ16;  goto do_SseReRg;
      case Iop_CmpEQ32x4:  op = Xsse_CMPEQ32;  goto do_SseReRg;
      case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S;  goto do_SseReRg;
      case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
      case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
      case Iop_Max16Sx8:   op = Xsse_MAX16S;   goto do_SseReRg;
      case Iop_Max8Ux16:   op = Xsse_MAX8U;    goto do_SseReRg;
      case Iop_Min16Sx8:   op = Xsse_MIN16S;   goto do_SseReRg;
      case Iop_Min8Ux16:   op = Xsse_MIN8U;    goto do_SseReRg;
      case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
      case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
      case Iop_Mul16x8:    op = Xsse_MUL16;    goto do_SseReRg;
      case Iop_Sub8x16:    op = Xsse_SUB8;     goto do_SseReRg;
      case Iop_Sub16x8:    op = Xsse_SUB16;    goto do_SseReRg;
      case Iop_Sub32x4:    op = Xsse_SUB32;    goto do_SseReRg;
      case Iop_Sub64x2:    op = Xsse_SUB64;    goto do_SseReRg;
      case Iop_QSub8Sx16:  op = Xsse_QSUB8S;   goto do_SseReRg;
      case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
      case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
      case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
      do_SseReRg: {
         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
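         /* The pack/interleave cases above set arg1isEReg: those ops
            are not commutative and need the first IR argument in the
            x86 source (E) operand position.  For the rest, arg2
            supplies the E operand. */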
         if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
            REQUIRE_SSE2;
         if (arg1isEReg) {
            addInstr(env, mk_vMOVsd_RR(arg2, dst));
            addInstr(env, X86Instr_SseReRg(op, arg1, dst));
         } else {
            addInstr(env, mk_vMOVsd_RR(arg1, dst));
            addInstr(env, X86Instr_SseReRg(op, arg2, dst));
         }
         return dst;
      }

      case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
      case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
      case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
      case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
      case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
      case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
      case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
      case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
      do_SseShift: {
         HReg      greg = iselVecExpr(env, e->Iex.Binop.arg1);
         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
         HReg      ereg = newVRegV(env);
         HReg      dst  = newVRegV(env);
         REQUIRE_SSE2;
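         /* The SSE2 vector shifts take the count from the low 64
            bits of an xmm register.  Build a 16-byte value on the
            stack -- the 32-bit count at 0(%esp), zeroes above it --
            and load that into ereg. */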
         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
         addInstr(env, X86Instr_Push(rmi));
         addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
         addInstr(env, mk_vMOVsd_RR(greg, dst));
         addInstr(env, X86Instr_SseReRg(op, ereg, dst));
         add_to_esp(env, 16);
         return dst;
      }

      default:
         break;
   } /* switch (e->Iex.Binop.op) */
   } /* if (e->tag == Iex_Binop) */

   if (e->tag == Iex_Mux0X) {
      X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
      HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
      HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
      HReg dst = newVRegV(env);
      addInstr(env, mk_vMOVsd_RR(rX,dst));
      addInstr(env, X86Instr_Test32(0xFF, r8));
      addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst));
      return dst;
   }

   vec_fail:
   vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
              LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
   ppIRExpr(e);
   vpanic("iselVecExpr_wrk");

#  undef REQUIRE_SSE1
#  undef REQUIRE_SSE2
#  undef SSE2_OR_ABOVE
}


/*---------------------------------------------------------*/
/*--- ISEL: Statements                                  ---*/
/*---------------------------------------------------------*/

static void iselStmt ( ISelEnv* env, IRStmt* stmt )
{
   if (vex_traceflags & VEX_TRACE_VCODE) {
      vex_printf("\n-- ");
      ppIRStmt(stmt);
      vex_printf("\n");
   }

   switch (stmt->tag) {

   /* --------- STORE --------- */
   case Ist_Store: {
      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
      IREndness end   = stmt->Ist.Store.end;

      if (tya != Ity_I32 || end != Iend_LE)
         goto stmt_fail;

      if (tyd == Ity_I32) {
         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
         addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
         return;
      }
      if (tyd == Ity_I8 || tyd == Ity_I16) {
         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
         addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
                                       r,am ));
         return;
      }
      if (tyd == Ity_F64) {
         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
         addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
         return;
      }
      if (tyd == Ity_F32) {
         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
         addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
         return;
      }
      if (tyd == Ity_I64) {
         HReg vHi, vLo, rA;
         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
         rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
         addInstr(env, X86Instr_Alu32M(
                          Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
         addInstr(env, X86Instr_Alu32M(
                          Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
         return;
      }
      if (tyd == Ity_V128) {
         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
         addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
         return;
      }
      break;
   }

   /* --------- PUT --------- */
   case Ist_Put: {
      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
      if (ty == Ity_I32) {
         /* We're going to write to memory, so compute the RHS into an
            X86RI. */
         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
         addInstr(env,
                  X86Instr_Alu32M(
                     Xalu_MOV,
                     ri,
                     X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
                 ));
         return;
      }
      if (ty == Ity_I8 || ty == Ity_I16) {
         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
         addInstr(env, X86Instr_Store(
                          toUChar(ty==Ity_I8 ? 1 : 2),
                          r,
                          X86AMode_IR(stmt->Ist.Put.offset,
                                      hregX86_EBP())));
         return;
      }
      if (ty == Ity_I64) {
         HReg vHi, vLo;
         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
         X86AMode* am4 = advance4(am);
         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
         return;
      }
      if (ty == Ity_V128) {
         HReg      vec = iselVecExpr(env, stmt->Ist.Put.data);
         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
         addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
         return;
      }
      if (ty == Ity_F32) {
         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
         set_FPU_rounding_default(env); /* paranoia */
         addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
         return;
      }
      if (ty == Ity_F64) {
         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
         set_FPU_rounding_default(env); /* paranoia */
         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
         return;
      }
      break;
   }

   /* --------- Indexed PUT --------- */
   case Ist_PutI: {
      X86AMode* am
         = genGuestArrayOffset(
              env, stmt->Ist.PutI.descr,
                   stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );

      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
      if (ty == Ity_F64) {
         HReg val = iselDblExpr(env, stmt->Ist.PutI.data);
         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
         return;
      }
      if (ty == Ity_I8) {
         HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
         addInstr(env, X86Instr_Store( 1, r, am ));
         return;
      }
      if (ty == Ity_I32) {
         HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
         return;
      }
      if (ty == Ity_I64) {
         HReg rHi, rLo;
         X86AMode* am4 = advance4(am);
         iselInt64Expr(&rHi, &rLo, env, stmt->Ist.PutI.data);
         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
         return;
      }
      break;
   }

   /* --------- TMP --------- */
   case Ist_WrTmp: {
      IRTemp tmp = stmt->Ist.WrTmp.tmp;
      IRType ty = typeOfIRTemp(env->type_env, tmp);

      /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
         compute it into an AMode and then use LEA.  This usually
         produces fewer instructions, often because (for memcheck
         created IR) we get t = address-expression, (t is later used
         twice) and so doing this naturally turns address-expression
         back into an X86 amode. */
      if (ty == Ity_I32
          && stmt->Ist.WrTmp.data->tag == Iex_Binop
          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
         HReg dst = lookupIRTemp(env, tmp);
         if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
            /* Hmm, iselIntExpr_AMode wimped out and just computed the
               value into a register.  Just emit a normal reg-reg move
               so reg-alloc can coalesce it away in the usual way. */
            HReg src = am->Xam.IR.reg;
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
         } else {
            addInstr(env, X86Instr_Lea32(am,dst));
         }
         return;
      }

      if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
         X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
         HReg dst = lookupIRTemp(env, tmp);
         addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
         return;
      }
      if (ty == Ity_I64) {
         HReg rHi, rLo, dstHi, dstLo;
         iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
         lookupIRTemp64( &dstHi, &dstLo, env, tmp);
         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
         return;
      }
      if (ty == Ity_I1) {
         X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
         HReg dst = lookupIRTemp(env, tmp);
         addInstr(env, X86Instr_Set32(cond, dst));
         return;
      }
      if (ty == Ity_F64) {
         HReg dst = lookupIRTemp(env, tmp);
         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
         return;
      }
      if (ty == Ity_F32) {
         HReg dst = lookupIRTemp(env, tmp);
         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
         return;
      }
      if (ty == Ity_V128) {
         HReg dst = lookupIRTemp(env, tmp);
         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
         addInstr(env, mk_vMOVsd_RR(src,dst));
         return;
      }
      break;
   }

   /* --------- Call to DIRTY helper --------- */
   case Ist_Dirty: {
      IRType   retty;
      IRDirty* d = stmt->Ist.Dirty.details;
      Bool     passBBP = False;

      if (d->nFxState == 0)
         vassert(!d->needsBBP);

      passBBP = toBool(d->nFxState > 0 && d->needsBBP);
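      /* passBBP: pass the guest state pointer as a hidden first
         argument, if the helper touches guest state and asked for
         it. */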

      /* Marshal args, do the call, clear stack. */
      doHelperCall( env, passBBP, d->guard, d->cee, d->args );

      /* Now figure out what to do with the returned value, if any. */
      if (d->tmp == IRTemp_INVALID)
         /* No return value.  Nothing to do. */
         return;

      retty = typeOfIRTemp(env->type_env, d->tmp);
      if (retty == Ity_I64) {
         HReg dstHi, dstLo;
         /* The returned value is in %edx:%eax.  Park it in the
            register-pair associated with tmp. */
         lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
         addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
         return;
      }
      if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
         /* The returned value is in %eax.  Park it in the register
            associated with tmp. */
         HReg dst = lookupIRTemp(env, d->tmp);
         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
         return;
      }
      break;
   }

   /* --------- MEM FENCE --------- */
   case Ist_MBE:
      switch (stmt->Ist.MBE.event) {
         case Imbe_Fence:
            addInstr(env, X86Instr_MFence(env->hwcaps));
            return;
         default:
            break;
      }
      break;

   /* --------- ACAS --------- */
   case Ist_CAS:
      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
         /* "normal" singleton CAS */
         UChar  sz;
         IRCAS* cas = stmt->Ist.CAS.details;
         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
         vassert(cas->expdHi == NULL);
         vassert(cas->dataHi == NULL);
         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
         switch (ty) {
            case Ity_I32: sz = 4; break;
            case Ity_I16: sz = 2; break;
            case Ity_I8:  sz = 1; break;
            default: goto unhandled_cas;
         }
         addInstr(env, X86Instr_ACAS(am, sz));
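         /* cmpxchg sets ZF if the swap happened.  On failure (ZF
            clear) the old memory value has been loaded into %eax;
            copy it into rOldLo, which otherwise already holds the
            expected value -- equal to the old one in that case. */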
         addInstr(env,
                  X86Instr_CMov32(Xcc_NZ,
                                  X86RM_Reg(hregX86_EAX()), rOldLo));
         return;
      } else {
         /* double CAS */
         IRCAS* cas = stmt->Ist.CAS.details;
         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
         /* only 32-bit allowed in this case */
         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
         /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
         if (ty != Ity_I32)
            goto unhandled_cas;
         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
         addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
         addInstr(env, X86Instr_DACAS(am));
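         /* As above: on failure (ZF clear), cmpxchg8b has loaded the
            old 64-bit memory value into %edx:%eax; move it into
            rOldHi:rOldLo. */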
         addInstr(env,
                  X86Instr_CMov32(Xcc_NZ,
                                  X86RM_Reg(hregX86_EDX()), rOldHi));
         addInstr(env,
                  X86Instr_CMov32(Xcc_NZ,
                                  X86RM_Reg(hregX86_EAX()), rOldLo));
         return;
      }
      unhandled_cas:
      break;

   /* --------- INSTR MARK --------- */
   /* Doesn't generate any executable code ... */
   case Ist_IMark:
       return;

   /* --------- NO-OP --------- */
   /* Fairly self-explanatory, wouldn't you say? */
   case Ist_NoOp:
       return;

   /* --------- EXIT --------- */
   case Ist_Exit: {
      X86RI*      dst;
      X86CondCode cc;
      if (stmt->Ist.Exit.dst->tag != Ico_U32)
         vpanic("isel_x86: Ist_Exit: dst is not a 32-bit value");
      dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
      cc  = iselCondCode(env,stmt->Ist.Exit.guard);
      addInstr(env, X86Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
      return;
   }

   default: break;
   }
  stmt_fail:
   ppIRStmt(stmt);
   vpanic("iselStmt");
}


/*---------------------------------------------------------*/
/*--- ISEL: Basic block terminators (Nexts)             ---*/
/*---------------------------------------------------------*/

static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
{
   X86RI* ri;
   if (vex_traceflags & VEX_TRACE_VCODE) {
      vex_printf("\n-- goto {");
      ppIRJumpKind(jk);
      vex_printf("} ");
      ppIRExpr(next);
      vex_printf("\n");
   }
   ri = iselIntExpr_RI(env, next);
   addInstr(env, X86Instr_Goto(jk, Xcc_ALWAYS,ri));
}


/*---------------------------------------------------------*/
/*--- Insn selector top-level                           ---*/
/*---------------------------------------------------------*/

/* Translate an entire SB to x86 code. */

HInstrArray* iselSB_X86 ( IRSB* bb, VexArch      arch_host,
                                    VexArchInfo* archinfo_host,
                                    VexAbiInfo*  vbi/*UNUSED*/ )
{
   Int      i, j;
   HReg     hreg, hregHI;
   ISelEnv* env;
   UInt     hwcaps_host = archinfo_host->hwcaps;

   /* sanity ... */
   vassert(arch_host == VexArchX86);
   vassert(0 == (hwcaps_host
                 & ~(VEX_HWCAPS_X86_SSE1
                     | VEX_HWCAPS_X86_SSE2
                     | VEX_HWCAPS_X86_SSE3
                     | VEX_HWCAPS_X86_LZCNT)));

   /* Make up an initial environment to use. */
   env = LibVEX_Alloc(sizeof(ISelEnv));
   env->vreg_ctr = 0;

   /* Set up output code array. */
   env->code = newHInstrArray();

   /* Copy BB's type env. */
   env->type_env = bb->tyenv;

   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
      change as we go along. */
   env->n_vregmap = bb->tyenv->types_used;
   env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
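   /* vregmap holds the vreg for each IRTemp; for 64-bit temps, which
      need a pair of 32-bit registers on this host, vregmapHI holds
      the vreg for the high half. */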

   /* and finally ... */
   env->hwcaps = hwcaps_host;

   /* For each IR temporary, allocate a suitably-kinded virtual
      register. */
   j = 0;
   for (i = 0; i < env->n_vregmap; i++) {
      hregHI = hreg = INVALID_HREG;
      switch (bb->tyenv->types[i]) {
         case Ity_I1:
         case Ity_I8:
         case Ity_I16:
         case Ity_I32:  hreg   = mkHReg(j++, HRcInt32, True); break;
         case Ity_I64:  hreg   = mkHReg(j++, HRcInt32, True);
                        hregHI = mkHReg(j++, HRcInt32, True); break;
         case Ity_F32:
         case Ity_F64:  hreg   = mkHReg(j++, HRcFlt64, True); break;
         case Ity_V128: hreg   = mkHReg(j++, HRcVec128, True); break;
         default: ppIRType(bb->tyenv->types[i]);
                  vpanic("iselBB: IRTemp type");
      }
      env->vregmap[i]   = hreg;
      env->vregmapHI[i] = hregHI;
   }
   env->vreg_ctr = j;

   /* Ok, finally we can iterate over the statements. */
   for (i = 0; i < bb->stmts_used; i++)
      iselStmt(env,bb->stmts[i]);

   iselNext(env,bb->next,bb->jumpkind);

   /* record the number of vregs we used. */
   env->code->n_vregs = env->vreg_ctr;
   return env->code;
}


/*---------------------------------------------------------------*/
/*--- end                                     host_x86_isel.c ---*/
/*---------------------------------------------------------------*/