1
2/*---------------------------------------------------------------*/
3/*--- begin                                   host_x86_isel.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2017 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_ir.h"
38#include "libvex.h"
39
40#include "ir_match.h"
41#include "main_util.h"
42#include "main_globals.h"
43#include "host_generic_regs.h"
44#include "host_generic_simd64.h"
45#include "host_generic_simd128.h"
46#include "host_x86_defs.h"
47
48/* TODO 21 Apr 2005:
49
50   -- (Really an assembler issue) don't emit CMov32 as a cmov
51      insn, since that's expensive on P4 and conditional branch
52      is cheaper if (as we expect) the condition is highly predictable
53
54   -- preserve xmm registers across function calls (by declaring them
55      as trashed by call insns)
56
57   -- preserve x87 ST stack discipline across function calls.  Sigh.
58
59   -- Check doHelperCall: if a call is conditional, we cannot safely
60      compute any regparm args directly to registers.  Hence, the
61      fast-regparm marshalling should be restricted to unconditional
62      calls only.
63*/
64
65/*---------------------------------------------------------*/
66/*--- x87 control word stuff                            ---*/
67/*---------------------------------------------------------*/
68
69/* Vex-generated code expects to run with the FPU set as follows: all
70   exceptions masked, round-to-nearest, precision = 53 bits.  This
71   corresponds to a FPU control word value of 0x027F.
72
73   Similarly the SSE control word (%mxcsr) should be 0x1F80.
74
75   %fpucw and %mxcsr should have these values on entry to
76   Vex-generated code, and should those values should be
77   unchanged at exit.
78*/
79
80#define DEFAULT_FPUCW 0x027F
81
82/* debugging only, do not use */
83/* define DEFAULT_FPUCW 0x037F */
84
85
86/*---------------------------------------------------------*/
87/*--- misc helpers                                      ---*/
88/*---------------------------------------------------------*/
89
90/* These are duplicated in guest-x86/toIR.c */
91static IRExpr* unop ( IROp op, IRExpr* a )
92{
93   return IRExpr_Unop(op, a);
94}
95
96static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
97{
98   return IRExpr_Binop(op, a1, a2);
99}
100
101static IRExpr* bind ( Int binder )
102{
103   return IRExpr_Binder(binder);
104}
105
106static Bool isZeroU8 ( IRExpr* e )
107{
108   return e->tag == Iex_Const
109          && e->Iex.Const.con->tag == Ico_U8
110          && e->Iex.Const.con->Ico.U8 == 0;
111}
112
113static Bool isZeroU32 ( IRExpr* e )
114{
115   return e->tag == Iex_Const
116          && e->Iex.Const.con->tag == Ico_U32
117          && e->Iex.Const.con->Ico.U32 == 0;
118}
119
120//static Bool isZeroU64 ( IRExpr* e )
121//{
122//   return e->tag == Iex_Const
123//          && e->Iex.Const.con->tag == Ico_U64
124//          && e->Iex.Const.con->Ico.U64 == 0ULL;
125//}
126
127
128/*---------------------------------------------------------*/
129/*--- ISelEnv                                           ---*/
130/*---------------------------------------------------------*/
131
132/* This carries around:
133
134   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
135     might encounter.  This is computed before insn selection starts,
136     and does not change.
137
138   - A mapping from IRTemp to HReg.  This tells the insn selector
139     which virtual register(s) are associated with each IRTemp
140     temporary.  This is computed before insn selection starts, and
141     does not change.  We expect this mapping to map precisely the
142     same set of IRTemps as the type mapping does.
143
144        - vregmap   holds the primary register for the IRTemp.
145        - vregmapHI is only used for 64-bit integer-typed
146             IRTemps.  It holds the identity of a second
147             32-bit virtual HReg, which holds the high half
148             of the value.
149
150   - The code array, that is, the insns selected so far.
151
152   - A counter, for generating new virtual registers.
153
154   - The host subarchitecture we are selecting insns for.
155     This is set at the start and does not change.
156
157   - A Bool for indicating whether we may generate chain-me
158     instructions for control flow transfers, or whether we must use
159     XAssisted.
160
161   - The maximum guest address of any guest insn in this block.
162     Actually, the address of the highest-addressed byte from any insn
163     in this block.  Is set at the start and does not change.  This is
164     used for detecting jumps which are definitely forward-edges from
165     this block, and therefore can be made (chained) to the fast entry
166     point of the destination, thereby avoiding the destination's
167     event check.
168
169   Note, this is all (well, mostly) host-independent.
170*/
171
172typedef
173   struct {
174      /* Constant -- are set at the start and do not change. */
175      IRTypeEnv*   type_env;
176
177      HReg*        vregmap;
178      HReg*        vregmapHI;
179      Int          n_vregmap;
180
181      UInt         hwcaps;
182
183      Bool         chainingAllowed;
184      Addr32       max_ga;
185
186      /* These are modified as we go along. */
187      HInstrArray* code;
188      Int          vreg_ctr;
189   }
190   ISelEnv;
191
192
193static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
194{
195   vassert(tmp >= 0);
196   vassert(tmp < env->n_vregmap);
197   return env->vregmap[tmp];
198}
199
200static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
201{
202   vassert(tmp >= 0);
203   vassert(tmp < env->n_vregmap);
204   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
205   *vrLO = env->vregmap[tmp];
206   *vrHI = env->vregmapHI[tmp];
207}
208
209static void addInstr ( ISelEnv* env, X86Instr* instr )
210{
211   addHInstr(env->code, instr);
212   if (vex_traceflags & VEX_TRACE_VCODE) {
213      ppX86Instr(instr, False);
214      vex_printf("\n");
215   }
216}
217
218static HReg newVRegI ( ISelEnv* env )
219{
220   HReg reg = mkHReg(True/*virtual reg*/, HRcInt32, 0/*enc*/, env->vreg_ctr);
221   env->vreg_ctr++;
222   return reg;
223}
224
225static HReg newVRegF ( ISelEnv* env )
226{
227   HReg reg = mkHReg(True/*virtual reg*/, HRcFlt64, 0/*enc*/, env->vreg_ctr);
228   env->vreg_ctr++;
229   return reg;
230}
231
232static HReg newVRegV ( ISelEnv* env )
233{
234   HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
235   env->vreg_ctr++;
236   return reg;
237}
238
239
240/*---------------------------------------------------------*/
241/*--- ISEL: Forward declarations                        ---*/
242/*---------------------------------------------------------*/
243
244/* These are organised as iselXXX and iselXXX_wrk pairs.  The
245   iselXXX_wrk do the real work, but are not to be called directly.
246   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
247   checks that all returned registers are virtual.  You should not
248   call the _wrk version directly.
249*/
250static X86RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
251static X86RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );
252
253static X86RI*      iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e );
254static X86RI*      iselIntExpr_RI     ( ISelEnv* env, const IRExpr* e );
255
256static X86RM*      iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e );
257static X86RM*      iselIntExpr_RM     ( ISelEnv* env, const IRExpr* e );
258
259static HReg        iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e );
260static HReg        iselIntExpr_R     ( ISelEnv* env, const IRExpr* e );
261
262static X86AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
263static X86AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );
264
265static void        iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
266                                       ISelEnv* env, const IRExpr* e );
267static void        iselInt64Expr     ( HReg* rHi, HReg* rLo,
268                                       ISelEnv* env, const IRExpr* e );
269
270static X86CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e );
271static X86CondCode iselCondCode     ( ISelEnv* env, const IRExpr* e );
272
273static HReg        iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
274static HReg        iselDblExpr     ( ISelEnv* env, const IRExpr* e );
275
276static HReg        iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
277static HReg        iselFltExpr     ( ISelEnv* env, const IRExpr* e );
278
279static HReg        iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
280static HReg        iselVecExpr     ( ISelEnv* env, const IRExpr* e );
281
282
283/*---------------------------------------------------------*/
284/*--- ISEL: Misc helpers                                ---*/
285/*---------------------------------------------------------*/
286
287/* Make a int reg-reg move. */
288
289static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
290{
291   vassert(hregClass(src) == HRcInt32);
292   vassert(hregClass(dst) == HRcInt32);
293   return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
294}
295
296
297/* Make a vector reg-reg move. */
298
299static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
300{
301   vassert(hregClass(src) == HRcVec128);
302   vassert(hregClass(dst) == HRcVec128);
303   return X86Instr_SseReRg(Xsse_MOV, src, dst);
304}
305
306/* Advance/retreat %esp by n. */
307
308static void add_to_esp ( ISelEnv* env, Int n )
309{
310   vassert(n > 0 && n < 256 && (n%4) == 0);
311   addInstr(env,
312            X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
313}
314
315static void sub_from_esp ( ISelEnv* env, Int n )
316{
317   vassert(n > 0 && n < 256 && (n%4) == 0);
318   addInstr(env,
319            X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
320}
321
322
323/* Given an amode, return one which references 4 bytes further
324   along. */
325
326static X86AMode* advance4 ( X86AMode* am )
327{
328   X86AMode* am4 = dopyX86AMode(am);
329   switch (am4->tag) {
330      case Xam_IRRS:
331         am4->Xam.IRRS.imm += 4; break;
332      case Xam_IR:
333         am4->Xam.IR.imm += 4; break;
334      default:
335         vpanic("advance4(x86,host)");
336   }
337   return am4;
338}
339
340
341/* Push an arg onto the host stack, in preparation for a call to a
342   helper function of some kind.  Returns the number of 32-bit words
343   pushed.  If we encounter an IRExpr_VECRET() then we expect that
344   r_vecRetAddr will be a valid register, that holds the relevant
345   address.
346*/
347static Int pushArg ( ISelEnv* env, IRExpr* arg, HReg r_vecRetAddr )
348{
349   if (UNLIKELY(arg->tag == Iex_VECRET)) {
350      vassert(0); //ATC
351      vassert(!hregIsInvalid(r_vecRetAddr));
352      addInstr(env, X86Instr_Push(X86RMI_Reg(r_vecRetAddr)));
353      return 1;
354   }
355   if (UNLIKELY(arg->tag == Iex_GSPTR)) {
356      addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
357      return 1;
358   }
359   /* Else it's a "normal" expression. */
360   IRType arg_ty = typeOfIRExpr(env->type_env, arg);
361   if (arg_ty == Ity_I32) {
362      addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
363      return 1;
364   } else
365   if (arg_ty == Ity_I64) {
366      HReg rHi, rLo;
367      iselInt64Expr(&rHi, &rLo, env, arg);
368      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
369      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
370      return 2;
371   }
372   ppIRExpr(arg);
373   vpanic("pushArg(x86): can't handle arg of this type");
374}
375
376
377/* Complete the call to a helper function, by calling the
378   helper and clearing the args off the stack. */
379
380static
381void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
382                              IRCallee* cee, Int n_arg_ws,
383                              RetLoc rloc )
384{
385   /* Complication.  Need to decide which reg to use as the fn address
386      pointer, in a way that doesn't trash regparm-passed
387      parameters. */
388   vassert(sizeof(void*) == 4);
389
390   addInstr(env, X86Instr_Call( cc, (Addr)cee->addr,
391                                cee->regparms, rloc));
392   if (n_arg_ws > 0)
393      add_to_esp(env, 4*n_arg_ws);
394}
395
396
397/* Used only in doHelperCall.  See big comment in doHelperCall re
398   handling of regparm args.  This function figures out whether
399   evaluation of an expression might require use of a fixed register.
400   If in doubt return True (safe but suboptimal).
401*/
402static
403Bool mightRequireFixedRegs ( IRExpr* e )
404{
405   if (UNLIKELY(is_IRExpr_VECRET_or_GSPTR(e))) {
406      // These are always "safe" -- either a copy of %esp in some
407      // arbitrary vreg, or a copy of %ebp, respectively.
408      return False;
409   }
410   /* Else it's a "normal" expression. */
411   switch (e->tag) {
412      case Iex_RdTmp: case Iex_Const: case Iex_Get:
413         return False;
414      default:
415         return True;
416   }
417}
418
419
420/* Do a complete function call.  |guard| is a Ity_Bit expression
421   indicating whether or not the call happens.  If guard==NULL, the
422   call is unconditional.  |retloc| is set to indicate where the
423   return value is after the call.  The caller (of this fn) must
424   generate code to add |stackAdjustAfterCall| to the stack pointer
425   after the call is done. */
426
427static
428void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
429                    /*OUT*/RetLoc* retloc,
430                    ISelEnv* env,
431                    IRExpr* guard,
432                    IRCallee* cee, IRType retTy, IRExpr** args )
433{
434   X86CondCode cc;
435   HReg        argregs[3];
436   HReg        tmpregs[3];
437   Bool        danger;
438   Int         not_done_yet, n_args, n_arg_ws, stack_limit,
439               i, argreg, argregX;
440
441   /* Set default returns.  We'll update them later if needed. */
442   *stackAdjustAfterCall = 0;
443   *retloc               = mk_RetLoc_INVALID();
444
445   /* These are used for cross-checking that IR-level constraints on
446      the use of Iex_VECRET and Iex_GSPTR are observed. */
447   UInt nVECRETs = 0;
448   UInt nGSPTRs  = 0;
449
450   /* Marshal args for a call, do the call, and clear the stack.
451      Complexities to consider:
452
453      * The return type can be I{64,32,16,8} or V128.  In the V128
454        case, it is expected that |args| will contain the special
455        node IRExpr_VECRET(), in which case this routine generates
456        code to allocate space on the stack for the vector return
457        value.  Since we are not passing any scalars on the stack, it
458        is enough to preallocate the return space before marshalling
459        any arguments, in this case.
460
461        |args| may also contain IRExpr_GSPTR(), in which case the
462        value in %ebp is passed as the corresponding argument.
463
464      * If the callee claims regparmness of 1, 2 or 3, we must pass the
465        first 1, 2 or 3 args in registers (EAX, EDX, and ECX
466        respectively).  To keep things relatively simple, only args of
467        type I32 may be passed as regparms -- just bomb out if anything
468        else turns up.  Clearly this depends on the front ends not
469        trying to pass any other types as regparms.
470   */
471
472   /* 16 Nov 2004: the regparm handling is complicated by the
473      following problem.
474
475      Consider a call two a function with two regparm parameters:
476      f(e1,e2).  We need to compute e1 into %eax and e2 into %edx.
477      Suppose code is first generated to compute e1 into %eax.  Then,
478      code is generated to compute e2 into %edx.  Unfortunately, if
479      the latter code sequence uses %eax, it will trash the value of
480      e1 computed by the former sequence.  This could happen if (for
481      example) e2 itself involved a function call.  In the code below,
482      args are evaluated right-to-left, not left-to-right, but the
483      principle and the problem are the same.
484
485      One solution is to compute all regparm-bound args into vregs
486      first, and once they are all done, move them to the relevant
487      real regs.  This always gives correct code, but it also gives
488      a bunch of vreg-to-rreg moves which are usually redundant but
489      are hard for the register allocator to get rid of.
490
491      A compromise is to first examine all regparm'd argument
492      expressions.  If they are all so simple that it is clear
493      they will be evaluated without use of any fixed registers,
494      use the old compute-directly-to-fixed-target scheme.  If not,
495      be safe and use the via-vregs scheme.
496
497      Note this requires being able to examine an expression and
498      determine whether or not evaluation of it might use a fixed
499      register.  That requires knowledge of how the rest of this
500      insn selector works.  Currently just the following 3 are
501      regarded as safe -- hopefully they cover the majority of
502      arguments in practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
503   */
504   vassert(cee->regparms >= 0 && cee->regparms <= 3);
505
506   /* Count the number of args and also the VECRETs */
507   n_args = n_arg_ws = 0;
508   while (args[n_args]) {
509      IRExpr* arg = args[n_args];
510      n_args++;
511      if (UNLIKELY(arg->tag == Iex_VECRET)) {
512         nVECRETs++;
513      } else if (UNLIKELY(arg->tag == Iex_GSPTR)) {
514         nGSPTRs++;
515      }
516   }
517
518   /* If this fails, the IR is ill-formed */
519   vassert(nGSPTRs == 0 || nGSPTRs == 1);
520
521   /* If we have a VECRET, allocate space on the stack for the return
522      value, and record the stack pointer after that. */
523   HReg r_vecRetAddr = INVALID_HREG;
524   if (nVECRETs == 1) {
525      vassert(retTy == Ity_V128 || retTy == Ity_V256);
526      vassert(retTy != Ity_V256); // we don't handle that yet (if ever)
527      r_vecRetAddr = newVRegI(env);
528      sub_from_esp(env, 16);
529      addInstr(env, mk_iMOVsd_RR( hregX86_ESP(), r_vecRetAddr ));
530   } else {
531      // If either of these fail, the IR is ill-formed
532      vassert(retTy != Ity_V128 && retTy != Ity_V256);
533      vassert(nVECRETs == 0);
534   }
535
536   not_done_yet = n_args;
537
538   stack_limit = cee->regparms;
539
540   /* ------ BEGIN marshall all arguments ------ */
541
542   /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
543   for (i = n_args-1; i >= stack_limit; i--) {
544      n_arg_ws += pushArg(env, args[i], r_vecRetAddr);
545      not_done_yet--;
546   }
547
548   /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
549      registers. */
550
551   if (cee->regparms > 0) {
552
553      /* ------ BEGIN deal with regparms ------ */
554
555      /* deal with regparms, not forgetting %ebp if needed. */
556      argregs[0] = hregX86_EAX();
557      argregs[1] = hregX86_EDX();
558      argregs[2] = hregX86_ECX();
559      tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;
560
561      argreg = cee->regparms;
562
563      /* In keeping with big comment above, detect potential danger
564         and use the via-vregs scheme if needed. */
565      danger = False;
566      for (i = stack_limit-1; i >= 0; i--) {
567         if (mightRequireFixedRegs(args[i])) {
568            danger = True;
569            break;
570         }
571      }
572
573      if (danger) {
574
575         /* Move via temporaries */
576         argregX = argreg;
577         for (i = stack_limit-1; i >= 0; i--) {
578
579            if (0) {
580               vex_printf("x86 host: register param is complex: ");
581               ppIRExpr(args[i]);
582               vex_printf("\n");
583            }
584
585            IRExpr* arg = args[i];
586            argreg--;
587            vassert(argreg >= 0);
588            if (UNLIKELY(arg->tag == Iex_VECRET)) {
589               vassert(0); //ATC
590            }
591            else if (UNLIKELY(arg->tag == Iex_GSPTR)) {
592               vassert(0); //ATC
593            } else {
594               vassert(typeOfIRExpr(env->type_env, arg) == Ity_I32);
595               tmpregs[argreg] = iselIntExpr_R(env, arg);
596            }
597            not_done_yet--;
598         }
599         for (i = stack_limit-1; i >= 0; i--) {
600            argregX--;
601            vassert(argregX >= 0);
602            addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
603         }
604
605      } else {
606         /* It's safe to compute all regparm args directly into their
607            target registers. */
608         for (i = stack_limit-1; i >= 0; i--) {
609            IRExpr* arg = args[i];
610            argreg--;
611            vassert(argreg >= 0);
612            if (UNLIKELY(arg->tag == Iex_VECRET)) {
613               vassert(!hregIsInvalid(r_vecRetAddr));
614               addInstr(env, X86Instr_Alu32R(Xalu_MOV,
615                                             X86RMI_Reg(r_vecRetAddr),
616                                             argregs[argreg]));
617            }
618            else if (UNLIKELY(arg->tag == Iex_GSPTR)) {
619               vassert(0); //ATC
620            } else {
621               vassert(typeOfIRExpr(env->type_env, arg) == Ity_I32);
622               addInstr(env, X86Instr_Alu32R(Xalu_MOV,
623                                             iselIntExpr_RMI(env, arg),
624                                             argregs[argreg]));
625            }
626            not_done_yet--;
627         }
628
629      }
630
631      /* ------ END deal with regparms ------ */
632
633   }
634
635   vassert(not_done_yet == 0);
636
637   /* ------ END marshall all arguments ------ */
638
639   /* Now we can compute the condition.  We can't do it earlier
640      because the argument computations could trash the condition
641      codes.  Be a bit clever to handle the common case where the
642      guard is 1:Bit. */
643   cc = Xcc_ALWAYS;
644   if (guard) {
645      if (guard->tag == Iex_Const
646          && guard->Iex.Const.con->tag == Ico_U1
647          && guard->Iex.Const.con->Ico.U1 == True) {
648         /* unconditional -- do nothing */
649      } else {
650         cc = iselCondCode( env, guard );
651      }
652   }
653
654   /* Do final checks, set the return values, and generate the call
655      instruction proper. */
656   vassert(*stackAdjustAfterCall == 0);
657   vassert(is_RetLoc_INVALID(*retloc));
658   switch (retTy) {
659         case Ity_INVALID:
660            /* Function doesn't return a value. */
661            *retloc = mk_RetLoc_simple(RLPri_None);
662            break;
663         case Ity_I64:
664            *retloc = mk_RetLoc_simple(RLPri_2Int);
665            break;
666         case Ity_I32: case Ity_I16: case Ity_I8:
667            *retloc = mk_RetLoc_simple(RLPri_Int);
668            break;
669         case Ity_V128:
670            *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
671            *stackAdjustAfterCall = 16;
672            break;
673         case Ity_V256:
674            vassert(0); // ATC
675            *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
676            *stackAdjustAfterCall = 32;
677            break;
678         default:
679            /* IR can denote other possible return types, but we don't
680               handle those here. */
681           vassert(0);
682   }
683
684   /* Finally, generate the call itself.  This needs the *retloc value
685      set in the switch above, which is why it's at the end. */
686   callHelperAndClearArgs( env, cc, cee, n_arg_ws, *retloc );
687}
688
689
690/* Given a guest-state array descriptor, an index expression and a
691   bias, generate an X86AMode holding the relevant guest state
692   offset. */
693
694static
695X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
696                                IRExpr* off, Int bias )
697{
698   HReg tmp, roff;
699   Int  elemSz = sizeofIRType(descr->elemTy);
700   Int  nElems = descr->nElems;
701   Int  shift  = 0;
702
703   /* throw out any cases not generated by an x86 front end.  In
704      theory there might be a day where we need to handle them -- if
705      we ever run non-x86-guest on x86 host. */
706
707   if (nElems != 8)
708      vpanic("genGuestArrayOffset(x86 host)(1)");
709
710   switch (elemSz) {
711      case 1:  shift = 0; break;
712      case 4:  shift = 2; break;
713      case 8:  shift = 3; break;
714      default: vpanic("genGuestArrayOffset(x86 host)(2)");
715   }
716
717   /* Compute off into a reg, %off.  Then return:
718
719         movl %off, %tmp
720         addl $bias, %tmp  (if bias != 0)
721         andl %tmp, 7
722         ... base(%ebp, %tmp, shift) ...
723   */
724   tmp  = newVRegI(env);
725   roff = iselIntExpr_R(env, off);
726   addInstr(env, mk_iMOVsd_RR(roff, tmp));
727   if (bias != 0) {
728      addInstr(env,
729               X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
730   }
731   addInstr(env,
732            X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
733   return
734      X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
735}
736
737
738/* Mess with the FPU's rounding mode: set to the default rounding mode
739   (DEFAULT_FPUCW). */
740static
741void set_FPU_rounding_default ( ISelEnv* env )
742{
743   /* pushl $DEFAULT_FPUCW
744      fldcw 0(%esp)
745      addl $4, %esp
746   */
747   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
748   addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
749   addInstr(env, X86Instr_FpLdCW(zero_esp));
750   add_to_esp(env, 4);
751}
752
753
754/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
755   expression denoting a value in the range 0 .. 3, indicating a round
756   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
757   the same rounding.
758*/
759static
760void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
761{
762   HReg rrm  = iselIntExpr_R(env, mode);
763   HReg rrm2 = newVRegI(env);
764   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
765
766   /* movl  %rrm, %rrm2
767      andl  $3, %rrm2   -- shouldn't be needed; paranoia
768      shll  $10, %rrm2
769      orl   $DEFAULT_FPUCW, %rrm2
770      pushl %rrm2
771      fldcw 0(%esp)
772      addl  $4, %esp
773   */
774   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
775   addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
776   addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
777   addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
778   addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
779   addInstr(env, X86Instr_FpLdCW(zero_esp));
780   add_to_esp(env, 4);
781}
782
783
784/* Generate !src into a new vector register, and be sure that the code
785   is SSE1 compatible.  Amazing that Intel doesn't offer a less crappy
786   way to do this.
787*/
788static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
789{
790   HReg dst = newVRegV(env);
791   /* Set dst to zero.  If dst contains a NaN then all hell might
792      break loose after the comparison.  So, first zero it. */
793   addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
794   /* And now make it all 1s ... */
795   addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
796   /* Finally, xor 'src' into it. */
797   addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
798   /* Doesn't that just totally suck? */
799   return dst;
800}
801
802
803/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
804   after most non-simple FPU operations (simple = +, -, *, / and
805   sqrt).
806
807   This could be done a lot more efficiently if needed, by loading
808   zero and adding it to the value to be rounded (fldz ; faddp?).
809*/
810static void roundToF64 ( ISelEnv* env, HReg reg )
811{
812   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
813   sub_from_esp(env, 8);
814   addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
815   addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
816   add_to_esp(env, 8);
817}
818
819
820/*---------------------------------------------------------*/
821/*--- ISEL: Integer expressions (32/16/8 bit)           ---*/
822/*---------------------------------------------------------*/
823
824/* Select insns for an integer-typed expression, and add them to the
825   code list.  Return a reg holding the result.  This reg will be a
826   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
827   want to modify it, ask for a new vreg, copy it in there, and modify
828   the copy.  The register allocator will do its best to map both
829   vregs to the same real register, so the copies will often disappear
830   later in the game.
831
832   This should handle expressions of 32, 16 and 8-bit type.  All
833   results are returned in a 32-bit register.  For 16- and 8-bit
834   expressions, the upper 16/24 bits are arbitrary, so you should mask
835   or sign extend partial values if necessary.
836*/
837
838static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
839{
840   HReg r = iselIntExpr_R_wrk(env, e);
841   /* sanity checks ... */
842#  if 0
843   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
844#  endif
845   vassert(hregClass(r) == HRcInt32);
846   vassert(hregIsVirtual(r));
847   return r;
848}
849
850/* DO NOT CALL THIS DIRECTLY ! */
851static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
852{
853   MatchInfo mi;
854
855   IRType ty = typeOfIRExpr(env->type_env,e);
856   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
857
858   switch (e->tag) {
859
860   /* --------- TEMP --------- */
861   case Iex_RdTmp: {
862      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
863   }
864
865   /* --------- LOAD --------- */
866   case Iex_Load: {
867      HReg dst = newVRegI(env);
868      X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
869
870      /* We can't handle big-endian loads, nor load-linked. */
871      if (e->Iex.Load.end != Iend_LE)
872         goto irreducible;
873
874      if (ty == Ity_I32) {
875         addInstr(env, X86Instr_Alu32R(Xalu_MOV,
876                                       X86RMI_Mem(amode), dst) );
877         return dst;
878      }
879      if (ty == Ity_I16) {
880         addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
881         return dst;
882      }
883      if (ty == Ity_I8) {
884         addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
885         return dst;
886      }
887      break;
888   }
889
890   /* --------- TERNARY OP --------- */
891   case Iex_Triop: {
892      IRTriop *triop = e->Iex.Triop.details;
893      /* C3210 flags following FPU partial remainder (fprem), both
894         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
895      if (triop->op == Iop_PRemC3210F64
896          || triop->op == Iop_PRem1C3210F64) {
897         HReg junk = newVRegF(env);
898         HReg dst  = newVRegI(env);
899         HReg srcL = iselDblExpr(env, triop->arg2);
900         HReg srcR = iselDblExpr(env, triop->arg3);
901         /* XXXROUNDINGFIXME */
902         /* set roundingmode here */
903         addInstr(env, X86Instr_FpBinary(
904                           e->Iex.Binop.op==Iop_PRemC3210F64
905                              ? Xfp_PREM : Xfp_PREM1,
906                           srcL,srcR,junk
907                 ));
908         /* The previous pseudo-insn will have left the FPU's C3210
909            flags set correctly.  So bag them. */
910         addInstr(env, X86Instr_FpStSW_AX());
911         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
912         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
913         return dst;
914      }
915
916      break;
917   }
918
919   /* --------- BINARY OP --------- */
920   case Iex_Binop: {
921      X86AluOp   aluOp;
922      X86ShiftOp shOp;
923
924      /* Pattern: Sub32(0,x) */
925      if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
926         HReg dst = newVRegI(env);
927         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
928         addInstr(env, mk_iMOVsd_RR(reg,dst));
929         addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
930         return dst;
931      }
932
933      /* Is it an addition or logical style op? */
934      switch (e->Iex.Binop.op) {
935         case Iop_Add8: case Iop_Add16: case Iop_Add32:
936            aluOp = Xalu_ADD; break;
937         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
938            aluOp = Xalu_SUB; break;
939         case Iop_And8: case Iop_And16: case Iop_And32:
940            aluOp = Xalu_AND; break;
941         case Iop_Or8: case Iop_Or16: case Iop_Or32:
942            aluOp = Xalu_OR; break;
943         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
944            aluOp = Xalu_XOR; break;
945         case Iop_Mul16: case Iop_Mul32:
946            aluOp = Xalu_MUL; break;
947         default:
948            aluOp = Xalu_INVALID; break;
949      }
950      /* For commutative ops we assume any literal
951         values are on the second operand. */
952      if (aluOp != Xalu_INVALID) {
953         HReg dst    = newVRegI(env);
954         HReg reg    = iselIntExpr_R(env, e->Iex.Binop.arg1);
955         X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
956         addInstr(env, mk_iMOVsd_RR(reg,dst));
957         addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
958         return dst;
959      }
960      /* Could do better here; forcing the first arg into a reg
961         isn't always clever.
962         -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
963                        LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
964                        t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
965            movl 0xFFFFFFA0(%vr41),%vr107
966            movl 0xFFFFFFA4(%vr41),%vr108
967            movl %vr107,%vr106
968            xorl %vr108,%vr106
969            movl 0xFFFFFFA8(%vr41),%vr109
970            movl %vr106,%vr105
971            andl %vr109,%vr105
972            movl 0xFFFFFFA0(%vr41),%vr110
973            movl %vr105,%vr104
974            xorl %vr110,%vr104
975            movl %vr104,%vr70
976      */
977
978      /* Perhaps a shift op? */
979      switch (e->Iex.Binop.op) {
980         case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
981            shOp = Xsh_SHL; break;
982         case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
983            shOp = Xsh_SHR; break;
984         case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
985            shOp = Xsh_SAR; break;
986         default:
987            shOp = Xsh_INVALID; break;
988      }
989      if (shOp != Xsh_INVALID) {
990         HReg dst = newVRegI(env);
991
992         /* regL = the value to be shifted */
993         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
994         addInstr(env, mk_iMOVsd_RR(regL,dst));
995
996         /* Do any necessary widening for 16/8 bit operands */
997         switch (e->Iex.Binop.op) {
998            case Iop_Shr8:
999               addInstr(env, X86Instr_Alu32R(
1000                                Xalu_AND, X86RMI_Imm(0xFF), dst));
1001               break;
1002            case Iop_Shr16:
1003               addInstr(env, X86Instr_Alu32R(
1004                                Xalu_AND, X86RMI_Imm(0xFFFF), dst));
1005               break;
1006            case Iop_Sar8:
1007               addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
1008               addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
1009               break;
1010            case Iop_Sar16:
1011               addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
1012               addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
1013               break;
1014            default: break;
1015         }
1016
1017         /* Now consider the shift amount.  If it's a literal, we
1018            can do a much better job than the general case. */
1019         if (e->Iex.Binop.arg2->tag == Iex_Const) {
1020            /* assert that the IR is well-typed */
1021            Int nshift;
1022            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1023            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1024	    vassert(nshift >= 0);
1025	    if (nshift > 0)
1026               /* Can't allow nshift==0 since that means %cl */
1027               addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
1028         } else {
1029            /* General case; we have to force the amount into %cl. */
1030            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1031            addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
1032            addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
1033         }
1034         return dst;
1035      }
1036
1037      /* Handle misc other ops. */
1038
1039      if (e->Iex.Binop.op == Iop_Max32U) {
1040         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1041         HReg dst  = newVRegI(env);
1042         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1043         addInstr(env, mk_iMOVsd_RR(src1,dst));
1044         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
1045         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
1046         return dst;
1047      }
1048
1049      if (e->Iex.Binop.op == Iop_8HLto16) {
1050         HReg hi8  = newVRegI(env);
1051         HReg lo8  = newVRegI(env);
1052         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1053         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1054         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1055         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1056         addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
1057         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
1058         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
1059         return hi8;
1060      }
1061
1062      if (e->Iex.Binop.op == Iop_16HLto32) {
1063         HReg hi16  = newVRegI(env);
1064         HReg lo16  = newVRegI(env);
1065         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1066         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1067         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1068         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1069         addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
1070         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
1071         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
1072         return hi16;
1073      }
1074
1075      if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
1076          || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
1077         HReg a16   = newVRegI(env);
1078         HReg b16   = newVRegI(env);
1079         HReg a16s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1080         HReg b16s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1081         Int  shift = (e->Iex.Binop.op == Iop_MullS8
1082                       || e->Iex.Binop.op == Iop_MullU8)
1083                         ? 24 : 16;
1084         X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
1085                              || e->Iex.Binop.op == Iop_MullS16)
1086                                ? Xsh_SAR : Xsh_SHR;
1087
1088         addInstr(env, mk_iMOVsd_RR(a16s, a16));
1089         addInstr(env, mk_iMOVsd_RR(b16s, b16));
1090         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
1091         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
1092         addInstr(env, X86Instr_Sh32(shr_op,  shift, a16));
1093         addInstr(env, X86Instr_Sh32(shr_op,  shift, b16));
1094         addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
1095         return b16;
1096      }
1097
1098      if (e->Iex.Binop.op == Iop_CmpF64) {
1099         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1100         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1101         HReg dst = newVRegI(env);
1102         addInstr(env, X86Instr_FpCmp(fL,fR,dst));
1103         /* shift this right 8 bits so as to conform to CmpF64
1104            definition. */
1105         addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
1106         return dst;
1107      }
1108
1109      if (e->Iex.Binop.op == Iop_F64toI32S
1110          || e->Iex.Binop.op == Iop_F64toI16S) {
1111         Int  sz  = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
1112         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1113         HReg dst = newVRegI(env);
1114
1115         /* Used several times ... */
1116         X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1117
1118	 /* rf now holds the value to be converted, and rrm holds the
1119	    rounding mode value, encoded as per the IRRoundingMode
1120	    enum.  The first thing to do is set the FPU's rounding
1121	    mode accordingly. */
1122
1123         /* Create a space for the format conversion. */
1124         /* subl $4, %esp */
1125         sub_from_esp(env, 4);
1126
1127	 /* Set host rounding mode */
1128	 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
1129
1130         /* gistw/l %rf, 0(%esp) */
1131         addInstr(env, X86Instr_FpLdStI(False/*store*/,
1132                                        toUChar(sz), rf, zero_esp));
1133
1134         if (sz == 2) {
1135            /* movzwl 0(%esp), %dst */
1136            addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
1137         } else {
1138            /* movl 0(%esp), %dst */
1139            vassert(sz == 4);
1140            addInstr(env, X86Instr_Alu32R(
1141                             Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1142         }
1143
1144	 /* Restore default FPU rounding. */
1145         set_FPU_rounding_default( env );
1146
1147         /* addl $4, %esp */
1148	 add_to_esp(env, 4);
1149         return dst;
1150      }
1151
1152      break;
1153   }
1154
1155   /* --------- UNARY OP --------- */
1156   case Iex_Unop: {
1157
1158      /* 1Uto8(32to1(expr32)) */
1159      if (e->Iex.Unop.op == Iop_1Uto8) {
1160         DECLARE_PATTERN(p_32to1_then_1Uto8);
1161         DEFINE_PATTERN(p_32to1_then_1Uto8,
1162                        unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
1163         if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
1164            const IRExpr* expr32 = mi.bindee[0];
1165            HReg dst = newVRegI(env);
1166            HReg src = iselIntExpr_R(env, expr32);
1167            addInstr(env, mk_iMOVsd_RR(src,dst) );
1168            addInstr(env, X86Instr_Alu32R(Xalu_AND,
1169                                          X86RMI_Imm(1), dst));
1170            return dst;
1171         }
1172      }
1173
1174      /* 8Uto32(LDle(expr32)) */
1175      if (e->Iex.Unop.op == Iop_8Uto32) {
1176         DECLARE_PATTERN(p_LDle8_then_8Uto32);
1177         DEFINE_PATTERN(p_LDle8_then_8Uto32,
1178                        unop(Iop_8Uto32,
1179                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1180         if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
1181            HReg dst = newVRegI(env);
1182            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1183            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1184            return dst;
1185         }
1186      }
1187
1188      /* 8Sto32(LDle(expr32)) */
1189      if (e->Iex.Unop.op == Iop_8Sto32) {
1190         DECLARE_PATTERN(p_LDle8_then_8Sto32);
1191         DEFINE_PATTERN(p_LDle8_then_8Sto32,
1192                        unop(Iop_8Sto32,
1193                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1194         if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
1195            HReg dst = newVRegI(env);
1196            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1197            addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
1198            return dst;
1199         }
1200      }
1201
1202      /* 16Uto32(LDle(expr32)) */
1203      if (e->Iex.Unop.op == Iop_16Uto32) {
1204         DECLARE_PATTERN(p_LDle16_then_16Uto32);
1205         DEFINE_PATTERN(p_LDle16_then_16Uto32,
1206                        unop(Iop_16Uto32,
1207                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1208         if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
1209            HReg dst = newVRegI(env);
1210            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1211            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1212            return dst;
1213         }
1214      }
1215
1216      /* 8Uto32(GET:I8) */
1217      if (e->Iex.Unop.op == Iop_8Uto32) {
1218         if (e->Iex.Unop.arg->tag == Iex_Get) {
1219            HReg      dst;
1220            X86AMode* amode;
1221            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
1222            dst = newVRegI(env);
1223            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1224                                hregX86_EBP());
1225            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1226            return dst;
1227         }
1228      }
1229
1230      /* 16to32(GET:I16) */
1231      if (e->Iex.Unop.op == Iop_16Uto32) {
1232         if (e->Iex.Unop.arg->tag == Iex_Get) {
1233            HReg      dst;
1234            X86AMode* amode;
1235            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
1236            dst = newVRegI(env);
1237            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1238                                hregX86_EBP());
1239            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1240            return dst;
1241         }
1242      }
1243
1244      switch (e->Iex.Unop.op) {
1245         case Iop_8Uto16:
1246         case Iop_8Uto32:
1247         case Iop_16Uto32: {
1248            HReg dst = newVRegI(env);
1249            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1250            UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
1251            addInstr(env, mk_iMOVsd_RR(src,dst) );
1252            addInstr(env, X86Instr_Alu32R(Xalu_AND,
1253                                          X86RMI_Imm(mask), dst));
1254            return dst;
1255         }
1256         case Iop_8Sto16:
1257         case Iop_8Sto32:
1258         case Iop_16Sto32: {
1259            HReg dst = newVRegI(env);
1260            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1261            UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
1262            addInstr(env, mk_iMOVsd_RR(src,dst) );
1263            addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
1264            addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
1265            return dst;
1266         }
1267	 case Iop_Not8:
1268	 case Iop_Not16:
1269         case Iop_Not32: {
1270            HReg dst = newVRegI(env);
1271            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1272            addInstr(env, mk_iMOVsd_RR(src,dst) );
1273            addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
1274            return dst;
1275         }
1276         case Iop_64HIto32: {
1277            HReg rHi, rLo;
1278            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1279            return rHi; /* and abandon rLo .. poor wee thing :-) */
1280         }
1281         case Iop_64to32: {
1282            HReg rHi, rLo;
1283            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1284            return rLo; /* similar stupid comment to the above ... */
1285         }
1286         case Iop_16HIto8:
1287         case Iop_32HIto16: {
1288            HReg dst  = newVRegI(env);
1289            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1290            Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
1291            addInstr(env, mk_iMOVsd_RR(src,dst) );
1292            addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
1293            return dst;
1294         }
1295         case Iop_1Uto32:
1296         case Iop_1Uto8: {
1297            HReg dst         = newVRegI(env);
1298            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1299            addInstr(env, X86Instr_Set32(cond,dst));
1300            return dst;
1301         }
1302         case Iop_1Sto8:
1303         case Iop_1Sto16:
1304         case Iop_1Sto32: {
1305            /* could do better than this, but for now ... */
1306            HReg dst         = newVRegI(env);
1307            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1308            addInstr(env, X86Instr_Set32(cond,dst));
1309            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
1310            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1311            return dst;
1312         }
1313         case Iop_Ctz32: {
1314            /* Count trailing zeroes, implemented by x86 'bsfl' */
1315            HReg dst = newVRegI(env);
1316            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1317            addInstr(env, X86Instr_Bsfr32(True,src,dst));
1318            return dst;
1319         }
1320         case Iop_Clz32: {
1321            /* Count leading zeroes.  Do 'bsrl' to establish the index
1322               of the highest set bit, and subtract that value from
1323               31. */
1324            HReg tmp = newVRegI(env);
1325            HReg dst = newVRegI(env);
1326            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1327            addInstr(env, X86Instr_Bsfr32(False,src,tmp));
1328            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
1329                                          X86RMI_Imm(31), dst));
1330            addInstr(env, X86Instr_Alu32R(Xalu_SUB,
1331                                          X86RMI_Reg(tmp), dst));
1332            return dst;
1333         }
1334
1335         case Iop_CmpwNEZ32: {
1336            HReg dst = newVRegI(env);
1337            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1338            addInstr(env, mk_iMOVsd_RR(src,dst));
1339            addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
1340            addInstr(env, X86Instr_Alu32R(Xalu_OR,
1341                                          X86RMI_Reg(src), dst));
1342            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1343            return dst;
1344         }
1345         case Iop_Left8:
1346         case Iop_Left16:
1347         case Iop_Left32: {
1348            HReg dst = newVRegI(env);
1349            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1350            addInstr(env, mk_iMOVsd_RR(src, dst));
1351            addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
1352            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
1353            return dst;
1354         }
1355
1356         case Iop_V128to32: {
1357            HReg      dst  = newVRegI(env);
1358            HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
1359            X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
1360            sub_from_esp(env, 16);
1361            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
1362            addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
1363            add_to_esp(env, 16);
1364            return dst;
1365         }
1366
1367         /* ReinterpF32asI32(e) */
1368         /* Given an IEEE754 single, produce an I32 with the same bit
1369            pattern.  Keep stack 8-aligned even though only using 4
1370            bytes. */
1371         case Iop_ReinterpF32asI32: {
1372            HReg rf   = iselFltExpr(env, e->Iex.Unop.arg);
1373            HReg dst  = newVRegI(env);
1374            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1375            /* paranoia */
1376            set_FPU_rounding_default(env);
1377            /* subl $8, %esp */
1378            sub_from_esp(env, 8);
1379            /* gstF %rf, 0(%esp) */
1380            addInstr(env,
1381                     X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
1382            /* movl 0(%esp), %dst */
1383            addInstr(env,
1384                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1385            /* addl $8, %esp */
1386            add_to_esp(env, 8);
1387            return dst;
1388         }
1389
1390         case Iop_16to8:
1391         case Iop_32to8:
1392         case Iop_32to16:
1393            /* These are no-ops. */
1394            return iselIntExpr_R(env, e->Iex.Unop.arg);
1395
1396         case Iop_GetMSBs8x8: {
1397            /* Note: the following assumes the helper is of
1398               signature
1399                  UInt fn ( ULong ), and is not a regparm fn.
1400            */
1401            HReg  xLo, xHi;
1402            HReg  dst = newVRegI(env);
1403            Addr fn = (Addr)h_generic_calc_GetMSBs8x8;
1404            iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
1405            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
1406            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
1407            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
1408                                         0, mk_RetLoc_simple(RLPri_Int) ));
1409            add_to_esp(env, 2*4);
1410            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1411            return dst;
1412         }
1413
1414         default:
1415            break;
1416      }
1417      break;
1418   }
1419
1420   /* --------- GET --------- */
1421   case Iex_Get: {
1422      if (ty == Ity_I32) {
1423         HReg dst = newVRegI(env);
1424         addInstr(env, X86Instr_Alu32R(
1425                          Xalu_MOV,
1426                          X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1427                                                 hregX86_EBP())),
1428                          dst));
1429         return dst;
1430      }
1431      if (ty == Ity_I8 || ty == Ity_I16) {
1432         HReg dst = newVRegI(env);
1433         addInstr(env, X86Instr_LoadEX(
1434                          toUChar(ty==Ity_I8 ? 1 : 2),
1435                          False,
1436                          X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
1437                          dst));
1438         return dst;
1439      }
1440      break;
1441   }
1442
1443   case Iex_GetI: {
1444      X86AMode* am
1445         = genGuestArrayOffset(
1446              env, e->Iex.GetI.descr,
1447                   e->Iex.GetI.ix, e->Iex.GetI.bias );
1448      HReg dst = newVRegI(env);
1449      if (ty == Ity_I8) {
1450         addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
1451         return dst;
1452      }
1453      if (ty == Ity_I32) {
1454         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
1455         return dst;
1456      }
1457      break;
1458   }
1459
1460   /* --------- CCALL --------- */
1461   case Iex_CCall: {
1462      HReg    dst = newVRegI(env);
1463      vassert(ty == e->Iex.CCall.retty);
1464
1465      /* be very restrictive for now.  Only 32/64-bit ints allowed for
1466         args, and 32 bits for return type.  Don't forget to change
1467         the RetLoc if more return types are allowed in future. */
1468      if (e->Iex.CCall.retty != Ity_I32)
1469         goto irreducible;
1470
1471      /* Marshal args, do the call, clear stack. */
1472      UInt   addToSp = 0;
1473      RetLoc rloc    = mk_RetLoc_INVALID();
1474      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1475                    e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1476      vassert(is_sane_RetLoc(rloc));
1477      vassert(rloc.pri == RLPri_Int);
1478      vassert(addToSp == 0);
1479
1480      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1481      return dst;
1482   }
1483
1484   /* --------- LITERAL --------- */
1485   /* 32/16/8-bit literals */
1486   case Iex_Const: {
1487      X86RMI* rmi = iselIntExpr_RMI ( env, e );
1488      HReg    r   = newVRegI(env);
1489      addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
1490      return r;
1491   }
1492
1493   /* --------- MULTIPLEX --------- */
1494   case Iex_ITE: { // VFD
1495     if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1496         && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1497        HReg   r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1498        X86RM* r0  = iselIntExpr_RM(env, e->Iex.ITE.iffalse);
1499        HReg   dst = newVRegI(env);
1500        addInstr(env, mk_iMOVsd_RR(r1,dst));
1501        X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1502        addInstr(env, X86Instr_CMov32(cc ^ 1, r0, dst));
1503        return dst;
1504      }
1505      break;
1506   }
1507
1508   default:
1509   break;
1510   } /* switch (e->tag) */
1511
1512   /* We get here if no pattern matched. */
1513  irreducible:
1514   ppIRExpr(e);
1515   vpanic("iselIntExpr_R: cannot reduce tree");
1516}
1517
1518
1519/*---------------------------------------------------------*/
1520/*--- ISEL: Integer expression auxiliaries              ---*/
1521/*---------------------------------------------------------*/
1522
1523/* --------------------- AMODEs --------------------- */
1524
1525/* Return an AMode which computes the value of the specified
1526   expression, possibly also adding insns to the code list as a
1527   result.  The expression may only be a 32-bit one.
1528*/
1529
1530static Bool sane_AMode ( X86AMode* am )
1531{
1532   switch (am->tag) {
1533      case Xam_IR:
1534         return
1535            toBool( hregClass(am->Xam.IR.reg) == HRcInt32
1536                    && (hregIsVirtual(am->Xam.IR.reg)
1537                        || sameHReg(am->Xam.IR.reg, hregX86_EBP())) );
1538      case Xam_IRRS:
1539         return
1540            toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
1541                    && hregIsVirtual(am->Xam.IRRS.base)
1542                    && hregClass(am->Xam.IRRS.index) == HRcInt32
1543                    && hregIsVirtual(am->Xam.IRRS.index) );
1544      default:
1545        vpanic("sane_AMode: unknown x86 amode tag");
1546   }
1547}
1548
1549static X86AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
1550{
1551   X86AMode* am = iselIntExpr_AMode_wrk(env, e);
1552   vassert(sane_AMode(am));
1553   return am;
1554}
1555
1556/* DO NOT CALL THIS DIRECTLY ! */
1557static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
1558{
1559   IRType ty = typeOfIRExpr(env->type_env,e);
1560   vassert(ty == Ity_I32);
1561
1562   /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
1563   if (e->tag == Iex_Binop
1564       && e->Iex.Binop.op == Iop_Add32
1565       && e->Iex.Binop.arg2->tag == Iex_Const
1566       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
1567       && e->Iex.Binop.arg1->tag == Iex_Binop
1568       && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
1569       && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
1570       && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1571       && e->Iex.Binop.arg1
1572           ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1573       && e->Iex.Binop.arg1
1574           ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1575      UInt shift = e->Iex.Binop.arg1
1576                    ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1577      UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
1578      if (shift == 1 || shift == 2 || shift == 3) {
1579         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
1580         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
1581                                       ->Iex.Binop.arg2->Iex.Binop.arg1 );
1582         return X86AMode_IRRS(imm32, r1, r2, shift);
1583      }
1584   }
1585
1586   /* Add32(expr1, Shl32(expr2, imm)) */
1587   if (e->tag == Iex_Binop
1588       && e->Iex.Binop.op == Iop_Add32
1589       && e->Iex.Binop.arg2->tag == Iex_Binop
1590       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1591       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1592       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1593      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1594      if (shift == 1 || shift == 2 || shift == 3) {
1595         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1596         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1597         return X86AMode_IRRS(0, r1, r2, shift);
1598      }
1599   }
1600
1601   /* Add32(expr,i) */
1602   if (e->tag == Iex_Binop
1603       && e->Iex.Binop.op == Iop_Add32
1604       && e->Iex.Binop.arg2->tag == Iex_Const
1605       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
1606      HReg r1 = iselIntExpr_R(env,  e->Iex.Binop.arg1);
1607      return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
1608   }
1609
1610   /* Doesn't match anything in particular.  Generate it into
1611      a register and use that. */
1612   {
1613      HReg r1 = iselIntExpr_R(env, e);
1614      return X86AMode_IR(0, r1);
1615   }
1616}
1617
1618
1619/* --------------------- RMIs --------------------- */
1620
1621/* Similarly, calculate an expression into an X86RMI operand.  As with
1622   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1623
1624static X86RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
1625{
1626   X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
1627   /* sanity checks ... */
1628   switch (rmi->tag) {
1629      case Xrmi_Imm:
1630         return rmi;
1631      case Xrmi_Reg:
1632         vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
1633         vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
1634         return rmi;
1635      case Xrmi_Mem:
1636         vassert(sane_AMode(rmi->Xrmi.Mem.am));
1637         return rmi;
1638      default:
1639         vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
1640   }
1641}
1642
1643/* DO NOT CALL THIS DIRECTLY ! */
1644static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
1645{
1646   IRType ty = typeOfIRExpr(env->type_env,e);
1647   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1648
1649   /* special case: immediate */
1650   if (e->tag == Iex_Const) {
1651      UInt u;
1652      switch (e->Iex.Const.con->tag) {
1653         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1654         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1655         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
1656         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
1657      }
1658      return X86RMI_Imm(u);
1659   }
1660
1661   /* special case: 32-bit GET */
1662   if (e->tag == Iex_Get && ty == Ity_I32) {
1663      return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1664                                    hregX86_EBP()));
1665   }
1666
1667   /* special case: 32-bit load from memory */
1668   if (e->tag == Iex_Load && ty == Ity_I32
1669       && e->Iex.Load.end == Iend_LE) {
1670      X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
1671      return X86RMI_Mem(am);
1672   }
1673
1674   /* default case: calculate into a register and return that */
1675   {
1676      HReg r = iselIntExpr_R ( env, e );
1677      return X86RMI_Reg(r);
1678   }
1679}
1680
1681
1682/* --------------------- RIs --------------------- */
1683
1684/* Calculate an expression into an X86RI operand.  As with
1685   iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
1686
1687static X86RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
1688{
1689   X86RI* ri = iselIntExpr_RI_wrk(env, e);
1690   /* sanity checks ... */
1691   switch (ri->tag) {
1692      case Xri_Imm:
1693         return ri;
1694      case Xri_Reg:
1695         vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
1696         vassert(hregIsVirtual(ri->Xri.Reg.reg));
1697         return ri;
1698      default:
1699         vpanic("iselIntExpr_RI: unknown x86 RI tag");
1700   }
1701}
1702
1703/* DO NOT CALL THIS DIRECTLY ! */
1704static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
1705{
1706   IRType ty = typeOfIRExpr(env->type_env,e);
1707   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1708
1709   /* special case: immediate */
1710   if (e->tag == Iex_Const) {
1711      UInt u;
1712      switch (e->Iex.Const.con->tag) {
1713         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1714         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1715         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
1716         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
1717      }
1718      return X86RI_Imm(u);
1719   }
1720
1721   /* default case: calculate into a register and return that */
1722   {
1723      HReg r = iselIntExpr_R ( env, e );
1724      return X86RI_Reg(r);
1725   }
1726}
1727
1728
1729/* --------------------- RMs --------------------- */
1730
1731/* Similarly, calculate an expression into an X86RM operand.  As with
1732   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1733
1734static X86RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
1735{
1736   X86RM* rm = iselIntExpr_RM_wrk(env, e);
1737   /* sanity checks ... */
1738   switch (rm->tag) {
1739      case Xrm_Reg:
1740         vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
1741         vassert(hregIsVirtual(rm->Xrm.Reg.reg));
1742         return rm;
1743      case Xrm_Mem:
1744         vassert(sane_AMode(rm->Xrm.Mem.am));
1745         return rm;
1746      default:
1747         vpanic("iselIntExpr_RM: unknown x86 RM tag");
1748   }
1749}
1750
1751/* DO NOT CALL THIS DIRECTLY ! */
1752static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
1753{
1754   IRType ty = typeOfIRExpr(env->type_env,e);
1755   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1756
1757   /* special case: 32-bit GET */
1758   if (e->tag == Iex_Get && ty == Ity_I32) {
1759      return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
1760                                   hregX86_EBP()));
1761   }
1762
1763   /* special case: load from memory */
1764
1765   /* default case: calculate into a register and return that */
1766   {
1767      HReg r = iselIntExpr_R ( env, e );
1768      return X86RM_Reg(r);
1769   }
1770}
1771
1772
1773/* --------------------- CONDCODE --------------------- */
1774
1775/* Generate code to evaluated a bit-typed expression, returning the
1776   condition code which would correspond when the expression would
1777   notionally have returned 1. */
1778
1779static X86CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
1780{
1781   /* Uh, there's nothing we can sanity check here, unfortunately. */
1782   return iselCondCode_wrk(env,e);
1783}
1784
1785/* DO NOT CALL THIS DIRECTLY ! */
1786static X86CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
1787{
1788   MatchInfo mi;
1789
1790   vassert(e);
1791   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
1792
1793   /* var */
1794   if (e->tag == Iex_RdTmp) {
1795      HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
1796      /* Test32 doesn't modify r32; so this is OK. */
1797      addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
1798      return Xcc_NZ;
1799   }
1800
1801   /* Constant 1:Bit */
1802   if (e->tag == Iex_Const) {
1803      HReg r;
1804      vassert(e->Iex.Const.con->tag == Ico_U1);
1805      vassert(e->Iex.Const.con->Ico.U1 == True
1806              || e->Iex.Const.con->Ico.U1 == False);
1807      r = newVRegI(env);
1808      addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
1809      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
1810      return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
1811   }
1812
1813   /* Not1(e) */
1814   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
1815      /* Generate code for the arg, and negate the test condition */
1816      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
1817   }
1818
1819   /* --- patterns rooted at: 32to1 --- */
1820
1821   if (e->tag == Iex_Unop
1822       && e->Iex.Unop.op == Iop_32to1) {
1823      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1824      addInstr(env, X86Instr_Test32(1,rm));
1825      return Xcc_NZ;
1826   }
1827
1828   /* --- patterns rooted at: CmpNEZ8 --- */
1829
1830   /* CmpNEZ8(x) */
1831   if (e->tag == Iex_Unop
1832       && e->Iex.Unop.op == Iop_CmpNEZ8) {
1833      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1834      addInstr(env, X86Instr_Test32(0xFF,rm));
1835      return Xcc_NZ;
1836   }
1837
1838   /* --- patterns rooted at: CmpNEZ16 --- */
1839
1840   /* CmpNEZ16(x) */
1841   if (e->tag == Iex_Unop
1842       && e->Iex.Unop.op == Iop_CmpNEZ16) {
1843      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1844      addInstr(env, X86Instr_Test32(0xFFFF,rm));
1845      return Xcc_NZ;
1846   }
1847
1848   /* --- patterns rooted at: CmpNEZ32 --- */
1849
1850   /* CmpNEZ32(And32(x,y)) */
1851   {
1852      DECLARE_PATTERN(p_CmpNEZ32_And32);
1853      DEFINE_PATTERN(p_CmpNEZ32_And32,
1854                     unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
1855      if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
1856         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1857         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1858         HReg    tmp  = newVRegI(env);
1859         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1860         addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
1861         return Xcc_NZ;
1862      }
1863   }
1864
1865   /* CmpNEZ32(Or32(x,y)) */
1866   {
1867      DECLARE_PATTERN(p_CmpNEZ32_Or32);
1868      DEFINE_PATTERN(p_CmpNEZ32_Or32,
1869                     unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
1870      if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
1871         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1872         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1873         HReg    tmp  = newVRegI(env);
1874         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1875         addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
1876         return Xcc_NZ;
1877      }
1878   }
1879
1880   /* CmpNEZ32(GET(..):I32) */
1881   if (e->tag == Iex_Unop
1882       && e->Iex.Unop.op == Iop_CmpNEZ32
1883       && e->Iex.Unop.arg->tag == Iex_Get) {
1884      X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1885                                 hregX86_EBP());
1886      addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
1887      return Xcc_NZ;
1888   }
1889
1890   /* CmpNEZ32(x) */
1891   if (e->tag == Iex_Unop
1892       && e->Iex.Unop.op == Iop_CmpNEZ32) {
1893      HReg    r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
1894      X86RMI* rmi2 = X86RMI_Imm(0);
1895      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
1896      return Xcc_NZ;
1897   }
1898
1899   /* --- patterns rooted at: CmpNEZ64 --- */
1900
1901   /* CmpNEZ64(Or64(x,y)) */
1902   {
1903      DECLARE_PATTERN(p_CmpNEZ64_Or64);
1904      DEFINE_PATTERN(p_CmpNEZ64_Or64,
1905                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
1906      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
1907         HReg    hi1, lo1, hi2, lo2;
1908         HReg    tmp  = newVRegI(env);
1909         iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
1910         addInstr(env, mk_iMOVsd_RR(hi1, tmp));
1911         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
1912         iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
1913         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
1914         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
1915         return Xcc_NZ;
1916      }
1917   }
1918
1919   /* CmpNEZ64(x) */
1920   if (e->tag == Iex_Unop
1921       && e->Iex.Unop.op == Iop_CmpNEZ64) {
1922      HReg hi, lo;
1923      HReg tmp = newVRegI(env);
1924      iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
1925      addInstr(env, mk_iMOVsd_RR(hi, tmp));
1926      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
1927      return Xcc_NZ;
1928   }
1929
1930   /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */
1931
1932   /* CmpEQ8 / CmpNE8 */
1933   if (e->tag == Iex_Binop
1934       && (e->Iex.Binop.op == Iop_CmpEQ8
1935           || e->Iex.Binop.op == Iop_CmpNE8
1936           || e->Iex.Binop.op == Iop_CasCmpEQ8
1937           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
1938      if (isZeroU8(e->Iex.Binop.arg2)) {
1939         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1940         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
1941         switch (e->Iex.Binop.op) {
1942            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1943            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1944            default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
1945         }
1946      } else {
1947         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1948         X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1949         HReg    r    = newVRegI(env);
1950         addInstr(env, mk_iMOVsd_RR(r1,r));
1951         addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1952         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
1953         switch (e->Iex.Binop.op) {
1954            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1955            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1956            default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
1957         }
1958      }
1959   }
1960
1961   /* CmpEQ16 / CmpNE16 */
1962   if (e->tag == Iex_Binop
1963       && (e->Iex.Binop.op == Iop_CmpEQ16
1964           || e->Iex.Binop.op == Iop_CmpNE16
1965           || e->Iex.Binop.op == Iop_CasCmpEQ16
1966           || e->Iex.Binop.op == Iop_CasCmpNE16
1967           || e->Iex.Binop.op == Iop_ExpCmpNE16)) {
1968      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1969      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1970      HReg    r    = newVRegI(env);
1971      addInstr(env, mk_iMOVsd_RR(r1,r));
1972      addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1973      addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
1974      switch (e->Iex.Binop.op) {
1975         case Iop_CmpEQ16: case Iop_CasCmpEQ16:
1976            return Xcc_Z;
1977         case Iop_CmpNE16: case Iop_CasCmpNE16: case Iop_ExpCmpNE16:
1978            return Xcc_NZ;
1979         default:
1980            vpanic("iselCondCode(x86): CmpXX16");
1981      }
1982   }
1983
1984   /* CmpNE32(ccall, 32-bit constant) (--smc-check=all optimisation).
1985      Saves a "movl %eax, %tmp" compared to the default route. */
1986   if (e->tag == Iex_Binop
1987       && e->Iex.Binop.op == Iop_CmpNE32
1988       && e->Iex.Binop.arg1->tag == Iex_CCall
1989       && e->Iex.Binop.arg2->tag == Iex_Const) {
1990      IRExpr* cal = e->Iex.Binop.arg1;
1991      IRExpr* con = e->Iex.Binop.arg2;
1992      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
1993      vassert(cal->Iex.CCall.retty == Ity_I32); /* else ill-typed IR */
1994      vassert(con->Iex.Const.con->tag == Ico_U32);
1995      /* Marshal args, do the call. */
1996      UInt   addToSp = 0;
1997      RetLoc rloc    = mk_RetLoc_INVALID();
1998      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1999                    cal->Iex.CCall.cee,
2000                    cal->Iex.CCall.retty, cal->Iex.CCall.args );
2001      vassert(is_sane_RetLoc(rloc));
2002      vassert(rloc.pri == RLPri_Int);
2003      vassert(addToSp == 0);
2004      /* */
2005      addInstr(env, X86Instr_Alu32R(Xalu_CMP,
2006                                    X86RMI_Imm(con->Iex.Const.con->Ico.U32),
2007                                    hregX86_EAX()));
2008      return Xcc_NZ;
2009   }
2010
2011   /* Cmp*32*(x,y) */
2012   if (e->tag == Iex_Binop
2013       && (e->Iex.Binop.op == Iop_CmpEQ32
2014           || e->Iex.Binop.op == Iop_CmpNE32
2015           || e->Iex.Binop.op == Iop_CmpLT32S
2016           || e->Iex.Binop.op == Iop_CmpLT32U
2017           || e->Iex.Binop.op == Iop_CmpLE32S
2018           || e->Iex.Binop.op == Iop_CmpLE32U
2019           || e->Iex.Binop.op == Iop_CasCmpEQ32
2020           || e->Iex.Binop.op == Iop_CasCmpNE32
2021           || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2022      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2023      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2024      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
2025      switch (e->Iex.Binop.op) {
2026         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
2027         case Iop_CmpNE32:
2028         case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Xcc_NZ;
2029         case Iop_CmpLT32S: return Xcc_L;
2030         case Iop_CmpLT32U: return Xcc_B;
2031         case Iop_CmpLE32S: return Xcc_LE;
2032         case Iop_CmpLE32U: return Xcc_BE;
2033         default: vpanic("iselCondCode(x86): CmpXX32");
2034      }
2035   }
2036
2037   /* CmpNE64 */
2038   if (e->tag == Iex_Binop
2039       && (e->Iex.Binop.op == Iop_CmpNE64
2040           || e->Iex.Binop.op == Iop_CmpEQ64)) {
2041      HReg hi1, hi2, lo1, lo2;
2042      HReg tHi = newVRegI(env);
2043      HReg tLo = newVRegI(env);
2044      iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
2045      iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
2046      addInstr(env, mk_iMOVsd_RR(hi1, tHi));
2047      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
2048      addInstr(env, mk_iMOVsd_RR(lo1, tLo));
2049      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
2050      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
2051      switch (e->Iex.Binop.op) {
2052         case Iop_CmpNE64: return Xcc_NZ;
2053         case Iop_CmpEQ64: return Xcc_Z;
2054         default: vpanic("iselCondCode(x86): CmpXX64");
2055      }
2056   }
2057
2058   ppIRExpr(e);
2059   vpanic("iselCondCode");
2060}
2061
2062
2063/*---------------------------------------------------------*/
2064/*--- ISEL: Integer expressions (64 bit)                ---*/
2065/*---------------------------------------------------------*/
2066
2067/* Compute a 64-bit value into a register pair, which is returned as
2068   the first two parameters.  As with iselIntExpr_R, these may be
2069   either real or virtual regs; in any case they must not be changed
2070   by subsequent code emitted by the caller.  */
2071
2072static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env,
2073                            const IRExpr* e )
2074{
2075   iselInt64Expr_wrk(rHi, rLo, env, e);
2076#  if 0
2077   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2078#  endif
2079   vassert(hregClass(*rHi) == HRcInt32);
2080   vassert(hregIsVirtual(*rHi));
2081   vassert(hregClass(*rLo) == HRcInt32);
2082   vassert(hregIsVirtual(*rLo));
2083}
2084
2085/* DO NOT CALL THIS DIRECTLY ! */
2086static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env,
2087                                const IRExpr* e )
2088{
2089   MatchInfo mi;
2090   HWord fn = 0; /* helper fn for most SIMD64 stuff */
2091   vassert(e);
2092   vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
2093
2094   /* 64-bit literal */
2095   if (e->tag == Iex_Const) {
2096      ULong w64 = e->Iex.Const.con->Ico.U64;
2097      UInt  wHi = toUInt(w64 >> 32);
2098      UInt  wLo = toUInt(w64);
2099      HReg  tLo = newVRegI(env);
2100      HReg  tHi = newVRegI(env);
2101      vassert(e->Iex.Const.con->tag == Ico_U64);
2102      if (wLo == wHi) {
2103         /* Save a precious Int register in this special case. */
2104         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
2105         *rHi = tLo;
2106         *rLo = tLo;
2107      } else {
2108         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
2109         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
2110         *rHi = tHi;
2111         *rLo = tLo;
2112      }
2113      return;
2114   }
2115
2116   /* read 64-bit IRTemp */
2117   if (e->tag == Iex_RdTmp) {
2118      lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
2119      return;
2120   }
2121
2122   /* 64-bit load */
2123   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2124      HReg     tLo, tHi;
2125      X86AMode *am0, *am4;
2126      vassert(e->Iex.Load.ty == Ity_I64);
2127      tLo = newVRegI(env);
2128      tHi = newVRegI(env);
2129      am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
2130      am4 = advance4(am0);
2131      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
2132      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2133      *rHi = tHi;
2134      *rLo = tLo;
2135      return;
2136   }
2137
2138   /* 64-bit GET */
2139   if (e->tag == Iex_Get) {
2140      X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
2141      X86AMode* am4 = advance4(am);
2142      HReg tLo = newVRegI(env);
2143      HReg tHi = newVRegI(env);
2144      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2145      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2146      *rHi = tHi;
2147      *rLo = tLo;
2148      return;
2149   }
2150
2151   /* 64-bit GETI */
2152   if (e->tag == Iex_GetI) {
2153      X86AMode* am
2154         = genGuestArrayOffset( env, e->Iex.GetI.descr,
2155                                     e->Iex.GetI.ix, e->Iex.GetI.bias );
2156      X86AMode* am4 = advance4(am);
2157      HReg tLo = newVRegI(env);
2158      HReg tHi = newVRegI(env);
2159      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2160      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2161      *rHi = tHi;
2162      *rLo = tLo;
2163      return;
2164   }
2165
2166   /* 64-bit ITE: ITE(g, expr, expr) */ // VFD
2167   if (e->tag == Iex_ITE) {
2168      HReg e0Lo, e0Hi, e1Lo, e1Hi;
2169      HReg tLo = newVRegI(env);
2170      HReg tHi = newVRegI(env);
2171      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.ITE.iffalse);
2172      iselInt64Expr(&e1Hi, &e1Lo, env, e->Iex.ITE.iftrue);
2173      addInstr(env, mk_iMOVsd_RR(e1Hi, tHi));
2174      addInstr(env, mk_iMOVsd_RR(e1Lo, tLo));
2175      X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
2176      /* This assumes the first cmov32 doesn't trash the condition
2177         codes, so they are still available for the second cmov32 */
2178      addInstr(env, X86Instr_CMov32(cc ^ 1, X86RM_Reg(e0Hi), tHi));
2179      addInstr(env, X86Instr_CMov32(cc ^ 1, X86RM_Reg(e0Lo), tLo));
2180      *rHi = tHi;
2181      *rLo = tLo;
2182      return;
2183   }
2184
2185   /* --------- BINARY ops --------- */
2186   if (e->tag == Iex_Binop) {
2187      switch (e->Iex.Binop.op) {
2188         /* 32 x 32 -> 64 multiply */
2189         case Iop_MullU32:
2190         case Iop_MullS32: {
2191            /* get one operand into %eax, and the other into a R/M.
2192               Need to make an educated guess about which is better in
2193               which. */
2194            HReg   tLo    = newVRegI(env);
2195            HReg   tHi    = newVRegI(env);
2196            Bool   syned  = toBool(e->Iex.Binop.op == Iop_MullS32);
2197            X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2198            HReg   rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2199            addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
2200            addInstr(env, X86Instr_MulL(syned, rmLeft));
2201            /* Result is now in EDX:EAX.  Tell the caller. */
2202            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2203            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2204            *rHi = tHi;
2205            *rLo = tLo;
2206            return;
2207         }
2208
2209         /* 64 x 32 -> (32(rem),32(div)) division */
2210         case Iop_DivModU64to32:
2211         case Iop_DivModS64to32: {
2212            /* Get the 64-bit operand into edx:eax, and the other into
2213               any old R/M. */
2214            HReg sHi, sLo;
2215            HReg   tLo     = newVRegI(env);
2216            HReg   tHi     = newVRegI(env);
2217            Bool   syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
2218            X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2219            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2220            addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
2221            addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
2222            addInstr(env, X86Instr_Div(syned, rmRight));
2223            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2224            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2225            *rHi = tHi;
2226            *rLo = tLo;
2227            return;
2228         }
2229
2230         /* Or64/And64/Xor64 */
2231         case Iop_Or64:
2232         case Iop_And64:
2233         case Iop_Xor64: {
2234            HReg xLo, xHi, yLo, yHi;
2235            HReg tLo = newVRegI(env);
2236            HReg tHi = newVRegI(env);
2237            X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
2238                          : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
2239                          : Xalu_XOR;
2240            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2241            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2242            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2243            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
2244            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2245            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
2246            *rHi = tHi;
2247            *rLo = tLo;
2248            return;
2249         }
2250
2251         /* Add64/Sub64 */
2252         case Iop_Add64:
2253            if (e->Iex.Binop.arg2->tag == Iex_Const) {
2254               /* special case Add64(e, const) */
2255               ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
2256               UInt  wHi = toUInt(w64 >> 32);
2257               UInt  wLo = toUInt(w64);
2258               HReg  tLo = newVRegI(env);
2259               HReg  tHi = newVRegI(env);
2260               HReg  xLo, xHi;
2261               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
2262               iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2263               addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2264               addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2265               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
2266               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
2267               *rHi = tHi;
2268               *rLo = tLo;
2269               return;
2270            }
2271            /* else fall through to the generic case */
2272         case Iop_Sub64: {
2273            HReg xLo, xHi, yLo, yHi;
2274            HReg tLo = newVRegI(env);
2275            HReg tHi = newVRegI(env);
2276            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2277            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2278            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2279            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2280            if (e->Iex.Binop.op==Iop_Add64) {
2281               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
2282               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
2283            } else {
2284               addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2285               addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2286            }
2287            *rHi = tHi;
2288            *rLo = tLo;
2289            return;
2290         }
2291
2292         /* 32HLto64(e1,e2) */
2293         case Iop_32HLto64:
2294            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2295            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2296            return;
2297
2298         /* 64-bit shifts */
2299         case Iop_Shl64: {
2300            /* We use the same ingenious scheme as gcc.  Put the value
2301               to be shifted into %hi:%lo, and the shift amount into
2302               %cl.  Then (dsts on right, a la ATT syntax):
2303
2304               shldl %cl, %lo, %hi   -- make %hi be right for the
2305                                     -- shift amt %cl % 32
2306               shll  %cl, %lo        -- make %lo be right for the
2307                                     -- shift amt %cl % 32
2308
2309               Now, if (shift amount % 64) is in the range 32 .. 63,
2310               we have to do a fixup, which puts the result low half
2311               into the result high half, and zeroes the low half:
2312
2313               testl $32, %ecx
2314
2315               cmovnz %lo, %hi
2316               movl $0, %tmp         -- sigh; need yet another reg
2317               cmovnz %tmp, %lo
2318            */
2319            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2320            tLo = newVRegI(env);
2321            tHi = newVRegI(env);
2322            tTemp = newVRegI(env);
2323            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2324            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2325            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2326            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2327            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2328            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2329               and those regs are legitimately modifiable. */
2330            addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
2331            addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
2332            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2333            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
2334            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2335            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
2336            *rHi = tHi;
2337            *rLo = tLo;
2338            return;
2339         }
2340
2341         case Iop_Shr64: {
2342            /* We use the same ingenious scheme as gcc.  Put the value
2343               to be shifted into %hi:%lo, and the shift amount into
2344               %cl.  Then:
2345
2346               shrdl %cl, %hi, %lo   -- make %lo be right for the
2347                                     -- shift amt %cl % 32
2348               shrl  %cl, %hi        -- make %hi be right for the
2349                                     -- shift amt %cl % 32
2350
2351               Now, if (shift amount % 64) is in the range 32 .. 63,
2352               we have to do a fixup, which puts the result high half
2353               into the result low half, and zeroes the high half:
2354
2355               testl $32, %ecx
2356
2357               cmovnz %hi, %lo
2358               movl $0, %tmp         -- sigh; need yet another reg
2359               cmovnz %tmp, %hi
2360            */
2361            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2362            tLo = newVRegI(env);
2363            tHi = newVRegI(env);
2364            tTemp = newVRegI(env);
2365            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2366            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2367            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2368            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2369            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2370            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2371               and those regs are legitimately modifiable. */
2372            addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
2373            addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
2374            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2375            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
2376            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2377            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
2378            *rHi = tHi;
2379            *rLo = tLo;
2380            return;
2381         }
2382
2383         /* F64 -> I64 */
2384         /* Sigh, this is an almost exact copy of the F64 -> I32/I16
2385            case.  Unfortunately I see no easy way to avoid the
2386            duplication. */
2387         case Iop_F64toI64S: {
2388            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
2389            HReg tLo = newVRegI(env);
2390            HReg tHi = newVRegI(env);
2391
2392            /* Used several times ... */
2393            /* Careful ... this sharing is only safe because
2394	       zero_esp/four_esp do not hold any registers which the
2395	       register allocator could attempt to swizzle later. */
2396            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2397            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2398
2399            /* rf now holds the value to be converted, and rrm holds
2400               the rounding mode value, encoded as per the
2401               IRRoundingMode enum.  The first thing to do is set the
2402               FPU's rounding mode accordingly. */
2403
2404            /* Create a space for the format conversion. */
2405            /* subl $8, %esp */
2406            sub_from_esp(env, 8);
2407
2408            /* Set host rounding mode */
2409            set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2410
2411            /* gistll %rf, 0(%esp) */
2412            addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
2413
2414            /* movl 0(%esp), %dstLo */
2415            /* movl 4(%esp), %dstHi */
2416            addInstr(env, X86Instr_Alu32R(
2417                             Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2418            addInstr(env, X86Instr_Alu32R(
2419                             Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2420
2421            /* Restore default FPU rounding. */
2422            set_FPU_rounding_default( env );
2423
2424            /* addl $8, %esp */
2425            add_to_esp(env, 8);
2426
2427            *rHi = tHi;
2428            *rLo = tLo;
2429            return;
2430         }
2431
2432         case Iop_Add8x8:
2433            fn = (HWord)h_generic_calc_Add8x8; goto binnish;
2434         case Iop_Add16x4:
2435            fn = (HWord)h_generic_calc_Add16x4; goto binnish;
2436         case Iop_Add32x2:
2437            fn = (HWord)h_generic_calc_Add32x2; goto binnish;
2438
2439         case Iop_Avg8Ux8:
2440            fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
2441         case Iop_Avg16Ux4:
2442            fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;
2443
2444         case Iop_CmpEQ8x8:
2445            fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
2446         case Iop_CmpEQ16x4:
2447            fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
2448         case Iop_CmpEQ32x2:
2449            fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;
2450
2451         case Iop_CmpGT8Sx8:
2452            fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
2453         case Iop_CmpGT16Sx4:
2454            fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
2455         case Iop_CmpGT32Sx2:
2456            fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;
2457
2458         case Iop_InterleaveHI8x8:
2459            fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
2460         case Iop_InterleaveLO8x8:
2461            fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
2462         case Iop_InterleaveHI16x4:
2463            fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
2464         case Iop_InterleaveLO16x4:
2465            fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
2466         case Iop_InterleaveHI32x2:
2467            fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
2468         case Iop_InterleaveLO32x2:
2469            fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
2470         case Iop_CatOddLanes16x4:
2471            fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
2472         case Iop_CatEvenLanes16x4:
2473            fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
2474         case Iop_Perm8x8:
2475            fn = (HWord)h_generic_calc_Perm8x8; goto binnish;
2476
2477         case Iop_Max8Ux8:
2478            fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
2479         case Iop_Max16Sx4:
2480            fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
2481         case Iop_Min8Ux8:
2482            fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
2483         case Iop_Min16Sx4:
2484            fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;
2485
2486         case Iop_Mul16x4:
2487            fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
2488         case Iop_Mul32x2:
2489            fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
2490         case Iop_MulHi16Sx4:
2491            fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
2492         case Iop_MulHi16Ux4:
2493            fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;
2494
2495         case Iop_QAdd8Sx8:
2496            fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
2497         case Iop_QAdd16Sx4:
2498            fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
2499         case Iop_QAdd8Ux8:
2500            fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
2501         case Iop_QAdd16Ux4:
2502            fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;
2503
2504         case Iop_QNarrowBin32Sto16Sx4:
2505            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; goto binnish;
2506         case Iop_QNarrowBin16Sto8Sx8:
2507            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; goto binnish;
2508         case Iop_QNarrowBin16Sto8Ux8:
2509            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; goto binnish;
2510         case Iop_NarrowBin16to8x8:
2511            fn = (HWord)h_generic_calc_NarrowBin16to8x8; goto binnish;
2512         case Iop_NarrowBin32to16x4:
2513            fn = (HWord)h_generic_calc_NarrowBin32to16x4; goto binnish;
2514
2515         case Iop_QSub8Sx8:
2516            fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
2517         case Iop_QSub16Sx4:
2518            fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
2519         case Iop_QSub8Ux8:
2520            fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
2521         case Iop_QSub16Ux4:
2522            fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;
2523
2524         case Iop_Sub8x8:
2525            fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
2526         case Iop_Sub16x4:
2527            fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
2528         case Iop_Sub32x2:
2529            fn = (HWord)h_generic_calc_Sub32x2; goto binnish;
2530
2531         binnish: {
2532            /* Note: the following assumes all helpers are of
2533               signature
2534                  ULong fn ( ULong, ULong ), and they are
2535               not marked as regparm functions.
2536            */
2537            HReg xLo, xHi, yLo, yHi;
2538            HReg tLo = newVRegI(env);
2539            HReg tHi = newVRegI(env);
2540            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2541            addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
2542            addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
2543            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2544            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2545            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2546            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2547                                         0, mk_RetLoc_simple(RLPri_2Int) ));
2548            add_to_esp(env, 4*4);
2549            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2550            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2551            *rHi = tHi;
2552            *rLo = tLo;
2553            return;
2554         }
2555
2556         case Iop_ShlN32x2:
2557            fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
2558         case Iop_ShlN16x4:
2559            fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
2560         case Iop_ShlN8x8:
2561            fn = (HWord)h_generic_calc_ShlN8x8;  goto shifty;
2562         case Iop_ShrN32x2:
2563            fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
2564         case Iop_ShrN16x4:
2565            fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
2566         case Iop_SarN32x2:
2567            fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
2568         case Iop_SarN16x4:
2569            fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
2570         case Iop_SarN8x8:
2571            fn = (HWord)h_generic_calc_SarN8x8;  goto shifty;
2572         shifty: {
2573            /* Note: the following assumes all helpers are of
2574               signature
2575                  ULong fn ( ULong, UInt ), and they are
2576               not marked as regparm functions.
2577            */
2578            HReg xLo, xHi;
2579            HReg tLo = newVRegI(env);
2580            HReg tHi = newVRegI(env);
2581            X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2582            addInstr(env, X86Instr_Push(y));
2583            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2584            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2585            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2586            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2587                                         0, mk_RetLoc_simple(RLPri_2Int) ));
2588            add_to_esp(env, 3*4);
2589            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2590            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2591            *rHi = tHi;
2592            *rLo = tLo;
2593            return;
2594         }
2595
2596         default:
2597            break;
2598      }
2599   } /* if (e->tag == Iex_Binop) */
2600
2601
2602   /* --------- UNARY ops --------- */
2603   if (e->tag == Iex_Unop) {
2604      switch (e->Iex.Unop.op) {
2605
2606         /* 32Sto64(e) */
2607         case Iop_32Sto64: {
2608            HReg tLo = newVRegI(env);
2609            HReg tHi = newVRegI(env);
2610            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2611            addInstr(env, mk_iMOVsd_RR(src,tHi));
2612            addInstr(env, mk_iMOVsd_RR(src,tLo));
2613            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
2614            *rHi = tHi;
2615            *rLo = tLo;
2616            return;
2617         }
2618
2619         /* 32Uto64(e) */
2620         case Iop_32Uto64: {
2621            HReg tLo = newVRegI(env);
2622            HReg tHi = newVRegI(env);
2623            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2624            addInstr(env, mk_iMOVsd_RR(src,tLo));
2625            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2626            *rHi = tHi;
2627            *rLo = tLo;
2628            return;
2629         }
2630
2631         /* 16Uto64(e) */
2632         case Iop_16Uto64: {
2633            HReg tLo = newVRegI(env);
2634            HReg tHi = newVRegI(env);
2635            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2636            addInstr(env, mk_iMOVsd_RR(src,tLo));
2637            addInstr(env, X86Instr_Alu32R(Xalu_AND,
2638                                          X86RMI_Imm(0xFFFF), tLo));
2639            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2640            *rHi = tHi;
2641            *rLo = tLo;
2642            return;
2643         }
2644
2645         /* V128{HI}to64 */
2646         case Iop_V128HIto64:
2647         case Iop_V128to64: {
2648            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
2649            HReg tLo = newVRegI(env);
2650            HReg tHi = newVRegI(env);
2651            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
2652            X86AMode* esp0  = X86AMode_IR(0,     hregX86_ESP());
2653            X86AMode* espLO = X86AMode_IR(off,   hregX86_ESP());
2654            X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
2655            sub_from_esp(env, 16);
2656            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
2657            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2658                                           X86RMI_Mem(espLO), tLo ));
2659            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2660                                           X86RMI_Mem(espHI), tHi ));
2661            add_to_esp(env, 16);
2662            *rHi = tHi;
2663            *rLo = tLo;
2664            return;
2665         }
2666
2667         /* could do better than this, but for now ... */
2668         case Iop_1Sto64: {
2669            HReg tLo = newVRegI(env);
2670            HReg tHi = newVRegI(env);
2671            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
2672            addInstr(env, X86Instr_Set32(cond,tLo));
2673            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
2674            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
2675            addInstr(env, mk_iMOVsd_RR(tLo, tHi));
2676            *rHi = tHi;
2677            *rLo = tLo;
2678            return;
2679         }
2680
2681         /* Not64(e) */
2682         case Iop_Not64: {
2683            HReg tLo = newVRegI(env);
2684            HReg tHi = newVRegI(env);
2685            HReg sHi, sLo;
2686            iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
2687            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2688            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2689            addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
2690            addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
2691            *rHi = tHi;
2692            *rLo = tLo;
2693            return;
2694         }
2695
2696         /* Left64(e) */
2697         case Iop_Left64: {
2698            HReg yLo, yHi;
2699            HReg tLo = newVRegI(env);
2700            HReg tHi = newVRegI(env);
2701            /* yHi:yLo = arg */
2702            iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
2703            /* tLo = 0 - yLo, and set carry */
2704            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
2705            addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2706            /* tHi = 0 - yHi - carry */
2707            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2708            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2709            /* So now we have tHi:tLo = -arg.  To finish off, or 'arg'
2710               back in, so as to give the final result
2711               tHi:tLo = arg | -arg. */
2712            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
2713            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
2714            *rHi = tHi;
2715            *rLo = tLo;
2716            return;
2717         }
2718
2719         /* --- patterns rooted at: CmpwNEZ64 --- */
2720
2721         /* CmpwNEZ64(e) */
2722         case Iop_CmpwNEZ64: {
2723
2724         DECLARE_PATTERN(p_CmpwNEZ64_Or64);
2725         DEFINE_PATTERN(p_CmpwNEZ64_Or64,
2726                        unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
2727         if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
2728            /* CmpwNEZ64(Or64(x,y)) */
2729            HReg xHi,xLo,yHi,yLo;
2730            HReg xBoth = newVRegI(env);
2731            HReg merged = newVRegI(env);
2732            HReg tmp2 = newVRegI(env);
2733
2734            iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
2735            addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
2736            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2737                                          X86RMI_Reg(xLo),xBoth));
2738
2739            iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
2740            addInstr(env, mk_iMOVsd_RR(yHi,merged));
2741            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2742                                          X86RMI_Reg(yLo),merged));
2743            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2744                                             X86RMI_Reg(xBoth),merged));
2745
2746            /* tmp2 = (merged | -merged) >>s 31 */
2747            addInstr(env, mk_iMOVsd_RR(merged,tmp2));
2748            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2749            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2750                                          X86RMI_Reg(merged), tmp2));
2751            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2752            *rHi = tmp2;
2753            *rLo = tmp2;
2754            return;
2755         } else {
2756            /* CmpwNEZ64(e) */
2757            HReg srcLo, srcHi;
2758            HReg tmp1  = newVRegI(env);
2759            HReg tmp2  = newVRegI(env);
2760            /* srcHi:srcLo = arg */
2761            iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
2762            /* tmp1 = srcHi | srcLo */
2763            addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
2764            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2765                                          X86RMI_Reg(srcLo), tmp1));
2766            /* tmp2 = (tmp1 | -tmp1) >>s 31 */
2767            addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
2768            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2769            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2770                                          X86RMI_Reg(tmp1), tmp2));
2771            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2772            *rHi = tmp2;
2773            *rLo = tmp2;
2774            return;
2775         }
2776         }
2777
2778         /* ReinterpF64asI64(e) */
2779         /* Given an IEEE754 double, produce an I64 with the same bit
2780            pattern. */
2781         case Iop_ReinterpF64asI64: {
2782            HReg rf   = iselDblExpr(env, e->Iex.Unop.arg);
2783            HReg tLo  = newVRegI(env);
2784            HReg tHi  = newVRegI(env);
2785            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2786            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2787            /* paranoia */
2788            set_FPU_rounding_default(env);
2789            /* subl $8, %esp */
2790            sub_from_esp(env, 8);
2791            /* gstD %rf, 0(%esp) */
2792            addInstr(env,
2793                     X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
2794            /* movl 0(%esp), %tLo */
2795            addInstr(env,
2796                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2797            /* movl 4(%esp), %tHi */
2798            addInstr(env,
2799                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2800            /* addl $8, %esp */
2801            add_to_esp(env, 8);
2802            *rHi = tHi;
2803            *rLo = tLo;
2804            return;
2805         }
2806
2807         case Iop_CmpNEZ32x2:
2808            fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
2809         case Iop_CmpNEZ16x4:
2810            fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
2811         case Iop_CmpNEZ8x8:
2812            fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
2813         unish: {
2814            /* Note: the following assumes all helpers are of
2815               signature
2816                  ULong fn ( ULong ), and they are
2817               not marked as regparm functions.
2818            */
2819            HReg xLo, xHi;
2820            HReg tLo = newVRegI(env);
2821            HReg tHi = newVRegI(env);
2822            iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
2823            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2824            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2825            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2826                                         0, mk_RetLoc_simple(RLPri_2Int) ));
2827            add_to_esp(env, 2*4);
2828            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2829            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2830            *rHi = tHi;
2831            *rLo = tLo;
2832            return;
2833         }
2834
2835         default:
2836            break;
2837      }
2838   } /* if (e->tag == Iex_Unop) */
2839
2840
2841   /* --------- CCALL --------- */
2842   if (e->tag == Iex_CCall) {
2843      HReg tLo = newVRegI(env);
2844      HReg tHi = newVRegI(env);
2845
2846      /* Marshal args, do the call, clear stack. */
2847      UInt   addToSp = 0;
2848      RetLoc rloc    = mk_RetLoc_INVALID();
2849      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2850                    e->Iex.CCall.cee,
2851                    e->Iex.CCall.retty, e->Iex.CCall.args );
2852      vassert(is_sane_RetLoc(rloc));
2853      vassert(rloc.pri == RLPri_2Int);
2854      vassert(addToSp == 0);
2855      /* */
2856
2857      addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2858      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2859      *rHi = tHi;
2860      *rLo = tLo;
2861      return;
2862   }
2863
2864   ppIRExpr(e);
2865   vpanic("iselInt64Expr");
2866}
2867
2868
2869/*---------------------------------------------------------*/
2870/*--- ISEL: Floating point expressions (32 bit)         ---*/
2871/*---------------------------------------------------------*/
2872
2873/* Nothing interesting here; really just wrappers for
2874   64-bit stuff. */
2875
2876static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2877{
2878   HReg r = iselFltExpr_wrk( env, e );
2879#  if 0
2880   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2881#  endif
2882   vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
2883   vassert(hregIsVirtual(r));
2884   return r;
2885}
2886
2887/* DO NOT CALL THIS DIRECTLY */
2888static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2889{
2890   IRType ty = typeOfIRExpr(env->type_env,e);
2891   vassert(ty == Ity_F32);
2892
2893   if (e->tag == Iex_RdTmp) {
2894      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2895   }
2896
2897   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2898      X86AMode* am;
2899      HReg res = newVRegF(env);
2900      vassert(e->Iex.Load.ty == Ity_F32);
2901      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2902      addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
2903      return res;
2904   }
2905
2906   if (e->tag == Iex_Binop
2907       && e->Iex.Binop.op == Iop_F64toF32) {
2908      /* Although the result is still held in a standard FPU register,
2909         we need to round it to reflect the loss of accuracy/range
2910         entailed in casting it to a 32-bit float. */
2911      HReg dst = newVRegF(env);
2912      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2913      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2914      addInstr(env, X86Instr_Fp64to32(src,dst));
2915      set_FPU_rounding_default( env );
2916      return dst;
2917   }
2918
2919   if (e->tag == Iex_Get) {
2920      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
2921                                  hregX86_EBP() );
2922      HReg res = newVRegF(env);
2923      addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
2924      return res;
2925   }
2926
2927   if (e->tag == Iex_Unop
2928       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2929       /* Given an I32, produce an IEEE754 float with the same bit
2930          pattern. */
2931      HReg    dst = newVRegF(env);
2932      X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
2933      /* paranoia */
2934      addInstr(env, X86Instr_Push(rmi));
2935      addInstr(env, X86Instr_FpLdSt(
2936                       True/*load*/, 4, dst,
2937                       X86AMode_IR(0, hregX86_ESP())));
2938      add_to_esp(env, 4);
2939      return dst;
2940   }
2941
2942   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2943      HReg rf  = iselFltExpr(env, e->Iex.Binop.arg2);
2944      HReg dst = newVRegF(env);
2945
2946      /* rf now holds the value to be rounded.  The first thing to do
2947         is set the FPU's rounding mode accordingly. */
2948
2949      /* Set host rounding mode */
2950      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2951
2952      /* grndint %rf, %dst */
2953      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
2954
2955      /* Restore default FPU rounding. */
2956      set_FPU_rounding_default( env );
2957
2958      return dst;
2959   }
2960
2961   ppIRExpr(e);
2962   vpanic("iselFltExpr_wrk");
2963}
2964
2965
2966/*---------------------------------------------------------*/
2967/*--- ISEL: Floating point expressions (64 bit)         ---*/
2968/*---------------------------------------------------------*/
2969
2970/* Compute a 64-bit floating point value into a register, the identity
2971   of which is returned.  As with iselIntExpr_R, the reg may be either
2972   real or virtual; in any case it must not be changed by subsequent
2973   code emitted by the caller.  */
2974
2975/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2976
2977    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2978    ----                  ---------   -----------   -----------
2979    signalling NaN        u           2047 (max)    .0uuuuu---u
2980                                                    (with at least
2981                                                     one 1 bit)
2982    quiet NaN             u           2047 (max)    .1uuuuu---u
2983
2984    negative infinity     1           2047 (max)    .000000---0
2985
2986    positive infinity     0           2047 (max)    .000000---0
2987
2988    negative zero         1           0             .000000---0
2989
2990    positive zero         0           0             .000000---0
2991*/
2992
2993static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2994{
2995   HReg r = iselDblExpr_wrk( env, e );
2996#  if 0
2997   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2998#  endif
2999   vassert(hregClass(r) == HRcFlt64);
3000   vassert(hregIsVirtual(r));
3001   return r;
3002}
3003
3004/* DO NOT CALL THIS DIRECTLY */
3005static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
3006{
3007   IRType ty = typeOfIRExpr(env->type_env,e);
3008   vassert(e);
3009   vassert(ty == Ity_F64);
3010
3011   if (e->tag == Iex_RdTmp) {
3012      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3013   }
3014
3015   if (e->tag == Iex_Const) {
3016      union { UInt u32x2[2]; ULong u64; Double f64; } u;
3017      HReg freg = newVRegF(env);
3018      vassert(sizeof(u) == 8);
3019      vassert(sizeof(u.u64) == 8);
3020      vassert(sizeof(u.f64) == 8);
3021      vassert(sizeof(u.u32x2) == 8);
3022
3023      if (e->Iex.Const.con->tag == Ico_F64) {
3024         u.f64 = e->Iex.Const.con->Ico.F64;
3025      }
3026      else if (e->Iex.Const.con->tag == Ico_F64i) {
3027         u.u64 = e->Iex.Const.con->Ico.F64i;
3028      }
3029      else
3030         vpanic("iselDblExpr(x86): const");
3031
3032      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
3033      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
3034      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
3035                                    X86AMode_IR(0, hregX86_ESP())));
3036      add_to_esp(env, 8);
3037      return freg;
3038   }
3039
3040   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3041      X86AMode* am;
3042      HReg res = newVRegF(env);
3043      vassert(e->Iex.Load.ty == Ity_F64);
3044      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3045      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
3046      return res;
3047   }
3048
3049   if (e->tag == Iex_Get) {
3050      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
3051                                  hregX86_EBP() );
3052      HReg res = newVRegF(env);
3053      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
3054      return res;
3055   }
3056
3057   if (e->tag == Iex_GetI) {
3058      X86AMode* am
3059         = genGuestArrayOffset(
3060              env, e->Iex.GetI.descr,
3061                   e->Iex.GetI.ix, e->Iex.GetI.bias );
3062      HReg res = newVRegF(env);
3063      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
3064      return res;
3065   }
3066
3067   if (e->tag == Iex_Triop) {
3068      X86FpOp fpop = Xfp_INVALID;
3069      IRTriop *triop = e->Iex.Triop.details;
3070      switch (triop->op) {
3071         case Iop_AddF64:    fpop = Xfp_ADD; break;
3072         case Iop_SubF64:    fpop = Xfp_SUB; break;
3073         case Iop_MulF64:    fpop = Xfp_MUL; break;
3074         case Iop_DivF64:    fpop = Xfp_DIV; break;
3075         case Iop_ScaleF64:  fpop = Xfp_SCALE; break;
3076         case Iop_Yl2xF64:   fpop = Xfp_YL2X; break;
3077         case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
3078         case Iop_AtanF64:   fpop = Xfp_ATAN; break;
3079         case Iop_PRemF64:   fpop = Xfp_PREM; break;
3080         case Iop_PRem1F64:  fpop = Xfp_PREM1; break;
3081         default: break;
3082      }
3083      if (fpop != Xfp_INVALID) {
3084         HReg res  = newVRegF(env);
3085         HReg srcL = iselDblExpr(env, triop->arg2);
3086         HReg srcR = iselDblExpr(env, triop->arg3);
3087         /* XXXROUNDINGFIXME */
3088         /* set roundingmode here */
3089         addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
3090	 if (fpop != Xfp_ADD && fpop != Xfp_SUB
3091	     && fpop != Xfp_MUL && fpop != Xfp_DIV)
3092            roundToF64(env, res);
3093         return res;
3094      }
3095   }
3096
3097   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
3098      HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
3099      HReg dst = newVRegF(env);
3100
3101      /* rf now holds the value to be rounded.  The first thing to do
3102         is set the FPU's rounding mode accordingly. */
3103
3104      /* Set host rounding mode */
3105      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3106
3107      /* grndint %rf, %dst */
3108      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
3109
3110      /* Restore default FPU rounding. */
3111      set_FPU_rounding_default( env );
3112
3113      return dst;
3114   }
3115
3116   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3117      HReg dst = newVRegF(env);
3118      HReg rHi,rLo;
3119      iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
3120      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3121      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3122
3123      /* Set host rounding mode */
3124      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3125
3126      addInstr(env, X86Instr_FpLdStI(
3127                       True/*load*/, 8, dst,
3128                       X86AMode_IR(0, hregX86_ESP())));
3129
3130      /* Restore default FPU rounding. */
3131      set_FPU_rounding_default( env );
3132
3133      add_to_esp(env, 8);
3134      return dst;
3135   }
3136
3137   if (e->tag == Iex_Binop) {
3138      X86FpOp fpop = Xfp_INVALID;
3139      switch (e->Iex.Binop.op) {
3140         case Iop_SinF64:  fpop = Xfp_SIN; break;
3141         case Iop_CosF64:  fpop = Xfp_COS; break;
3142         case Iop_TanF64:  fpop = Xfp_TAN; break;
3143         case Iop_2xm1F64: fpop = Xfp_2XM1; break;
3144         case Iop_SqrtF64: fpop = Xfp_SQRT; break;
3145         default: break;
3146      }
3147      if (fpop != Xfp_INVALID) {
3148         HReg res = newVRegF(env);
3149         HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
3150         /* XXXROUNDINGFIXME */
3151         /* set roundingmode here */
3152         /* Note that X86Instr_FpUnary(Xfp_TAN,..) sets the condition
3153            codes.  I don't think that matters, since this insn
3154            selector never generates such an instruction intervening
3155            between an flag-setting instruction and a flag-using
3156            instruction. */
3157         addInstr(env, X86Instr_FpUnary(fpop,src,res));
3158	 if (fpop != Xfp_SQRT
3159             && fpop != Xfp_NEG && fpop != Xfp_ABS)
3160            roundToF64(env, res);
3161         return res;
3162      }
3163   }
3164
3165   if (e->tag == Iex_Unop) {
3166      X86FpOp fpop = Xfp_INVALID;
3167      switch (e->Iex.Unop.op) {
3168         case Iop_NegF64:  fpop = Xfp_NEG; break;
3169         case Iop_AbsF64:  fpop = Xfp_ABS; break;
3170         default: break;
3171      }
3172      if (fpop != Xfp_INVALID) {
3173         HReg res = newVRegF(env);
3174         HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3175         addInstr(env, X86Instr_FpUnary(fpop,src,res));
3176         /* No need to do roundToF64(env,res) for Xfp_NEG or Xfp_ABS,
3177            but might need to do that for other unary ops. */
3178         return res;
3179      }
3180   }
3181
3182   if (e->tag == Iex_Unop) {
3183      switch (e->Iex.Unop.op) {
3184         case Iop_I32StoF64: {
3185            HReg dst = newVRegF(env);
3186            HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3187            addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3188            set_FPU_rounding_default(env);
3189            addInstr(env, X86Instr_FpLdStI(
3190                             True/*load*/, 4, dst,
3191                             X86AMode_IR(0, hregX86_ESP())));
3192	    add_to_esp(env, 4);
3193            return dst;
3194         }
3195         case Iop_ReinterpI64asF64: {
3196            /* Given an I64, produce an IEEE754 double with the same
3197               bit pattern. */
3198            HReg dst = newVRegF(env);
3199            HReg rHi, rLo;
3200	    iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
3201            /* paranoia */
3202            set_FPU_rounding_default(env);
3203            addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3204            addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3205            addInstr(env, X86Instr_FpLdSt(
3206                             True/*load*/, 8, dst,
3207                             X86AMode_IR(0, hregX86_ESP())));
3208	    add_to_esp(env, 8);
3209            return dst;
3210	 }
3211         case Iop_F32toF64: {
3212            /* this is a no-op */
3213            HReg res = iselFltExpr(env, e->Iex.Unop.arg);
3214            return res;
3215	 }
3216         default:
3217            break;
3218      }
3219   }
3220
3221   /* --------- MULTIPLEX --------- */
3222   if (e->tag == Iex_ITE) { // VFD
3223     if (ty == Ity_F64
3224         && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
3225        HReg r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
3226        HReg r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
3227        HReg dst = newVRegF(env);
3228        addInstr(env, X86Instr_FpUnary(Xfp_MOV,r1,dst));
3229        X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3230        addInstr(env, X86Instr_FpCMov(cc ^ 1, r0, dst));
3231        return dst;
3232      }
3233   }
3234
3235   ppIRExpr(e);
3236   vpanic("iselDblExpr_wrk");
3237}
3238
3239
3240/*---------------------------------------------------------*/
3241/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3242/*---------------------------------------------------------*/
3243
3244static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3245{
3246   HReg r = iselVecExpr_wrk( env, e );
3247#  if 0
3248   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3249#  endif
3250   vassert(hregClass(r) == HRcVec128);
3251   vassert(hregIsVirtual(r));
3252   return r;
3253}
3254
3255
3256/* DO NOT CALL THIS DIRECTLY */
3257static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3258{
3259
3260#  define REQUIRE_SSE1                                    \
3261      do { if (env->hwcaps == 0/*baseline, no sse*/       \
3262               ||  env->hwcaps == VEX_HWCAPS_X86_MMXEXT /*Integer SSE*/) \
3263              goto vec_fail;                              \
3264      } while (0)
3265
3266#  define REQUIRE_SSE2                                    \
3267      do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2))  \
3268              goto vec_fail;                              \
3269      } while (0)
3270
3271#  define SSE2_OR_ABOVE                                   \
3272       (env->hwcaps & VEX_HWCAPS_X86_SSE2)
3273
3274   HWord     fn = 0; /* address of helper fn, if required */
3275   MatchInfo mi;
3276   Bool      arg1isEReg = False;
3277   X86SseOp  op = Xsse_INVALID;
3278   IRType    ty = typeOfIRExpr(env->type_env,e);
3279   vassert(e);
3280   vassert(ty == Ity_V128);
3281
3282   REQUIRE_SSE1;
3283
3284   if (e->tag == Iex_RdTmp) {
3285      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3286   }
3287
3288   if (e->tag == Iex_Get) {
3289      HReg dst = newVRegV(env);
3290      addInstr(env, X86Instr_SseLdSt(
3291                       True/*load*/,
3292                       dst,
3293                       X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
3294                    )
3295              );
3296      return dst;
3297   }
3298
3299   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3300      HReg      dst = newVRegV(env);
3301      X86AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3302      addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
3303      return dst;
3304   }
3305
3306   if (e->tag == Iex_Const) {
3307      HReg dst = newVRegV(env);
3308      vassert(e->Iex.Const.con->tag == Ico_V128);
3309      addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
3310      return dst;
3311   }
3312
3313   if (e->tag == Iex_Unop) {
3314
3315   if (SSE2_OR_ABOVE) {
3316      /* 64UtoV128(LDle:I64(addr)) */
3317      DECLARE_PATTERN(p_zwiden_load64);
3318      DEFINE_PATTERN(p_zwiden_load64,
3319                     unop(Iop_64UtoV128,
3320                          IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
3321      if (matchIRExpr(&mi, p_zwiden_load64, e)) {
3322         X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
3323         HReg dst = newVRegV(env);
3324         addInstr(env, X86Instr_SseLdzLO(8, dst, am));
3325         return dst;
3326      }
3327   }
3328
3329   switch (e->Iex.Unop.op) {
3330
3331      case Iop_NotV128: {
3332         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3333         return do_sse_Not128(env, arg);
3334      }
3335
3336      case Iop_CmpNEZ64x2: {
3337         /* We can use SSE2 instructions for this. */
3338         /* Ideally, we want to do a 64Ix2 comparison against zero of
3339            the operand.  Problem is no such insn exists.  Solution
3340            therefore is to do a 32Ix4 comparison instead, and bitwise-
3341            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3342            let the not'd result of this initial comparison be a:b:c:d.
3343            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3344            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3345            giving the required result.
3346
3347            The required selection sequence is 2,3,0,1, which
3348            according to Intel's documentation means the pshufd
3349            literal value is 0xB1, that is,
3350            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3351         */
3352         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3353         HReg tmp  = newVRegV(env);
3354         HReg dst  = newVRegV(env);
3355         REQUIRE_SSE2;
3356         addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
3357         addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
3358         tmp = do_sse_Not128(env, tmp);
3359         addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
3360         addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
3361         return dst;
3362      }
3363
3364      case Iop_CmpNEZ32x4: {
3365         /* Sigh, we have to generate lousy code since this has to
3366            work on SSE1 hosts */
3367         /* basically, the idea is: for each lane:
3368               movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
3369               sbbl %r, %r               (now %r = 1Sto32(CF))
3370               movl %r, lane
3371         */
3372         Int       i;
3373         X86AMode* am;
3374         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3375         HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
3376         HReg      dst  = newVRegV(env);
3377         HReg      r32  = newVRegI(env);
3378         sub_from_esp(env, 16);
3379         addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
3380         for (i = 0; i < 4; i++) {
3381            am = X86AMode_IR(i*4, hregX86_ESP());
3382            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
3383            addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
3384            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
3385            addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
3386         }
3387         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3388         add_to_esp(env, 16);
3389         return dst;
3390      }
3391
3392      case Iop_CmpNEZ8x16:
3393      case Iop_CmpNEZ16x8: {
3394         /* We can use SSE2 instructions for this. */
3395         HReg arg;
3396         HReg vec0 = newVRegV(env);
3397         HReg vec1 = newVRegV(env);
3398         HReg dst  = newVRegV(env);
3399         X86SseOp cmpOp
3400            = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
3401                                             : Xsse_CMPEQ8;
3402         REQUIRE_SSE2;
3403         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
3404         addInstr(env, mk_vMOVsd_RR(vec0, vec1));
3405         addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
3406         /* defer arg computation to here so as to give CMPEQF as long
3407            as possible to complete */
3408         arg = iselVecExpr(env, e->Iex.Unop.arg);
3409         /* vec0 is all 0s; vec1 is all 1s */
3410         addInstr(env, mk_vMOVsd_RR(arg, dst));
3411         /* 16x8 or 8x16 comparison == */
3412         addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
3413         /* invert result */
3414         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
3415         return dst;
3416      }
3417
3418      case Iop_RecipEst32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
3419      case Iop_RSqrtEst32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
3420      do_32Fx4_unary:
3421      {
3422         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3423         HReg dst = newVRegV(env);
3424         addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
3425         return dst;
3426      }
3427
3428      case Iop_RecipEst32F0x4: op = Xsse_RCPF;   goto do_32F0x4_unary;
3429      case Iop_RSqrtEst32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
3430      case Iop_Sqrt32F0x4:     op = Xsse_SQRTF;  goto do_32F0x4_unary;
3431      do_32F0x4_unary:
3432      {
3433         /* A bit subtle.  We have to copy the arg to the result
3434            register first, because actually doing the SSE scalar insn
3435            leaves the upper 3/4 of the destination register
3436            unchanged.  Whereas the required semantics of these
3437            primops is that the upper 3/4 is simply copied in from the
3438            argument. */
3439         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3440         HReg dst = newVRegV(env);
3441         addInstr(env, mk_vMOVsd_RR(arg, dst));
3442         addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
3443         return dst;
3444      }
3445
3446      case Iop_Sqrt64F0x2:  op = Xsse_SQRTF;  goto do_64F0x2_unary;
3447      do_64F0x2_unary:
3448      {
3449         /* A bit subtle.  We have to copy the arg to the result
3450            register first, because actually doing the SSE scalar insn
3451            leaves the upper half of the destination register
3452            unchanged.  Whereas the required semantics of these
3453            primops is that the upper half is simply copied in from the
3454            argument. */
3455         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3456         HReg dst = newVRegV(env);
3457         REQUIRE_SSE2;
3458         addInstr(env, mk_vMOVsd_RR(arg, dst));
3459         addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
3460         return dst;
3461      }
3462
3463      case Iop_32UtoV128: {
3464         HReg      dst  = newVRegV(env);
3465         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3466         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3467         addInstr(env, X86Instr_Push(rmi));
3468	 addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
3469         add_to_esp(env, 4);
3470         return dst;
3471      }
3472
3473      case Iop_64UtoV128: {
3474         HReg      rHi, rLo;
3475         HReg      dst  = newVRegV(env);
3476         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3477         iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
3478         addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3479         addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3480	 addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
3481         add_to_esp(env, 8);
3482         return dst;
3483      }
3484
3485      default:
3486         break;
3487   } /* switch (e->Iex.Unop.op) */
3488   } /* if (e->tag == Iex_Unop) */
3489
3490   if (e->tag == Iex_Binop) {
3491   switch (e->Iex.Binop.op) {
3492
3493      case Iop_Sqrt64Fx2:
3494         REQUIRE_SSE2;
3495         /* fallthrough */
3496      case Iop_Sqrt32Fx4: {
3497         /* :: (rmode, vec) -> vec */
3498         HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3499         HReg dst = newVRegV(env);
3500         /* XXXROUNDINGFIXME */
3501         /* set roundingmode here */
3502         addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3503                           ? X86Instr_Sse64Fx2 : X86Instr_Sse32Fx4)
3504                       (Xsse_SQRTF, arg, dst));
3505         return dst;
3506      }
3507
3508      case Iop_SetV128lo32: {
3509         HReg dst = newVRegV(env);
3510         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3511         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3512         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3513         sub_from_esp(env, 16);
3514         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3515         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
3516         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3517         add_to_esp(env, 16);
3518         return dst;
3519      }
3520
3521      case Iop_SetV128lo64: {
3522         HReg dst = newVRegV(env);
3523         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3524         HReg srcIhi, srcIlo;
3525         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3526         X86AMode* esp4 = advance4(esp0);
3527         iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
3528         sub_from_esp(env, 16);
3529         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3530         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
3531         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
3532         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3533         add_to_esp(env, 16);
3534         return dst;
3535      }
3536
3537      case Iop_64HLtoV128: {
3538         HReg r3, r2, r1, r0;
3539         X86AMode* esp0  = X86AMode_IR(0, hregX86_ESP());
3540         X86AMode* esp4  = advance4(esp0);
3541         X86AMode* esp8  = advance4(esp4);
3542         X86AMode* esp12 = advance4(esp8);
3543         HReg dst = newVRegV(env);
3544	 /* do this via the stack (easy, convenient, etc) */
3545         sub_from_esp(env, 16);
3546         /* Do the less significant 64 bits */
3547         iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
3548         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
3549         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
3550         /* Do the more significant 64 bits */
3551         iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
3552         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
3553         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
3554	 /* Fetch result back from stack. */
3555         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3556         add_to_esp(env, 16);
3557         return dst;
3558      }
3559
3560      case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
3561      case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
3562      case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
3563      case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
3564      case Iop_Max32Fx4:   op = Xsse_MAXF;   goto do_32Fx4;
3565      case Iop_Min32Fx4:   op = Xsse_MINF;   goto do_32Fx4;
3566      do_32Fx4:
3567      {
3568         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3569         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3570         HReg dst = newVRegV(env);
3571         addInstr(env, mk_vMOVsd_RR(argL, dst));
3572         addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
3573         return dst;
3574      }
3575
3576      case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
3577      case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
3578      case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
3579      case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
3580      case Iop_Max64Fx2:   op = Xsse_MAXF;   goto do_64Fx2;
3581      case Iop_Min64Fx2:   op = Xsse_MINF;   goto do_64Fx2;
3582      do_64Fx2:
3583      {
3584         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3585         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3586         HReg dst = newVRegV(env);
3587         REQUIRE_SSE2;
3588         addInstr(env, mk_vMOVsd_RR(argL, dst));
3589         addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
3590         return dst;
3591      }
3592
3593      case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
3594      case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
3595      case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
3596      case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
3597      case Iop_Add32F0x4:   op = Xsse_ADDF;   goto do_32F0x4;
3598      case Iop_Div32F0x4:   op = Xsse_DIVF;   goto do_32F0x4;
3599      case Iop_Max32F0x4:   op = Xsse_MAXF;   goto do_32F0x4;
3600      case Iop_Min32F0x4:   op = Xsse_MINF;   goto do_32F0x4;
3601      case Iop_Mul32F0x4:   op = Xsse_MULF;   goto do_32F0x4;
3602      case Iop_Sub32F0x4:   op = Xsse_SUBF;   goto do_32F0x4;
3603      do_32F0x4: {
3604         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3605         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3606         HReg dst = newVRegV(env);
3607         addInstr(env, mk_vMOVsd_RR(argL, dst));
3608         addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
3609         return dst;
3610      }
3611
3612      case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
3613      case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
3614      case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
3615      case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
3616      case Iop_Add64F0x2:   op = Xsse_ADDF;   goto do_64F0x2;
3617      case Iop_Div64F0x2:   op = Xsse_DIVF;   goto do_64F0x2;
3618      case Iop_Max64F0x2:   op = Xsse_MAXF;   goto do_64F0x2;
3619      case Iop_Min64F0x2:   op = Xsse_MINF;   goto do_64F0x2;
3620      case Iop_Mul64F0x2:   op = Xsse_MULF;   goto do_64F0x2;
3621      case Iop_Sub64F0x2:   op = Xsse_SUBF;   goto do_64F0x2;
3622      do_64F0x2: {
3623         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3624         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3625         HReg dst = newVRegV(env);
3626         REQUIRE_SSE2;
3627         addInstr(env, mk_vMOVsd_RR(argL, dst));
3628         addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
3629         return dst;
3630      }
3631
3632      case Iop_QNarrowBin32Sto16Sx8:
3633         op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3634      case Iop_QNarrowBin16Sto8Sx16:
3635         op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3636      case Iop_QNarrowBin16Sto8Ux16:
3637         op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3638
3639      case Iop_InterleaveHI8x16:
3640         op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3641      case Iop_InterleaveHI16x8:
3642         op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3643      case Iop_InterleaveHI32x4:
3644         op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3645      case Iop_InterleaveHI64x2:
3646         op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3647
3648      case Iop_InterleaveLO8x16:
3649         op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3650      case Iop_InterleaveLO16x8:
3651         op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3652      case Iop_InterleaveLO32x4:
3653         op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3654      case Iop_InterleaveLO64x2:
3655         op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3656
3657      case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
3658      case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
3659      case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
3660      case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
3661      case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
3662      case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
3663      case Iop_Add64x2:    op = Xsse_ADD64;    goto do_SseReRg;
3664      case Iop_QAdd8Sx16:  op = Xsse_QADD8S;   goto do_SseReRg;
3665      case Iop_QAdd16Sx8:  op = Xsse_QADD16S;  goto do_SseReRg;
3666      case Iop_QAdd8Ux16:  op = Xsse_QADD8U;   goto do_SseReRg;
3667      case Iop_QAdd16Ux8:  op = Xsse_QADD16U;  goto do_SseReRg;
3668      case Iop_Avg8Ux16:   op = Xsse_AVG8U;    goto do_SseReRg;
3669      case Iop_Avg16Ux8:   op = Xsse_AVG16U;   goto do_SseReRg;
3670      case Iop_CmpEQ8x16:  op = Xsse_CMPEQ8;   goto do_SseReRg;
3671      case Iop_CmpEQ16x8:  op = Xsse_CMPEQ16;  goto do_SseReRg;
3672      case Iop_CmpEQ32x4:  op = Xsse_CMPEQ32;  goto do_SseReRg;
3673      case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S;  goto do_SseReRg;
3674      case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
3675      case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
3676      case Iop_Max16Sx8:   op = Xsse_MAX16S;   goto do_SseReRg;
3677      case Iop_Max8Ux16:   op = Xsse_MAX8U;    goto do_SseReRg;
3678      case Iop_Min16Sx8:   op = Xsse_MIN16S;   goto do_SseReRg;
3679      case Iop_Min8Ux16:   op = Xsse_MIN8U;    goto do_SseReRg;
3680      case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
3681      case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
3682      case Iop_Mul16x8:    op = Xsse_MUL16;    goto do_SseReRg;
3683      case Iop_Sub8x16:    op = Xsse_SUB8;     goto do_SseReRg;
3684      case Iop_Sub16x8:    op = Xsse_SUB16;    goto do_SseReRg;
3685      case Iop_Sub32x4:    op = Xsse_SUB32;    goto do_SseReRg;
3686      case Iop_Sub64x2:    op = Xsse_SUB64;    goto do_SseReRg;
3687      case Iop_QSub8Sx16:  op = Xsse_QSUB8S;   goto do_SseReRg;
3688      case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
3689      case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
3690      case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
3691      do_SseReRg: {
3692         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3693         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3694         HReg dst = newVRegV(env);
3695         if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
3696            REQUIRE_SSE2;
3697         if (arg1isEReg) {
3698            addInstr(env, mk_vMOVsd_RR(arg2, dst));
3699            addInstr(env, X86Instr_SseReRg(op, arg1, dst));
3700         } else {
3701            addInstr(env, mk_vMOVsd_RR(arg1, dst));
3702            addInstr(env, X86Instr_SseReRg(op, arg2, dst));
3703         }
3704         return dst;
3705      }
3706
3707      case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
3708      case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
3709      case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
3710      case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
3711      case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
3712      case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
3713      case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
3714      case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
3715      do_SseShift: {
3716         HReg      greg = iselVecExpr(env, e->Iex.Binop.arg1);
3717         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3718         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3719         HReg      ereg = newVRegV(env);
3720         HReg      dst  = newVRegV(env);
3721         REQUIRE_SSE2;
3722         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3723         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3724         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3725         addInstr(env, X86Instr_Push(rmi));
3726         addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
3727	 addInstr(env, mk_vMOVsd_RR(greg, dst));
3728         addInstr(env, X86Instr_SseReRg(op, ereg, dst));
3729         add_to_esp(env, 16);
3730         return dst;
3731      }
3732
3733      case Iop_NarrowBin32to16x8:
3734         fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3735         goto do_SseAssistedBinary;
3736      case Iop_NarrowBin16to8x16:
3737         fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3738         goto do_SseAssistedBinary;
3739      do_SseAssistedBinary: {
3740         /* As with the amd64 case (where this is copied from) we
3741            generate pretty bad code. */
3742         vassert(fn != 0);
3743         HReg dst = newVRegV(env);
3744         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3745         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3746         HReg argp = newVRegI(env);
3747         /* subl $112, %esp         -- make a space */
3748         sub_from_esp(env, 112);
3749         /* leal 48(%esp), %r_argp  -- point into it */
3750         addInstr(env, X86Instr_Lea32(X86AMode_IR(48, hregX86_ESP()),
3751                                      argp));
3752         /* andl $-16, %r_argp      -- 16-align the pointer */
3753         addInstr(env, X86Instr_Alu32R(Xalu_AND,
3754                                       X86RMI_Imm( ~(UInt)15 ),
3755                                       argp));
3756         /* Prepare 3 arg regs:
3757            leal  0(%r_argp), %eax
3758            leal 16(%r_argp), %edx
3759            leal 32(%r_argp), %ecx
3760         */
3761         addInstr(env, X86Instr_Lea32(X86AMode_IR(0, argp),
3762                                      hregX86_EAX()));
3763         addInstr(env, X86Instr_Lea32(X86AMode_IR(16, argp),
3764                                      hregX86_EDX()));
3765         addInstr(env, X86Instr_Lea32(X86AMode_IR(32, argp),
3766                                      hregX86_ECX()));
3767         /* Store the two args, at (%edx) and (%ecx):
3768            movupd  %argL, 0(%edx)
3769            movupd  %argR, 0(%ecx)
3770         */
3771         addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argL,
3772                                        X86AMode_IR(0, hregX86_EDX())));
3773         addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argR,
3774                                        X86AMode_IR(0, hregX86_ECX())));
3775         /* call the helper */
3776         addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
3777                                      3, mk_RetLoc_simple(RLPri_None) ));
3778         /* fetch the result from memory, using %r_argp, which the
3779            register allocator will keep alive across the call. */
3780         addInstr(env, X86Instr_SseLdSt(True/*isLoad*/, dst,
3781                                        X86AMode_IR(0, argp)));
3782         /* and finally, clear the space */
3783         add_to_esp(env, 112);
3784         return dst;
3785      }
3786
3787      default:
3788         break;
3789   } /* switch (e->Iex.Binop.op) */
3790   } /* if (e->tag == Iex_Binop) */
3791
3792
3793   if (e->tag == Iex_Triop) {
3794   IRTriop *triop = e->Iex.Triop.details;
3795   switch (triop->op) {
3796
3797      case Iop_Add32Fx4: op = Xsse_ADDF; goto do_32Fx4_w_rm;
3798      case Iop_Sub32Fx4: op = Xsse_SUBF; goto do_32Fx4_w_rm;
3799      case Iop_Mul32Fx4: op = Xsse_MULF; goto do_32Fx4_w_rm;
3800      case Iop_Div32Fx4: op = Xsse_DIVF; goto do_32Fx4_w_rm;
3801      do_32Fx4_w_rm:
3802      {
3803         HReg argL = iselVecExpr(env, triop->arg2);
3804         HReg argR = iselVecExpr(env, triop->arg3);
3805         HReg dst = newVRegV(env);
3806         addInstr(env, mk_vMOVsd_RR(argL, dst));
3807         /* XXXROUNDINGFIXME */
3808         /* set roundingmode here */
3809         addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
3810         return dst;
3811      }
3812
3813      case Iop_Add64Fx2: op = Xsse_ADDF; goto do_64Fx2_w_rm;
3814      case Iop_Sub64Fx2: op = Xsse_SUBF; goto do_64Fx2_w_rm;
3815      case Iop_Mul64Fx2: op = Xsse_MULF; goto do_64Fx2_w_rm;
3816      case Iop_Div64Fx2: op = Xsse_DIVF; goto do_64Fx2_w_rm;
3817      do_64Fx2_w_rm:
3818      {
3819         HReg argL = iselVecExpr(env, triop->arg2);
3820         HReg argR = iselVecExpr(env, triop->arg3);
3821         HReg dst = newVRegV(env);
3822         REQUIRE_SSE2;
3823         addInstr(env, mk_vMOVsd_RR(argL, dst));
3824         /* XXXROUNDINGFIXME */
3825         /* set roundingmode here */
3826         addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
3827         return dst;
3828      }
3829
3830      default:
3831         break;
3832   } /* switch (triop->op) */
3833   } /* if (e->tag == Iex_Triop) */
3834
3835
3836   if (e->tag == Iex_ITE) { // VFD
3837      HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
3838      HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
3839      HReg dst = newVRegV(env);
3840      addInstr(env, mk_vMOVsd_RR(r1,dst));
3841      X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3842      addInstr(env, X86Instr_SseCMov(cc ^ 1, r0, dst));
3843      return dst;
3844   }
3845
3846   vec_fail:
3847   vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
3848              LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
3849   ppIRExpr(e);
3850   vpanic("iselVecExpr_wrk");
3851
3852#  undef REQUIRE_SSE1
3853#  undef REQUIRE_SSE2
3854#  undef SSE2_OR_ABOVE
3855}
3856
3857
3858/*---------------------------------------------------------*/
3859/*--- ISEL: Statements                                  ---*/
3860/*---------------------------------------------------------*/
3861
3862static void iselStmt ( ISelEnv* env, IRStmt* stmt )
3863{
3864   if (vex_traceflags & VEX_TRACE_VCODE) {
3865      vex_printf("\n-- ");
3866      ppIRStmt(stmt);
3867      vex_printf("\n");
3868   }
3869
3870   switch (stmt->tag) {
3871
3872   /* --------- STORE --------- */
3873   case Ist_Store: {
3874      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
3875      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
3876      IREndness end   = stmt->Ist.Store.end;
3877
3878      if (tya != Ity_I32 || end != Iend_LE)
3879         goto stmt_fail;
3880
3881      if (tyd == Ity_I32) {
3882         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3883         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
3884         addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
3885         return;
3886      }
3887      if (tyd == Ity_I8 || tyd == Ity_I16) {
3888         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3889         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
3890         addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
3891                                       r,am ));
3892         return;
3893      }
3894      if (tyd == Ity_F64) {
3895         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3896         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
3897         addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
3898         return;
3899      }
3900      if (tyd == Ity_F32) {
3901         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3902         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
3903         addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
3904         return;
3905      }
3906      if (tyd == Ity_I64) {
3907         HReg vHi, vLo, rA;
3908         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
3909         rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
3910         addInstr(env, X86Instr_Alu32M(
3911                          Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
3912         addInstr(env, X86Instr_Alu32M(
3913                          Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
3914         return;
3915      }
3916      if (tyd == Ity_V128) {
3917         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3918         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
3919         addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
3920         return;
3921      }
3922      break;
3923   }
3924
3925   /* --------- PUT --------- */
3926   case Ist_Put: {
3927      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
3928      if (ty == Ity_I32) {
3929         /* We're going to write to memory, so compute the RHS into an
3930            X86RI. */
3931         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
3932         addInstr(env,
3933                  X86Instr_Alu32M(
3934                     Xalu_MOV,
3935                     ri,
3936                     X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
3937                 ));
3938         return;
3939      }
3940      if (ty == Ity_I8 || ty == Ity_I16) {
3941         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
3942         addInstr(env, X86Instr_Store(
3943                          toUChar(ty==Ity_I8 ? 1 : 2),
3944                          r,
3945                          X86AMode_IR(stmt->Ist.Put.offset,
3946                                      hregX86_EBP())));
3947         return;
3948      }
3949      if (ty == Ity_I64) {
3950         HReg vHi, vLo;
3951         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3952         X86AMode* am4 = advance4(am);
3953         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
3954         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
3955         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
3956         return;
3957      }
3958      if (ty == Ity_V128) {
3959         HReg      vec = iselVecExpr(env, stmt->Ist.Put.data);
3960         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3961         addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
3962         return;
3963      }
3964      if (ty == Ity_F32) {
3965         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
3966         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3967         set_FPU_rounding_default(env); /* paranoia */
3968         addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
3969         return;
3970      }
3971      if (ty == Ity_F64) {
3972         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
3973         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3974         set_FPU_rounding_default(env); /* paranoia */
3975         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
3976         return;
3977      }
3978      break;
3979   }
3980
3981   /* --------- Indexed PUT --------- */
3982   case Ist_PutI: {
3983      IRPutI *puti = stmt->Ist.PutI.details;
3984
3985      X86AMode* am
3986         = genGuestArrayOffset(
3987              env, puti->descr,
3988                   puti->ix, puti->bias );
3989
3990      IRType ty = typeOfIRExpr(env->type_env, puti->data);
3991      if (ty == Ity_F64) {
3992         HReg val = iselDblExpr(env, puti->data);
3993         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
3994         return;
3995      }
3996      if (ty == Ity_I8) {
3997         HReg r = iselIntExpr_R(env, puti->data);
3998         addInstr(env, X86Instr_Store( 1, r, am ));
3999         return;
4000      }
4001      if (ty == Ity_I32) {
4002         HReg r = iselIntExpr_R(env, puti->data);
4003         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
4004         return;
4005      }
4006      if (ty == Ity_I64) {
4007         HReg rHi, rLo;
4008         X86AMode* am4 = advance4(am);
4009         iselInt64Expr(&rHi, &rLo, env, puti->data);
4010         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
4011         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
4012         return;
4013      }
4014      break;
4015   }
4016
4017   /* --------- TMP --------- */
4018   case Ist_WrTmp: {
4019      IRTemp tmp = stmt->Ist.WrTmp.tmp;
4020      IRType ty = typeOfIRTemp(env->type_env, tmp);
4021
4022      /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
4023         compute it into an AMode and then use LEA.  This usually
4024         produces fewer instructions, often because (for memcheck
4025         created IR) we get t = address-expression, (t is later used
4026         twice) and so doing this naturally turns address-expression
4027         back into an X86 amode. */
4028      if (ty == Ity_I32
4029          && stmt->Ist.WrTmp.data->tag == Iex_Binop
4030          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
4031         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4032         HReg dst = lookupIRTemp(env, tmp);
4033         if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
4034            /* Hmm, iselIntExpr_AMode wimped out and just computed the
4035               value into a register.  Just emit a normal reg-reg move
4036               so reg-alloc can coalesce it away in the usual way. */
4037            HReg src = am->Xam.IR.reg;
4038            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
4039         } else {
4040            addInstr(env, X86Instr_Lea32(am,dst));
4041         }
4042         return;
4043      }
4044
4045      if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
4046         X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4047         HReg dst = lookupIRTemp(env, tmp);
4048         addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
4049         return;
4050      }
4051      if (ty == Ity_I64) {
4052         HReg rHi, rLo, dstHi, dstLo;
4053         iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4054         lookupIRTemp64( &dstHi, &dstLo, env, tmp);
4055         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4056         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4057         return;
4058      }
4059      if (ty == Ity_I1) {
4060         X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4061         HReg dst = lookupIRTemp(env, tmp);
4062         addInstr(env, X86Instr_Set32(cond, dst));
4063         return;
4064      }
4065      if (ty == Ity_F64) {
4066         HReg dst = lookupIRTemp(env, tmp);
4067         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4068         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
4069         return;
4070      }
4071      if (ty == Ity_F32) {
4072         HReg dst = lookupIRTemp(env, tmp);
4073         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4074         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
4075         return;
4076      }
4077      if (ty == Ity_V128) {
4078         HReg dst = lookupIRTemp(env, tmp);
4079         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4080         addInstr(env, mk_vMOVsd_RR(src,dst));
4081         return;
4082      }
4083      break;
4084   }
4085
4086   /* --------- Call to DIRTY helper --------- */
4087   case Ist_Dirty: {
4088      IRDirty* d = stmt->Ist.Dirty.details;
4089
4090      /* Figure out the return type, if any. */
4091      IRType retty = Ity_INVALID;
4092      if (d->tmp != IRTemp_INVALID)
4093         retty = typeOfIRTemp(env->type_env, d->tmp);
4094
4095      Bool retty_ok = False;
4096      switch (retty) {
4097         case Ity_INVALID: /* function doesn't return anything */
4098         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4099         case Ity_V128:
4100            retty_ok = True; break;
4101         default:
4102            break;
4103      }
4104      if (!retty_ok)
4105         break; /* will go to stmt_fail: */
4106
4107      /* Marshal args, do the call, and set the return value to
4108         0x555..555 if this is a conditional call that returns a value
4109         and the call is skipped. */
4110      UInt   addToSp = 0;
4111      RetLoc rloc    = mk_RetLoc_INVALID();
4112      doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4113      vassert(is_sane_RetLoc(rloc));
4114
4115      /* Now figure out what to do with the returned value, if any. */
4116      switch (retty) {
4117         case Ity_INVALID: {
4118            /* No return value.  Nothing to do. */
4119            vassert(d->tmp == IRTemp_INVALID);
4120            vassert(rloc.pri == RLPri_None);
4121            vassert(addToSp == 0);
4122            return;
4123         }
4124         case Ity_I32: case Ity_I16: case Ity_I8: {
4125            /* The returned value is in %eax.  Park it in the register
4126               associated with tmp. */
4127            vassert(rloc.pri == RLPri_Int);
4128            vassert(addToSp == 0);
4129            HReg dst = lookupIRTemp(env, d->tmp);
4130            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
4131            return;
4132         }
4133         case Ity_I64: {
4134            /* The returned value is in %edx:%eax.  Park it in the
4135               register-pair associated with tmp. */
4136            vassert(rloc.pri == RLPri_2Int);
4137            vassert(addToSp == 0);
4138            HReg dstHi, dstLo;
4139            lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
4140            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
4141            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
4142            return;
4143         }
4144         case Ity_V128: {
4145            /* The returned value is on the stack, and *retloc tells
4146               us where.  Fish it off the stack and then move the
4147               stack pointer upwards to clear it, as directed by
4148               doHelperCall. */
4149            vassert(rloc.pri == RLPri_V128SpRel);
4150            vassert(addToSp >= 16);
4151            HReg      dst = lookupIRTemp(env, d->tmp);
4152            X86AMode* am  = X86AMode_IR(rloc.spOff, hregX86_ESP());
4153            addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
4154            add_to_esp(env, addToSp);
4155            return;
4156         }
4157         default:
4158            /*NOTREACHED*/
4159            vassert(0);
4160      }
4161      break;
4162   }
4163
4164   /* --------- MEM FENCE --------- */
4165   case Ist_MBE:
4166      switch (stmt->Ist.MBE.event) {
4167         case Imbe_Fence:
4168            addInstr(env, X86Instr_MFence(env->hwcaps));
4169            return;
4170         default:
4171            break;
4172      }
4173      break;
4174
4175   /* --------- ACAS --------- */
4176   case Ist_CAS:
4177      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4178         /* "normal" singleton CAS */
4179         UChar  sz;
4180         IRCAS* cas = stmt->Ist.CAS.details;
4181         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4182         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
4183         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
4184         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4185         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4186         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4187         vassert(cas->expdHi == NULL);
4188         vassert(cas->dataHi == NULL);
4189         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4190         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4191         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4192         switch (ty) {
4193            case Ity_I32: sz = 4; break;
4194            case Ity_I16: sz = 2; break;
4195            case Ity_I8:  sz = 1; break;
4196            default: goto unhandled_cas;
4197         }
4198         addInstr(env, X86Instr_ACAS(am, sz));
4199         addInstr(env,
4200                  X86Instr_CMov32(Xcc_NZ,
4201                                  X86RM_Reg(hregX86_EAX()), rOldLo));
4202         return;
4203      } else {
4204         /* double CAS */
4205         IRCAS* cas = stmt->Ist.CAS.details;
4206         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4207         /* only 32-bit allowed in this case */
4208         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
4209         /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
4210         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
4211         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4212         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4213         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4214         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4215         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4216         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4217         if (ty != Ity_I32)
4218            goto unhandled_cas;
4219         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4220         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4221         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
4222         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4223         addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
4224         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4225         addInstr(env, X86Instr_DACAS(am));
4226         addInstr(env,
4227                  X86Instr_CMov32(Xcc_NZ,
4228                                  X86RM_Reg(hregX86_EDX()), rOldHi));
4229         addInstr(env,
4230                  X86Instr_CMov32(Xcc_NZ,
4231                                  X86RM_Reg(hregX86_EAX()), rOldLo));
4232         return;
4233      }
4234      unhandled_cas:
4235      break;
4236
4237   /* --------- INSTR MARK --------- */
4238   /* Doesn't generate any executable code ... */
4239   case Ist_IMark:
4240       return;
4241
4242   /* --------- NO-OP --------- */
4243   /* Fairly self-explanatory, wouldn't you say? */
4244   case Ist_NoOp:
4245       return;
4246
4247   /* --------- EXIT --------- */
4248   case Ist_Exit: {
4249      if (stmt->Ist.Exit.dst->tag != Ico_U32)
4250         vpanic("iselStmt(x86): Ist_Exit: dst is not a 32-bit value");
4251
4252      X86CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
4253      X86AMode*   amEIP = X86AMode_IR(stmt->Ist.Exit.offsIP,
4254                                      hregX86_EBP());
4255
4256      /* Case: boring transfer to known address */
4257      if (stmt->Ist.Exit.jk == Ijk_Boring) {
4258         if (env->chainingAllowed) {
4259            /* .. almost always true .. */
4260            /* Skip the event check at the dst if this is a forwards
4261               edge. */
4262            Bool toFastEP
4263               = ((Addr32)stmt->Ist.Exit.dst->Ico.U32) > env->max_ga;
4264            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4265            addInstr(env, X86Instr_XDirect(stmt->Ist.Exit.dst->Ico.U32,
4266                                           amEIP, cc, toFastEP));
4267         } else {
4268            /* .. very occasionally .. */
4269            /* We can't use chaining, so ask for an assisted transfer,
4270               as that's the only alternative that is allowable. */
4271            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4272            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, Ijk_Boring));
4273         }
4274         return;
4275      }
4276
4277      /* Case: assisted transfer to arbitrary address */
4278      switch (stmt->Ist.Exit.jk) {
4279         /* Keep this list in sync with that in iselNext below */
4280         case Ijk_ClientReq:
4281         case Ijk_EmWarn:
4282         case Ijk_MapFail:
4283         case Ijk_NoDecode:
4284         case Ijk_NoRedir:
4285         case Ijk_SigSEGV:
4286         case Ijk_SigTRAP:
4287         case Ijk_Sys_int128:
4288         case Ijk_Sys_int129:
4289         case Ijk_Sys_int130:
4290         case Ijk_Sys_int145:
4291         case Ijk_Sys_int210:
4292         case Ijk_Sys_syscall:
4293         case Ijk_Sys_sysenter:
4294         case Ijk_InvalICache:
4295         case Ijk_Yield:
4296         {
4297            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4298            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, stmt->Ist.Exit.jk));
4299            return;
4300         }
4301         default:
4302            break;
4303      }
4304
4305      /* Do we ever expect to see any other kind? */
4306      goto stmt_fail;
4307   }
4308
4309   default: break;
4310   }
4311  stmt_fail:
4312   ppIRStmt(stmt);
4313   vpanic("iselStmt");
4314}
4315
4316
4317/*---------------------------------------------------------*/
4318/*--- ISEL: Basic block terminators (Nexts)             ---*/
4319/*---------------------------------------------------------*/
4320
4321static void iselNext ( ISelEnv* env,
4322                       IRExpr* next, IRJumpKind jk, Int offsIP )
4323{
4324   if (vex_traceflags & VEX_TRACE_VCODE) {
4325      vex_printf( "\n-- PUT(%d) = ", offsIP);
4326      ppIRExpr( next );
4327      vex_printf( "; exit-");
4328      ppIRJumpKind(jk);
4329      vex_printf( "\n");
4330   }
4331
4332   /* Case: boring transfer to known address */
4333   if (next->tag == Iex_Const) {
4334      IRConst* cdst = next->Iex.Const.con;
4335      vassert(cdst->tag == Ico_U32);
4336      if (jk == Ijk_Boring || jk == Ijk_Call) {
4337         /* Boring transfer to known address */
4338         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4339         if (env->chainingAllowed) {
4340            /* .. almost always true .. */
4341            /* Skip the event check at the dst if this is a forwards
4342               edge. */
4343            Bool toFastEP
4344               = ((Addr32)cdst->Ico.U32) > env->max_ga;
4345            if (0) vex_printf("%s", toFastEP ? "X" : ".");
4346            addInstr(env, X86Instr_XDirect(cdst->Ico.U32,
4347                                           amEIP, Xcc_ALWAYS,
4348                                           toFastEP));
4349         } else {
4350            /* .. very occasionally .. */
4351            /* We can't use chaining, so ask for an assisted transfer,
4352               as that's the only alternative that is allowable. */
4353            HReg r = iselIntExpr_R(env, next);
4354            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4355                                             Ijk_Boring));
4356         }
4357         return;
4358      }
4359   }
4360
4361   /* Case: call/return (==boring) transfer to any address */
4362   switch (jk) {
4363      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4364         HReg      r     = iselIntExpr_R(env, next);
4365         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4366         if (env->chainingAllowed) {
4367            addInstr(env, X86Instr_XIndir(r, amEIP, Xcc_ALWAYS));
4368         } else {
4369            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4370                                               Ijk_Boring));
4371         }
4372         return;
4373      }
4374      default:
4375         break;
4376   }
4377
4378   /* Case: assisted transfer to arbitrary address */
4379   switch (jk) {
4380      /* Keep this list in sync with that for Ist_Exit above */
4381      case Ijk_ClientReq:
4382      case Ijk_EmWarn:
4383      case Ijk_MapFail:
4384      case Ijk_NoDecode:
4385      case Ijk_NoRedir:
4386      case Ijk_SigSEGV:
4387      case Ijk_SigTRAP:
4388      case Ijk_Sys_int128:
4389      case Ijk_Sys_int129:
4390      case Ijk_Sys_int130:
4391      case Ijk_Sys_int145:
4392      case Ijk_Sys_int210:
4393      case Ijk_Sys_syscall:
4394      case Ijk_Sys_sysenter:
4395      case Ijk_InvalICache:
4396      case Ijk_Yield:
4397      {
4398         HReg      r     = iselIntExpr_R(env, next);
4399         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4400         addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS, jk));
4401         return;
4402      }
4403      default:
4404         break;
4405   }
4406
4407   vex_printf( "\n-- PUT(%d) = ", offsIP);
4408   ppIRExpr( next );
4409   vex_printf( "; exit-");
4410   ppIRJumpKind(jk);
4411   vex_printf( "\n");
4412   vassert(0); // are we expecting any other kind?
4413}
4414
4415
4416/*---------------------------------------------------------*/
4417/*--- Insn selector top-level                           ---*/
4418/*---------------------------------------------------------*/
4419
4420/* Translate an entire SB to x86 code. */
4421
4422HInstrArray* iselSB_X86 ( const IRSB* bb,
4423                          VexArch      arch_host,
4424                          const VexArchInfo* archinfo_host,
4425                          const VexAbiInfo*  vbi/*UNUSED*/,
4426                          Int offs_Host_EvC_Counter,
4427                          Int offs_Host_EvC_FailAddr,
4428                          Bool chainingAllowed,
4429                          Bool addProfInc,
4430                          Addr max_ga )
4431{
4432   Int      i, j;
4433   HReg     hreg, hregHI;
4434   ISelEnv* env;
4435   UInt     hwcaps_host = archinfo_host->hwcaps;
4436   X86AMode *amCounter, *amFailAddr;
4437
4438   /* sanity ... */
4439   vassert(arch_host == VexArchX86);
4440   vassert(0 == (hwcaps_host
4441                 & ~(VEX_HWCAPS_X86_MMXEXT
4442                     | VEX_HWCAPS_X86_SSE1
4443                     | VEX_HWCAPS_X86_SSE2
4444                     | VEX_HWCAPS_X86_SSE3
4445                     | VEX_HWCAPS_X86_LZCNT)));
4446
4447   /* Check that the host's endianness is as expected. */
4448   vassert(archinfo_host->endness == VexEndnessLE);
4449
4450   /* Make up an initial environment to use. */
4451   env = LibVEX_Alloc_inline(sizeof(ISelEnv));
4452   env->vreg_ctr = 0;
4453
4454   /* Set up output code array. */
4455   env->code = newHInstrArray();
4456
4457   /* Copy BB's type env. */
4458   env->type_env = bb->tyenv;
4459
4460   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
4461      change as we go along. */
4462   env->n_vregmap = bb->tyenv->types_used;
4463   env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4464   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4465
4466   /* and finally ... */
4467   env->chainingAllowed = chainingAllowed;
4468   env->hwcaps          = hwcaps_host;
4469   env->max_ga          = max_ga;
4470
4471   /* For each IR temporary, allocate a suitably-kinded virtual
4472      register. */
4473   j = 0;
4474   for (i = 0; i < env->n_vregmap; i++) {
4475      hregHI = hreg = INVALID_HREG;
4476      switch (bb->tyenv->types[i]) {
4477         case Ity_I1:
4478         case Ity_I8:
4479         case Ity_I16:
4480         case Ity_I32:  hreg   = mkHReg(True, HRcInt32,  0, j++); break;
4481         case Ity_I64:  hreg   = mkHReg(True, HRcInt32,  0, j++);
4482                        hregHI = mkHReg(True, HRcInt32,  0, j++); break;
4483         case Ity_F32:
4484         case Ity_F64:  hreg   = mkHReg(True, HRcFlt64,  0, j++); break;
4485         case Ity_V128: hreg   = mkHReg(True, HRcVec128, 0, j++); break;
4486         default: ppIRType(bb->tyenv->types[i]);
4487                  vpanic("iselBB: IRTemp type");
4488      }
4489      env->vregmap[i]   = hreg;
4490      env->vregmapHI[i] = hregHI;
4491   }
4492   env->vreg_ctr = j;
4493
4494   /* The very first instruction must be an event check. */
4495   amCounter  = X86AMode_IR(offs_Host_EvC_Counter,  hregX86_EBP());
4496   amFailAddr = X86AMode_IR(offs_Host_EvC_FailAddr, hregX86_EBP());
4497   addInstr(env, X86Instr_EvCheck(amCounter, amFailAddr));
4498
4499   /* Possibly a block counter increment (for profiling).  At this
4500      point we don't know the address of the counter, so just pretend
4501      it is zero.  It will have to be patched later, but before this
4502      translation is used, by a call to LibVEX_patchProfCtr. */
4503   if (addProfInc) {
4504      addInstr(env, X86Instr_ProfInc());
4505   }
4506
4507   /* Ok, finally we can iterate over the statements. */
4508   for (i = 0; i < bb->stmts_used; i++)
4509      iselStmt(env, bb->stmts[i]);
4510
4511   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
4512
4513   /* record the number of vregs we used. */
4514   env->code->n_vregs = env->vreg_ctr;
4515   return env->code;
4516}
4517
4518
4519/*---------------------------------------------------------------*/
4520/*--- end                                     host_x86_isel.c ---*/
4521/*---------------------------------------------------------------*/
4522