1
2/*---------------------------------------------------------------*/
3/*--- begin                                   host_x86_isel.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2015 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_ir.h"
38#include "libvex.h"
39
40#include "ir_match.h"
41#include "main_util.h"
42#include "main_globals.h"
43#include "host_generic_regs.h"
44#include "host_generic_simd64.h"
45#include "host_generic_simd128.h"
46#include "host_x86_defs.h"
47
48/* TODO 21 Apr 2005:
49
50   -- (Really an assembler issue) don't emit CMov32 as a cmov
51      insn, since that's expensive on P4 and conditional branch
52      is cheaper if (as we expect) the condition is highly predictable
53
54   -- preserve xmm registers across function calls (by declaring them
55      as trashed by call insns)
56
57   -- preserve x87 ST stack discipline across function calls.  Sigh.
58
59   -- Check doHelperCall: if a call is conditional, we cannot safely
60      compute any regparm args directly to registers.  Hence, the
61      fast-regparm marshalling should be restricted to unconditional
62      calls only.
63*/
64
65/*---------------------------------------------------------*/
66/*--- x87 control word stuff                            ---*/
67/*---------------------------------------------------------*/
68
69/* Vex-generated code expects to run with the FPU set as follows: all
70   exceptions masked, round-to-nearest, precision = 53 bits.  This
71   corresponds to a FPU control word value of 0x027F.
72
73   Similarly the SSE control word (%mxcsr) should be 0x1F80.
74
75   %fpucw and %mxcsr should have these values on entry to
76   Vex-generated code, and should those values should be
77   unchanged at exit.
78*/
79
80#define DEFAULT_FPUCW 0x027F
81
82/* debugging only, do not use */
83/* define DEFAULT_FPUCW 0x037F */
84
85
86/*---------------------------------------------------------*/
87/*--- misc helpers                                      ---*/
88/*---------------------------------------------------------*/
89
90/* These are duplicated in guest-x86/toIR.c */
91static IRExpr* unop ( IROp op, IRExpr* a )
92{
93   return IRExpr_Unop(op, a);
94}
95
96static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
97{
98   return IRExpr_Binop(op, a1, a2);
99}
100
101static IRExpr* bind ( Int binder )
102{
103   return IRExpr_Binder(binder);
104}
105
106static Bool isZeroU8 ( IRExpr* e )
107{
108   return e->tag == Iex_Const
109          && e->Iex.Const.con->tag == Ico_U8
110          && e->Iex.Const.con->Ico.U8 == 0;
111}
112
113static Bool isZeroU32 ( IRExpr* e )
114{
115   return e->tag == Iex_Const
116          && e->Iex.Const.con->tag == Ico_U32
117          && e->Iex.Const.con->Ico.U32 == 0;
118}
119
120//static Bool isZeroU64 ( IRExpr* e )
121//{
122//   return e->tag == Iex_Const
123//          && e->Iex.Const.con->tag == Ico_U64
124//          && e->Iex.Const.con->Ico.U64 == 0ULL;
125//}
126
127
128/*---------------------------------------------------------*/
129/*--- ISelEnv                                           ---*/
130/*---------------------------------------------------------*/
131
132/* This carries around:
133
134   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
135     might encounter.  This is computed before insn selection starts,
136     and does not change.
137
138   - A mapping from IRTemp to HReg.  This tells the insn selector
139     which virtual register(s) are associated with each IRTemp
140     temporary.  This is computed before insn selection starts, and
141     does not change.  We expect this mapping to map precisely the
142     same set of IRTemps as the type mapping does.
143
144        - vregmap   holds the primary register for the IRTemp.
145        - vregmapHI is only used for 64-bit integer-typed
146             IRTemps.  It holds the identity of a second
147             32-bit virtual HReg, which holds the high half
148             of the value.
149
150   - The code array, that is, the insns selected so far.
151
152   - A counter, for generating new virtual registers.
153
154   - The host subarchitecture we are selecting insns for.
155     This is set at the start and does not change.
156
157   - A Bool for indicating whether we may generate chain-me
158     instructions for control flow transfers, or whether we must use
159     XAssisted.
160
161   - The maximum guest address of any guest insn in this block.
162     Actually, the address of the highest-addressed byte from any insn
163     in this block.  Is set at the start and does not change.  This is
164     used for detecting jumps which are definitely forward-edges from
165     this block, and therefore can be made (chained) to the fast entry
166     point of the destination, thereby avoiding the destination's
167     event check.
168
169   Note, this is all (well, mostly) host-independent.
170*/
171
172typedef
173   struct {
174      /* Constant -- are set at the start and do not change. */
175      IRTypeEnv*   type_env;
176
177      HReg*        vregmap;
178      HReg*        vregmapHI;
179      Int          n_vregmap;
180
181      UInt         hwcaps;
182
183      Bool         chainingAllowed;
184      Addr32       max_ga;
185
186      /* These are modified as we go along. */
187      HInstrArray* code;
188      Int          vreg_ctr;
189   }
190   ISelEnv;
191
192
193static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
194{
195   vassert(tmp >= 0);
196   vassert(tmp < env->n_vregmap);
197   return env->vregmap[tmp];
198}
199
200static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
201{
202   vassert(tmp >= 0);
203   vassert(tmp < env->n_vregmap);
204   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
205   *vrLO = env->vregmap[tmp];
206   *vrHI = env->vregmapHI[tmp];
207}
208
209static void addInstr ( ISelEnv* env, X86Instr* instr )
210{
211   addHInstr(env->code, instr);
212   if (vex_traceflags & VEX_TRACE_VCODE) {
213      ppX86Instr(instr, False);
214      vex_printf("\n");
215   }
216}
217
218static HReg newVRegI ( ISelEnv* env )
219{
220   HReg reg = mkHReg(True/*virtual reg*/, HRcInt32, 0/*enc*/, env->vreg_ctr);
221   env->vreg_ctr++;
222   return reg;
223}
224
225static HReg newVRegF ( ISelEnv* env )
226{
227   HReg reg = mkHReg(True/*virtual reg*/, HRcFlt64, 0/*enc*/, env->vreg_ctr);
228   env->vreg_ctr++;
229   return reg;
230}
231
232static HReg newVRegV ( ISelEnv* env )
233{
234   HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
235   env->vreg_ctr++;
236   return reg;
237}
238
239
240/*---------------------------------------------------------*/
241/*--- ISEL: Forward declarations                        ---*/
242/*---------------------------------------------------------*/
243
244/* These are organised as iselXXX and iselXXX_wrk pairs.  The
245   iselXXX_wrk do the real work, but are not to be called directly.
246   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
247   checks that all returned registers are virtual.  You should not
248   call the _wrk version directly.
249*/
250static X86RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
251static X86RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
252
253static X86RI*      iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
254static X86RI*      iselIntExpr_RI     ( ISelEnv* env, IRExpr* e );
255
256static X86RM*      iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
257static X86RM*      iselIntExpr_RM     ( ISelEnv* env, IRExpr* e );
258
259static HReg        iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
260static HReg        iselIntExpr_R     ( ISelEnv* env, IRExpr* e );
261
262static X86AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
263static X86AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
264
265static void        iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
266                                       ISelEnv* env, IRExpr* e );
267static void        iselInt64Expr     ( HReg* rHi, HReg* rLo,
268                                       ISelEnv* env, IRExpr* e );
269
270static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
271static X86CondCode iselCondCode     ( ISelEnv* env, IRExpr* e );
272
273static HReg        iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
274static HReg        iselDblExpr     ( ISelEnv* env, IRExpr* e );
275
276static HReg        iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
277static HReg        iselFltExpr     ( ISelEnv* env, IRExpr* e );
278
279static HReg        iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
280static HReg        iselVecExpr     ( ISelEnv* env, IRExpr* e );
281
282
283/*---------------------------------------------------------*/
284/*--- ISEL: Misc helpers                                ---*/
285/*---------------------------------------------------------*/
286
287/* Make a int reg-reg move. */
288
289static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
290{
291   vassert(hregClass(src) == HRcInt32);
292   vassert(hregClass(dst) == HRcInt32);
293   return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
294}
295
296
297/* Make a vector reg-reg move. */
298
299static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
300{
301   vassert(hregClass(src) == HRcVec128);
302   vassert(hregClass(dst) == HRcVec128);
303   return X86Instr_SseReRg(Xsse_MOV, src, dst);
304}
305
306/* Advance/retreat %esp by n. */
307
308static void add_to_esp ( ISelEnv* env, Int n )
309{
310   vassert(n > 0 && n < 256 && (n%4) == 0);
311   addInstr(env,
312            X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
313}
314
315static void sub_from_esp ( ISelEnv* env, Int n )
316{
317   vassert(n > 0 && n < 256 && (n%4) == 0);
318   addInstr(env,
319            X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
320}
321
322
323/* Given an amode, return one which references 4 bytes further
324   along. */
325
326static X86AMode* advance4 ( X86AMode* am )
327{
328   X86AMode* am4 = dopyX86AMode(am);
329   switch (am4->tag) {
330      case Xam_IRRS:
331         am4->Xam.IRRS.imm += 4; break;
332      case Xam_IR:
333         am4->Xam.IR.imm += 4; break;
334      default:
335         vpanic("advance4(x86,host)");
336   }
337   return am4;
338}
339
340
341/* Push an arg onto the host stack, in preparation for a call to a
342   helper function of some kind.  Returns the number of 32-bit words
343   pushed.  If we encounter an IRExpr_VECRET() then we expect that
344   r_vecRetAddr will be a valid register, that holds the relevant
345   address.
346*/
347static Int pushArg ( ISelEnv* env, IRExpr* arg, HReg r_vecRetAddr )
348{
349   if (UNLIKELY(arg->tag == Iex_VECRET)) {
350      vassert(0); //ATC
351      vassert(!hregIsInvalid(r_vecRetAddr));
352      addInstr(env, X86Instr_Push(X86RMI_Reg(r_vecRetAddr)));
353      return 1;
354   }
355   if (UNLIKELY(arg->tag == Iex_BBPTR)) {
356      addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
357      return 1;
358   }
359   /* Else it's a "normal" expression. */
360   IRType arg_ty = typeOfIRExpr(env->type_env, arg);
361   if (arg_ty == Ity_I32) {
362      addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
363      return 1;
364   } else
365   if (arg_ty == Ity_I64) {
366      HReg rHi, rLo;
367      iselInt64Expr(&rHi, &rLo, env, arg);
368      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
369      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
370      return 2;
371   }
372   ppIRExpr(arg);
373   vpanic("pushArg(x86): can't handle arg of this type");
374}
375
376
377/* Complete the call to a helper function, by calling the
378   helper and clearing the args off the stack. */
379
380static
381void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
382                              IRCallee* cee, Int n_arg_ws,
383                              RetLoc rloc )
384{
385   /* Complication.  Need to decide which reg to use as the fn address
386      pointer, in a way that doesn't trash regparm-passed
387      parameters. */
388   vassert(sizeof(void*) == 4);
389
390   addInstr(env, X86Instr_Call( cc, (Addr)cee->addr,
391                                cee->regparms, rloc));
392   if (n_arg_ws > 0)
393      add_to_esp(env, 4*n_arg_ws);
394}
395
396
397/* Used only in doHelperCall.  See big comment in doHelperCall re
398   handling of regparm args.  This function figures out whether
399   evaluation of an expression might require use of a fixed register.
400   If in doubt return True (safe but suboptimal).
401*/
402static
403Bool mightRequireFixedRegs ( IRExpr* e )
404{
405   if (UNLIKELY(is_IRExpr_VECRET_or_BBPTR(e))) {
406      // These are always "safe" -- either a copy of %esp in some
407      // arbitrary vreg, or a copy of %ebp, respectively.
408      return False;
409   }
410   /* Else it's a "normal" expression. */
411   switch (e->tag) {
412      case Iex_RdTmp: case Iex_Const: case Iex_Get:
413         return False;
414      default:
415         return True;
416   }
417}
418
419
420/* Do a complete function call.  |guard| is a Ity_Bit expression
421   indicating whether or not the call happens.  If guard==NULL, the
422   call is unconditional.  |retloc| is set to indicate where the
423   return value is after the call.  The caller (of this fn) must
424   generate code to add |stackAdjustAfterCall| to the stack pointer
425   after the call is done. */
426
427static
428void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
429                    /*OUT*/RetLoc* retloc,
430                    ISelEnv* env,
431                    IRExpr* guard,
432                    IRCallee* cee, IRType retTy, IRExpr** args )
433{
434   X86CondCode cc;
435   HReg        argregs[3];
436   HReg        tmpregs[3];
437   Bool        danger;
438   Int         not_done_yet, n_args, n_arg_ws, stack_limit,
439               i, argreg, argregX;
440
441   /* Set default returns.  We'll update them later if needed. */
442   *stackAdjustAfterCall = 0;
443   *retloc               = mk_RetLoc_INVALID();
444
445   /* These are used for cross-checking that IR-level constraints on
446      the use of Iex_VECRET and Iex_BBPTR are observed. */
447   UInt nVECRETs = 0;
448   UInt nBBPTRs  = 0;
449
450   /* Marshal args for a call, do the call, and clear the stack.
451      Complexities to consider:
452
453      * The return type can be I{64,32,16,8} or V128.  In the V128
454        case, it is expected that |args| will contain the special
455        node IRExpr_VECRET(), in which case this routine generates
456        code to allocate space on the stack for the vector return
457        value.  Since we are not passing any scalars on the stack, it
458        is enough to preallocate the return space before marshalling
459        any arguments, in this case.
460
461        |args| may also contain IRExpr_BBPTR(), in which case the
462        value in %ebp is passed as the corresponding argument.
463
464      * If the callee claims regparmness of 1, 2 or 3, we must pass the
465        first 1, 2 or 3 args in registers (EAX, EDX, and ECX
466        respectively).  To keep things relatively simple, only args of
467        type I32 may be passed as regparms -- just bomb out if anything
468        else turns up.  Clearly this depends on the front ends not
469        trying to pass any other types as regparms.
470   */
471
472   /* 16 Nov 2004: the regparm handling is complicated by the
473      following problem.
474
475      Consider a call two a function with two regparm parameters:
476      f(e1,e2).  We need to compute e1 into %eax and e2 into %edx.
477      Suppose code is first generated to compute e1 into %eax.  Then,
478      code is generated to compute e2 into %edx.  Unfortunately, if
479      the latter code sequence uses %eax, it will trash the value of
480      e1 computed by the former sequence.  This could happen if (for
481      example) e2 itself involved a function call.  In the code below,
482      args are evaluated right-to-left, not left-to-right, but the
483      principle and the problem are the same.
484
485      One solution is to compute all regparm-bound args into vregs
486      first, and once they are all done, move them to the relevant
487      real regs.  This always gives correct code, but it also gives
488      a bunch of vreg-to-rreg moves which are usually redundant but
489      are hard for the register allocator to get rid of.
490
491      A compromise is to first examine all regparm'd argument
492      expressions.  If they are all so simple that it is clear
493      they will be evaluated without use of any fixed registers,
494      use the old compute-directly-to-fixed-target scheme.  If not,
495      be safe and use the via-vregs scheme.
496
497      Note this requires being able to examine an expression and
498      determine whether or not evaluation of it might use a fixed
499      register.  That requires knowledge of how the rest of this
500      insn selector works.  Currently just the following 3 are
501      regarded as safe -- hopefully they cover the majority of
502      arguments in practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
503   */
504   vassert(cee->regparms >= 0 && cee->regparms <= 3);
505
506   /* Count the number of args and also the VECRETs */
507   n_args = n_arg_ws = 0;
508   while (args[n_args]) {
509      IRExpr* arg = args[n_args];
510      n_args++;
511      if (UNLIKELY(arg->tag == Iex_VECRET)) {
512         nVECRETs++;
513      } else if (UNLIKELY(arg->tag == Iex_BBPTR)) {
514         nBBPTRs++;
515      }
516   }
517
518   /* If this fails, the IR is ill-formed */
519   vassert(nBBPTRs == 0 || nBBPTRs == 1);
520
521   /* If we have a VECRET, allocate space on the stack for the return
522      value, and record the stack pointer after that. */
523   HReg r_vecRetAddr = INVALID_HREG;
524   if (nVECRETs == 1) {
525      vassert(retTy == Ity_V128 || retTy == Ity_V256);
526      vassert(retTy != Ity_V256); // we don't handle that yet (if ever)
527      r_vecRetAddr = newVRegI(env);
528      sub_from_esp(env, 16);
529      addInstr(env, mk_iMOVsd_RR( hregX86_ESP(), r_vecRetAddr ));
530   } else {
531      // If either of these fail, the IR is ill-formed
532      vassert(retTy != Ity_V128 && retTy != Ity_V256);
533      vassert(nVECRETs == 0);
534   }
535
536   not_done_yet = n_args;
537
538   stack_limit = cee->regparms;
539
540   /* ------ BEGIN marshall all arguments ------ */
541
542   /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
543   for (i = n_args-1; i >= stack_limit; i--) {
544      n_arg_ws += pushArg(env, args[i], r_vecRetAddr);
545      not_done_yet--;
546   }
547
548   /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
549      registers. */
550
551   if (cee->regparms > 0) {
552
553      /* ------ BEGIN deal with regparms ------ */
554
555      /* deal with regparms, not forgetting %ebp if needed. */
556      argregs[0] = hregX86_EAX();
557      argregs[1] = hregX86_EDX();
558      argregs[2] = hregX86_ECX();
559      tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;
560
561      argreg = cee->regparms;
562
563      /* In keeping with big comment above, detect potential danger
564         and use the via-vregs scheme if needed. */
565      danger = False;
566      for (i = stack_limit-1; i >= 0; i--) {
567         if (mightRequireFixedRegs(args[i])) {
568            danger = True;
569            break;
570         }
571      }
572
573      if (danger) {
574
575         /* Move via temporaries */
576         argregX = argreg;
577         for (i = stack_limit-1; i >= 0; i--) {
578
579            if (0) {
580               vex_printf("x86 host: register param is complex: ");
581               ppIRExpr(args[i]);
582               vex_printf("\n");
583            }
584
585            IRExpr* arg = args[i];
586            argreg--;
587            vassert(argreg >= 0);
588            if (UNLIKELY(arg->tag == Iex_VECRET)) {
589               vassert(0); //ATC
590            }
591            else if (UNLIKELY(arg->tag == Iex_BBPTR)) {
592               vassert(0); //ATC
593            } else {
594               vassert(typeOfIRExpr(env->type_env, arg) == Ity_I32);
595               tmpregs[argreg] = iselIntExpr_R(env, arg);
596            }
597            not_done_yet--;
598         }
599         for (i = stack_limit-1; i >= 0; i--) {
600            argregX--;
601            vassert(argregX >= 0);
602            addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
603         }
604
605      } else {
606         /* It's safe to compute all regparm args directly into their
607            target registers. */
608         for (i = stack_limit-1; i >= 0; i--) {
609            IRExpr* arg = args[i];
610            argreg--;
611            vassert(argreg >= 0);
612            if (UNLIKELY(arg->tag == Iex_VECRET)) {
613               vassert(!hregIsInvalid(r_vecRetAddr));
614               addInstr(env, X86Instr_Alu32R(Xalu_MOV,
615                                             X86RMI_Reg(r_vecRetAddr),
616                                             argregs[argreg]));
617            }
618            else if (UNLIKELY(arg->tag == Iex_BBPTR)) {
619               vassert(0); //ATC
620            } else {
621               vassert(typeOfIRExpr(env->type_env, arg) == Ity_I32);
622               addInstr(env, X86Instr_Alu32R(Xalu_MOV,
623                                             iselIntExpr_RMI(env, arg),
624                                             argregs[argreg]));
625            }
626            not_done_yet--;
627         }
628
629      }
630
631      /* ------ END deal with regparms ------ */
632
633   }
634
635   vassert(not_done_yet == 0);
636
637   /* ------ END marshall all arguments ------ */
638
639   /* Now we can compute the condition.  We can't do it earlier
640      because the argument computations could trash the condition
641      codes.  Be a bit clever to handle the common case where the
642      guard is 1:Bit. */
643   cc = Xcc_ALWAYS;
644   if (guard) {
645      if (guard->tag == Iex_Const
646          && guard->Iex.Const.con->tag == Ico_U1
647          && guard->Iex.Const.con->Ico.U1 == True) {
648         /* unconditional -- do nothing */
649      } else {
650         cc = iselCondCode( env, guard );
651      }
652   }
653
654   /* Do final checks, set the return values, and generate the call
655      instruction proper. */
656   vassert(*stackAdjustAfterCall == 0);
657   vassert(is_RetLoc_INVALID(*retloc));
658   switch (retTy) {
659         case Ity_INVALID:
660            /* Function doesn't return a value. */
661            *retloc = mk_RetLoc_simple(RLPri_None);
662            break;
663         case Ity_I64:
664            *retloc = mk_RetLoc_simple(RLPri_2Int);
665            break;
666         case Ity_I32: case Ity_I16: case Ity_I8:
667            *retloc = mk_RetLoc_simple(RLPri_Int);
668            break;
669         case Ity_V128:
670            *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
671            *stackAdjustAfterCall = 16;
672            break;
673         case Ity_V256:
674            vassert(0); // ATC
675            *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
676            *stackAdjustAfterCall = 32;
677            break;
678         default:
679            /* IR can denote other possible return types, but we don't
680               handle those here. */
681           vassert(0);
682   }
683
684   /* Finally, generate the call itself.  This needs the *retloc value
685      set in the switch above, which is why it's at the end. */
686   callHelperAndClearArgs( env, cc, cee, n_arg_ws, *retloc );
687}
688
689
690/* Given a guest-state array descriptor, an index expression and a
691   bias, generate an X86AMode holding the relevant guest state
692   offset. */
693
694static
695X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
696                                IRExpr* off, Int bias )
697{
698   HReg tmp, roff;
699   Int  elemSz = sizeofIRType(descr->elemTy);
700   Int  nElems = descr->nElems;
701   Int  shift  = 0;
702
703   /* throw out any cases not generated by an x86 front end.  In
704      theory there might be a day where we need to handle them -- if
705      we ever run non-x86-guest on x86 host. */
706
707   if (nElems != 8)
708      vpanic("genGuestArrayOffset(x86 host)(1)");
709
710   switch (elemSz) {
711      case 1:  shift = 0; break;
712      case 4:  shift = 2; break;
713      case 8:  shift = 3; break;
714      default: vpanic("genGuestArrayOffset(x86 host)(2)");
715   }
716
717   /* Compute off into a reg, %off.  Then return:
718
719         movl %off, %tmp
720         addl $bias, %tmp  (if bias != 0)
721         andl %tmp, 7
722         ... base(%ebp, %tmp, shift) ...
723   */
724   tmp  = newVRegI(env);
725   roff = iselIntExpr_R(env, off);
726   addInstr(env, mk_iMOVsd_RR(roff, tmp));
727   if (bias != 0) {
728      addInstr(env,
729               X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
730   }
731   addInstr(env,
732            X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
733   return
734      X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
735}
736
737
738/* Mess with the FPU's rounding mode: set to the default rounding mode
739   (DEFAULT_FPUCW). */
740static
741void set_FPU_rounding_default ( ISelEnv* env )
742{
743   /* pushl $DEFAULT_FPUCW
744      fldcw 0(%esp)
745      addl $4, %esp
746   */
747   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
748   addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
749   addInstr(env, X86Instr_FpLdCW(zero_esp));
750   add_to_esp(env, 4);
751}
752
753
754/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
755   expression denoting a value in the range 0 .. 3, indicating a round
756   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
757   the same rounding.
758*/
759static
760void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
761{
762   HReg rrm  = iselIntExpr_R(env, mode);
763   HReg rrm2 = newVRegI(env);
764   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
765
766   /* movl  %rrm, %rrm2
767      andl  $3, %rrm2   -- shouldn't be needed; paranoia
768      shll  $10, %rrm2
769      orl   $DEFAULT_FPUCW, %rrm2
770      pushl %rrm2
771      fldcw 0(%esp)
772      addl  $4, %esp
773   */
774   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
775   addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
776   addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
777   addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
778   addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
779   addInstr(env, X86Instr_FpLdCW(zero_esp));
780   add_to_esp(env, 4);
781}
782
783
784/* Generate !src into a new vector register, and be sure that the code
785   is SSE1 compatible.  Amazing that Intel doesn't offer a less crappy
786   way to do this.
787*/
788static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
789{
790   HReg dst = newVRegV(env);
791   /* Set dst to zero.  If dst contains a NaN then all hell might
792      break loose after the comparison.  So, first zero it. */
793   addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
794   /* And now make it all 1s ... */
795   addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
796   /* Finally, xor 'src' into it. */
797   addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
798   /* Doesn't that just totally suck? */
799   return dst;
800}
801
802
803/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
804   after most non-simple FPU operations (simple = +, -, *, / and
805   sqrt).
806
807   This could be done a lot more efficiently if needed, by loading
808   zero and adding it to the value to be rounded (fldz ; faddp?).
809*/
810static void roundToF64 ( ISelEnv* env, HReg reg )
811{
812   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
813   sub_from_esp(env, 8);
814   addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
815   addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
816   add_to_esp(env, 8);
817}
818
819
820/*---------------------------------------------------------*/
821/*--- ISEL: Integer expressions (32/16/8 bit)           ---*/
822/*---------------------------------------------------------*/
823
824/* Select insns for an integer-typed expression, and add them to the
825   code list.  Return a reg holding the result.  This reg will be a
826   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
827   want to modify it, ask for a new vreg, copy it in there, and modify
828   the copy.  The register allocator will do its best to map both
829   vregs to the same real register, so the copies will often disappear
830   later in the game.
831
832   This should handle expressions of 32, 16 and 8-bit type.  All
833   results are returned in a 32-bit register.  For 16- and 8-bit
834   expressions, the upper 16/24 bits are arbitrary, so you should mask
835   or sign extend partial values if necessary.
836*/
837
838static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
839{
840   HReg r = iselIntExpr_R_wrk(env, e);
841   /* sanity checks ... */
842#  if 0
843   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
844#  endif
845   vassert(hregClass(r) == HRcInt32);
846   vassert(hregIsVirtual(r));
847   return r;
848}
849
850/* DO NOT CALL THIS DIRECTLY ! */
851static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
852{
853   MatchInfo mi;
854
855   IRType ty = typeOfIRExpr(env->type_env,e);
856   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
857
858   switch (e->tag) {
859
860   /* --------- TEMP --------- */
861   case Iex_RdTmp: {
862      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
863   }
864
865   /* --------- LOAD --------- */
866   case Iex_Load: {
867      HReg dst = newVRegI(env);
868      X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
869
870      /* We can't handle big-endian loads, nor load-linked. */
871      if (e->Iex.Load.end != Iend_LE)
872         goto irreducible;
873
874      if (ty == Ity_I32) {
875         addInstr(env, X86Instr_Alu32R(Xalu_MOV,
876                                       X86RMI_Mem(amode), dst) );
877         return dst;
878      }
879      if (ty == Ity_I16) {
880         addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
881         return dst;
882      }
883      if (ty == Ity_I8) {
884         addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
885         return dst;
886      }
887      break;
888   }
889
890   /* --------- TERNARY OP --------- */
891   case Iex_Triop: {
892      IRTriop *triop = e->Iex.Triop.details;
893      /* C3210 flags following FPU partial remainder (fprem), both
894         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
895      if (triop->op == Iop_PRemC3210F64
896          || triop->op == Iop_PRem1C3210F64) {
897         HReg junk = newVRegF(env);
898         HReg dst  = newVRegI(env);
899         HReg srcL = iselDblExpr(env, triop->arg2);
900         HReg srcR = iselDblExpr(env, triop->arg3);
901         /* XXXROUNDINGFIXME */
902         /* set roundingmode here */
903         addInstr(env, X86Instr_FpBinary(
904                           e->Iex.Binop.op==Iop_PRemC3210F64
905                              ? Xfp_PREM : Xfp_PREM1,
906                           srcL,srcR,junk
907                 ));
908         /* The previous pseudo-insn will have left the FPU's C3210
909            flags set correctly.  So bag them. */
910         addInstr(env, X86Instr_FpStSW_AX());
911         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
912         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
913         return dst;
914      }
915
916      break;
917   }
918
919   /* --------- BINARY OP --------- */
920   case Iex_Binop: {
921      X86AluOp   aluOp;
922      X86ShiftOp shOp;
923
924      /* Pattern: Sub32(0,x) */
925      if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
926         HReg dst = newVRegI(env);
927         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
928         addInstr(env, mk_iMOVsd_RR(reg,dst));
929         addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
930         return dst;
931      }
932
933      /* Is it an addition or logical style op? */
934      switch (e->Iex.Binop.op) {
935         case Iop_Add8: case Iop_Add16: case Iop_Add32:
936            aluOp = Xalu_ADD; break;
937         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
938            aluOp = Xalu_SUB; break;
939         case Iop_And8: case Iop_And16: case Iop_And32:
940            aluOp = Xalu_AND; break;
941         case Iop_Or8: case Iop_Or16: case Iop_Or32:
942            aluOp = Xalu_OR; break;
943         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
944            aluOp = Xalu_XOR; break;
945         case Iop_Mul16: case Iop_Mul32:
946            aluOp = Xalu_MUL; break;
947         default:
948            aluOp = Xalu_INVALID; break;
949      }
950      /* For commutative ops we assume any literal
951         values are on the second operand. */
952      if (aluOp != Xalu_INVALID) {
953         HReg dst    = newVRegI(env);
954         HReg reg    = iselIntExpr_R(env, e->Iex.Binop.arg1);
955         X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
956         addInstr(env, mk_iMOVsd_RR(reg,dst));
957         addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
958         return dst;
959      }
960      /* Could do better here; forcing the first arg into a reg
961         isn't always clever.
962         -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
963                        LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
964                        t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
965            movl 0xFFFFFFA0(%vr41),%vr107
966            movl 0xFFFFFFA4(%vr41),%vr108
967            movl %vr107,%vr106
968            xorl %vr108,%vr106
969            movl 0xFFFFFFA8(%vr41),%vr109
970            movl %vr106,%vr105
971            andl %vr109,%vr105
972            movl 0xFFFFFFA0(%vr41),%vr110
973            movl %vr105,%vr104
974            xorl %vr110,%vr104
975            movl %vr104,%vr70
976      */
977
978      /* Perhaps a shift op? */
979      switch (e->Iex.Binop.op) {
980         case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
981            shOp = Xsh_SHL; break;
982         case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
983            shOp = Xsh_SHR; break;
984         case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
985            shOp = Xsh_SAR; break;
986         default:
987            shOp = Xsh_INVALID; break;
988      }
989      if (shOp != Xsh_INVALID) {
990         HReg dst = newVRegI(env);
991
992         /* regL = the value to be shifted */
993         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
994         addInstr(env, mk_iMOVsd_RR(regL,dst));
995
996         /* Do any necessary widening for 16/8 bit operands */
997         switch (e->Iex.Binop.op) {
998            case Iop_Shr8:
999               addInstr(env, X86Instr_Alu32R(
1000                                Xalu_AND, X86RMI_Imm(0xFF), dst));
1001               break;
1002            case Iop_Shr16:
1003               addInstr(env, X86Instr_Alu32R(
1004                                Xalu_AND, X86RMI_Imm(0xFFFF), dst));
1005               break;
1006            case Iop_Sar8:
1007               addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
1008               addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
1009               break;
1010            case Iop_Sar16:
1011               addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
1012               addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
1013               break;
1014            default: break;
1015         }
1016
1017         /* Now consider the shift amount.  If it's a literal, we
1018            can do a much better job than the general case. */
1019         if (e->Iex.Binop.arg2->tag == Iex_Const) {
1020            /* assert that the IR is well-typed */
1021            Int nshift;
1022            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1023            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1024	    vassert(nshift >= 0);
1025	    if (nshift > 0)
1026               /* Can't allow nshift==0 since that means %cl */
1027               addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
1028         } else {
1029            /* General case; we have to force the amount into %cl. */
1030            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1031            addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
1032            addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
1033         }
1034         return dst;
1035      }
1036
1037      /* Handle misc other ops. */
1038
1039      if (e->Iex.Binop.op == Iop_Max32U) {
1040         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1041         HReg dst  = newVRegI(env);
1042         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1043         addInstr(env, mk_iMOVsd_RR(src1,dst));
1044         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
1045         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
1046         return dst;
1047      }
1048
1049      if (e->Iex.Binop.op == Iop_8HLto16) {
1050         HReg hi8  = newVRegI(env);
1051         HReg lo8  = newVRegI(env);
1052         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1053         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1054         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1055         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1056         addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
1057         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
1058         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
1059         return hi8;
1060      }
1061
1062      if (e->Iex.Binop.op == Iop_16HLto32) {
1063         HReg hi16  = newVRegI(env);
1064         HReg lo16  = newVRegI(env);
1065         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1066         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1067         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1068         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1069         addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
1070         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
1071         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
1072         return hi16;
1073      }
1074
1075      if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
1076          || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
1077         HReg a16   = newVRegI(env);
1078         HReg b16   = newVRegI(env);
1079         HReg a16s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1080         HReg b16s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1081         Int  shift = (e->Iex.Binop.op == Iop_MullS8
1082                       || e->Iex.Binop.op == Iop_MullU8)
1083                         ? 24 : 16;
1084         X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
1085                              || e->Iex.Binop.op == Iop_MullS16)
1086                                ? Xsh_SAR : Xsh_SHR;
1087
1088         addInstr(env, mk_iMOVsd_RR(a16s, a16));
1089         addInstr(env, mk_iMOVsd_RR(b16s, b16));
1090         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
1091         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
1092         addInstr(env, X86Instr_Sh32(shr_op,  shift, a16));
1093         addInstr(env, X86Instr_Sh32(shr_op,  shift, b16));
1094         addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
1095         return b16;
1096      }
1097
1098      if (e->Iex.Binop.op == Iop_CmpF64) {
1099         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1100         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1101         HReg dst = newVRegI(env);
1102         addInstr(env, X86Instr_FpCmp(fL,fR,dst));
1103         /* shift this right 8 bits so as to conform to CmpF64
1104            definition. */
1105         addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
1106         return dst;
1107      }
1108
1109      if (e->Iex.Binop.op == Iop_F64toI32S
1110          || e->Iex.Binop.op == Iop_F64toI16S) {
1111         Int  sz  = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
1112         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1113         HReg dst = newVRegI(env);
1114
1115         /* Used several times ... */
1116         X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1117
1118	 /* rf now holds the value to be converted, and rrm holds the
1119	    rounding mode value, encoded as per the IRRoundingMode
1120	    enum.  The first thing to do is set the FPU's rounding
1121	    mode accordingly. */
1122
1123         /* Create a space for the format conversion. */
1124         /* subl $4, %esp */
1125         sub_from_esp(env, 4);
1126
1127	 /* Set host rounding mode */
1128	 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
1129
1130         /* gistw/l %rf, 0(%esp) */
1131         addInstr(env, X86Instr_FpLdStI(False/*store*/,
1132                                        toUChar(sz), rf, zero_esp));
1133
1134         if (sz == 2) {
1135            /* movzwl 0(%esp), %dst */
1136            addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
1137         } else {
1138            /* movl 0(%esp), %dst */
1139            vassert(sz == 4);
1140            addInstr(env, X86Instr_Alu32R(
1141                             Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1142         }
1143
1144	 /* Restore default FPU rounding. */
1145         set_FPU_rounding_default( env );
1146
1147         /* addl $4, %esp */
1148	 add_to_esp(env, 4);
1149         return dst;
1150      }
1151
1152      break;
1153   }
1154
1155   /* --------- UNARY OP --------- */
1156   case Iex_Unop: {
1157
1158      /* 1Uto8(32to1(expr32)) */
1159      if (e->Iex.Unop.op == Iop_1Uto8) {
1160         DECLARE_PATTERN(p_32to1_then_1Uto8);
1161         DEFINE_PATTERN(p_32to1_then_1Uto8,
1162                        unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
1163         if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
1164            IRExpr* expr32 = mi.bindee[0];
1165            HReg dst = newVRegI(env);
1166            HReg src = iselIntExpr_R(env, expr32);
1167            addInstr(env, mk_iMOVsd_RR(src,dst) );
1168            addInstr(env, X86Instr_Alu32R(Xalu_AND,
1169                                          X86RMI_Imm(1), dst));
1170            return dst;
1171         }
1172      }
1173
1174      /* 8Uto32(LDle(expr32)) */
1175      if (e->Iex.Unop.op == Iop_8Uto32) {
1176         DECLARE_PATTERN(p_LDle8_then_8Uto32);
1177         DEFINE_PATTERN(p_LDle8_then_8Uto32,
1178                        unop(Iop_8Uto32,
1179                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1180         if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
1181            HReg dst = newVRegI(env);
1182            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1183            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1184            return dst;
1185         }
1186      }
1187
1188      /* 8Sto32(LDle(expr32)) */
1189      if (e->Iex.Unop.op == Iop_8Sto32) {
1190         DECLARE_PATTERN(p_LDle8_then_8Sto32);
1191         DEFINE_PATTERN(p_LDle8_then_8Sto32,
1192                        unop(Iop_8Sto32,
1193                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1194         if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
1195            HReg dst = newVRegI(env);
1196            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1197            addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
1198            return dst;
1199         }
1200      }
1201
1202      /* 16Uto32(LDle(expr32)) */
1203      if (e->Iex.Unop.op == Iop_16Uto32) {
1204         DECLARE_PATTERN(p_LDle16_then_16Uto32);
1205         DEFINE_PATTERN(p_LDle16_then_16Uto32,
1206                        unop(Iop_16Uto32,
1207                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1208         if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
1209            HReg dst = newVRegI(env);
1210            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1211            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1212            return dst;
1213         }
1214      }
1215
1216      /* 8Uto32(GET:I8) */
1217      if (e->Iex.Unop.op == Iop_8Uto32) {
1218         if (e->Iex.Unop.arg->tag == Iex_Get) {
1219            HReg      dst;
1220            X86AMode* amode;
1221            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
1222            dst = newVRegI(env);
1223            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1224                                hregX86_EBP());
1225            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1226            return dst;
1227         }
1228      }
1229
1230      /* 16to32(GET:I16) */
1231      if (e->Iex.Unop.op == Iop_16Uto32) {
1232         if (e->Iex.Unop.arg->tag == Iex_Get) {
1233            HReg      dst;
1234            X86AMode* amode;
1235            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
1236            dst = newVRegI(env);
1237            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1238                                hregX86_EBP());
1239            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1240            return dst;
1241         }
1242      }
1243
1244      switch (e->Iex.Unop.op) {
1245         case Iop_8Uto16:
1246         case Iop_8Uto32:
1247         case Iop_16Uto32: {
1248            HReg dst = newVRegI(env);
1249            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1250            UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
1251            addInstr(env, mk_iMOVsd_RR(src,dst) );
1252            addInstr(env, X86Instr_Alu32R(Xalu_AND,
1253                                          X86RMI_Imm(mask), dst));
1254            return dst;
1255         }
1256         case Iop_8Sto16:
1257         case Iop_8Sto32:
1258         case Iop_16Sto32: {
1259            HReg dst = newVRegI(env);
1260            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1261            UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
1262            addInstr(env, mk_iMOVsd_RR(src,dst) );
1263            addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
1264            addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
1265            return dst;
1266         }
1267	 case Iop_Not8:
1268	 case Iop_Not16:
1269         case Iop_Not32: {
1270            HReg dst = newVRegI(env);
1271            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1272            addInstr(env, mk_iMOVsd_RR(src,dst) );
1273            addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
1274            return dst;
1275         }
1276         case Iop_64HIto32: {
1277            HReg rHi, rLo;
1278            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1279            return rHi; /* and abandon rLo .. poor wee thing :-) */
1280         }
1281         case Iop_64to32: {
1282            HReg rHi, rLo;
1283            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1284            return rLo; /* similar stupid comment to the above ... */
1285         }
1286         case Iop_16HIto8:
1287         case Iop_32HIto16: {
1288            HReg dst  = newVRegI(env);
1289            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1290            Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
1291            addInstr(env, mk_iMOVsd_RR(src,dst) );
1292            addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
1293            return dst;
1294         }
1295         case Iop_1Uto32:
1296         case Iop_1Uto8: {
1297            HReg dst         = newVRegI(env);
1298            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1299            addInstr(env, X86Instr_Set32(cond,dst));
1300            return dst;
1301         }
1302         case Iop_1Sto8:
1303         case Iop_1Sto16:
1304         case Iop_1Sto32: {
1305            /* could do better than this, but for now ... */
1306            HReg dst         = newVRegI(env);
1307            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1308            addInstr(env, X86Instr_Set32(cond,dst));
1309            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
1310            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1311            return dst;
1312         }
1313         case Iop_Ctz32: {
1314            /* Count trailing zeroes, implemented by x86 'bsfl' */
1315            HReg dst = newVRegI(env);
1316            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1317            addInstr(env, X86Instr_Bsfr32(True,src,dst));
1318            return dst;
1319         }
1320         case Iop_Clz32: {
1321            /* Count leading zeroes.  Do 'bsrl' to establish the index
1322               of the highest set bit, and subtract that value from
1323               31. */
1324            HReg tmp = newVRegI(env);
1325            HReg dst = newVRegI(env);
1326            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1327            addInstr(env, X86Instr_Bsfr32(False,src,tmp));
1328            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
1329                                          X86RMI_Imm(31), dst));
1330            addInstr(env, X86Instr_Alu32R(Xalu_SUB,
1331                                          X86RMI_Reg(tmp), dst));
1332            return dst;
1333         }
1334
1335         case Iop_CmpwNEZ32: {
1336            HReg dst = newVRegI(env);
1337            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1338            addInstr(env, mk_iMOVsd_RR(src,dst));
1339            addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
1340            addInstr(env, X86Instr_Alu32R(Xalu_OR,
1341                                          X86RMI_Reg(src), dst));
1342            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1343            return dst;
1344         }
1345         case Iop_Left8:
1346         case Iop_Left16:
1347         case Iop_Left32: {
1348            HReg dst = newVRegI(env);
1349            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1350            addInstr(env, mk_iMOVsd_RR(src, dst));
1351            addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
1352            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
1353            return dst;
1354         }
1355
1356         case Iop_V128to32: {
1357            HReg      dst  = newVRegI(env);
1358            HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
1359            X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
1360            sub_from_esp(env, 16);
1361            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
1362            addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
1363            add_to_esp(env, 16);
1364            return dst;
1365         }
1366
1367         /* ReinterpF32asI32(e) */
1368         /* Given an IEEE754 single, produce an I32 with the same bit
1369            pattern.  Keep stack 8-aligned even though only using 4
1370            bytes. */
1371         case Iop_ReinterpF32asI32: {
1372            HReg rf   = iselFltExpr(env, e->Iex.Unop.arg);
1373            HReg dst  = newVRegI(env);
1374            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1375            /* paranoia */
1376            set_FPU_rounding_default(env);
1377            /* subl $8, %esp */
1378            sub_from_esp(env, 8);
1379            /* gstF %rf, 0(%esp) */
1380            addInstr(env,
1381                     X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
1382            /* movl 0(%esp), %dst */
1383            addInstr(env,
1384                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1385            /* addl $8, %esp */
1386            add_to_esp(env, 8);
1387            return dst;
1388         }
1389
1390         case Iop_16to8:
1391         case Iop_32to8:
1392         case Iop_32to16:
1393            /* These are no-ops. */
1394            return iselIntExpr_R(env, e->Iex.Unop.arg);
1395
1396         case Iop_GetMSBs8x8: {
1397            /* Note: the following assumes the helper is of
1398               signature
1399                  UInt fn ( ULong ), and is not a regparm fn.
1400            */
1401            HReg  xLo, xHi;
1402            HReg  dst = newVRegI(env);
1403            Addr fn = (Addr)h_generic_calc_GetMSBs8x8;
1404            iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
1405            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
1406            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
1407            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
1408                                         0, mk_RetLoc_simple(RLPri_Int) ));
1409            add_to_esp(env, 2*4);
1410            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1411            return dst;
1412         }
1413
1414         default:
1415            break;
1416      }
1417      break;
1418   }
1419
1420   /* --------- GET --------- */
1421   case Iex_Get: {
1422      if (ty == Ity_I32) {
1423         HReg dst = newVRegI(env);
1424         addInstr(env, X86Instr_Alu32R(
1425                          Xalu_MOV,
1426                          X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1427                                                 hregX86_EBP())),
1428                          dst));
1429         return dst;
1430      }
1431      if (ty == Ity_I8 || ty == Ity_I16) {
1432         HReg dst = newVRegI(env);
1433         addInstr(env, X86Instr_LoadEX(
1434                          toUChar(ty==Ity_I8 ? 1 : 2),
1435                          False,
1436                          X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
1437                          dst));
1438         return dst;
1439      }
1440      break;
1441   }
1442
1443   case Iex_GetI: {
1444      X86AMode* am
1445         = genGuestArrayOffset(
1446              env, e->Iex.GetI.descr,
1447                   e->Iex.GetI.ix, e->Iex.GetI.bias );
1448      HReg dst = newVRegI(env);
1449      if (ty == Ity_I8) {
1450         addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
1451         return dst;
1452      }
1453      if (ty == Ity_I32) {
1454         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
1455         return dst;
1456      }
1457      break;
1458   }
1459
1460   /* --------- CCALL --------- */
1461   case Iex_CCall: {
1462      HReg    dst = newVRegI(env);
1463      vassert(ty == e->Iex.CCall.retty);
1464
1465      /* be very restrictive for now.  Only 32/64-bit ints allowed for
1466         args, and 32 bits for return type.  Don't forget to change
1467         the RetLoc if more return types are allowed in future. */
1468      if (e->Iex.CCall.retty != Ity_I32)
1469         goto irreducible;
1470
1471      /* Marshal args, do the call, clear stack. */
1472      UInt   addToSp = 0;
1473      RetLoc rloc    = mk_RetLoc_INVALID();
1474      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1475                    e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1476      vassert(is_sane_RetLoc(rloc));
1477      vassert(rloc.pri == RLPri_Int);
1478      vassert(addToSp == 0);
1479
1480      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1481      return dst;
1482   }
1483
1484   /* --------- LITERAL --------- */
1485   /* 32/16/8-bit literals */
1486   case Iex_Const: {
1487      X86RMI* rmi = iselIntExpr_RMI ( env, e );
1488      HReg    r   = newVRegI(env);
1489      addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
1490      return r;
1491   }
1492
1493   /* --------- MULTIPLEX --------- */
1494   case Iex_ITE: { // VFD
1495     if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1496         && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1497        HReg   r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1498        X86RM* r0  = iselIntExpr_RM(env, e->Iex.ITE.iffalse);
1499        HReg   dst = newVRegI(env);
1500        addInstr(env, mk_iMOVsd_RR(r1,dst));
1501        X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1502        addInstr(env, X86Instr_CMov32(cc ^ 1, r0, dst));
1503        return dst;
1504      }
1505      break;
1506   }
1507
1508   default:
1509   break;
1510   } /* switch (e->tag) */
1511
1512   /* We get here if no pattern matched. */
1513  irreducible:
1514   ppIRExpr(e);
1515   vpanic("iselIntExpr_R: cannot reduce tree");
1516}
1517
1518
1519/*---------------------------------------------------------*/
1520/*--- ISEL: Integer expression auxiliaries              ---*/
1521/*---------------------------------------------------------*/
1522
1523/* --------------------- AMODEs --------------------- */
1524
1525/* Return an AMode which computes the value of the specified
1526   expression, possibly also adding insns to the code list as a
1527   result.  The expression may only be a 32-bit one.
1528*/
1529
1530static Bool sane_AMode ( X86AMode* am )
1531{
1532   switch (am->tag) {
1533      case Xam_IR:
1534         return
1535            toBool( hregClass(am->Xam.IR.reg) == HRcInt32
1536                    && (hregIsVirtual(am->Xam.IR.reg)
1537                        || sameHReg(am->Xam.IR.reg, hregX86_EBP())) );
1538      case Xam_IRRS:
1539         return
1540            toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
1541                    && hregIsVirtual(am->Xam.IRRS.base)
1542                    && hregClass(am->Xam.IRRS.index) == HRcInt32
1543                    && hregIsVirtual(am->Xam.IRRS.index) );
1544      default:
1545        vpanic("sane_AMode: unknown x86 amode tag");
1546   }
1547}
1548
1549static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
1550{
1551   X86AMode* am = iselIntExpr_AMode_wrk(env, e);
1552   vassert(sane_AMode(am));
1553   return am;
1554}
1555
1556/* DO NOT CALL THIS DIRECTLY ! */
1557static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
1558{
1559   IRType ty = typeOfIRExpr(env->type_env,e);
1560   vassert(ty == Ity_I32);
1561
1562   /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
1563   if (e->tag == Iex_Binop
1564       && e->Iex.Binop.op == Iop_Add32
1565       && e->Iex.Binop.arg2->tag == Iex_Const
1566       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
1567       && e->Iex.Binop.arg1->tag == Iex_Binop
1568       && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
1569       && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
1570       && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1571       && e->Iex.Binop.arg1
1572           ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1573       && e->Iex.Binop.arg1
1574           ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1575      UInt shift = e->Iex.Binop.arg1
1576                    ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1577      UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
1578      if (shift == 1 || shift == 2 || shift == 3) {
1579         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
1580         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
1581                                       ->Iex.Binop.arg2->Iex.Binop.arg1 );
1582         return X86AMode_IRRS(imm32, r1, r2, shift);
1583      }
1584   }
1585
1586   /* Add32(expr1, Shl32(expr2, imm)) */
1587   if (e->tag == Iex_Binop
1588       && e->Iex.Binop.op == Iop_Add32
1589       && e->Iex.Binop.arg2->tag == Iex_Binop
1590       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1591       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1592       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1593      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1594      if (shift == 1 || shift == 2 || shift == 3) {
1595         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1596         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1597         return X86AMode_IRRS(0, r1, r2, shift);
1598      }
1599   }
1600
1601   /* Add32(expr,i) */
1602   if (e->tag == Iex_Binop
1603       && e->Iex.Binop.op == Iop_Add32
1604       && e->Iex.Binop.arg2->tag == Iex_Const
1605       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
1606      HReg r1 = iselIntExpr_R(env,  e->Iex.Binop.arg1);
1607      return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
1608   }
1609
1610   /* Doesn't match anything in particular.  Generate it into
1611      a register and use that. */
1612   {
1613      HReg r1 = iselIntExpr_R(env, e);
1614      return X86AMode_IR(0, r1);
1615   }
1616}
1617
1618
1619/* --------------------- RMIs --------------------- */
1620
1621/* Similarly, calculate an expression into an X86RMI operand.  As with
1622   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1623
1624static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
1625{
1626   X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
1627   /* sanity checks ... */
1628   switch (rmi->tag) {
1629      case Xrmi_Imm:
1630         return rmi;
1631      case Xrmi_Reg:
1632         vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
1633         vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
1634         return rmi;
1635      case Xrmi_Mem:
1636         vassert(sane_AMode(rmi->Xrmi.Mem.am));
1637         return rmi;
1638      default:
1639         vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
1640   }
1641}
1642
1643/* DO NOT CALL THIS DIRECTLY ! */
1644static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
1645{
1646   IRType ty = typeOfIRExpr(env->type_env,e);
1647   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1648
1649   /* special case: immediate */
1650   if (e->tag == Iex_Const) {
1651      UInt u;
1652      switch (e->Iex.Const.con->tag) {
1653         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1654         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1655         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
1656         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
1657      }
1658      return X86RMI_Imm(u);
1659   }
1660
1661   /* special case: 32-bit GET */
1662   if (e->tag == Iex_Get && ty == Ity_I32) {
1663      return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1664                                    hregX86_EBP()));
1665   }
1666
1667   /* special case: 32-bit load from memory */
1668   if (e->tag == Iex_Load && ty == Ity_I32
1669       && e->Iex.Load.end == Iend_LE) {
1670      X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
1671      return X86RMI_Mem(am);
1672   }
1673
1674   /* default case: calculate into a register and return that */
1675   {
1676      HReg r = iselIntExpr_R ( env, e );
1677      return X86RMI_Reg(r);
1678   }
1679}
1680
1681
1682/* --------------------- RIs --------------------- */
1683
1684/* Calculate an expression into an X86RI operand.  As with
1685   iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
1686
1687static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
1688{
1689   X86RI* ri = iselIntExpr_RI_wrk(env, e);
1690   /* sanity checks ... */
1691   switch (ri->tag) {
1692      case Xri_Imm:
1693         return ri;
1694      case Xri_Reg:
1695         vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
1696         vassert(hregIsVirtual(ri->Xri.Reg.reg));
1697         return ri;
1698      default:
1699         vpanic("iselIntExpr_RI: unknown x86 RI tag");
1700   }
1701}
1702
1703/* DO NOT CALL THIS DIRECTLY ! */
1704static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
1705{
1706   IRType ty = typeOfIRExpr(env->type_env,e);
1707   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1708
1709   /* special case: immediate */
1710   if (e->tag == Iex_Const) {
1711      UInt u;
1712      switch (e->Iex.Const.con->tag) {
1713         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1714         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1715         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
1716         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
1717      }
1718      return X86RI_Imm(u);
1719   }
1720
1721   /* default case: calculate into a register and return that */
1722   {
1723      HReg r = iselIntExpr_R ( env, e );
1724      return X86RI_Reg(r);
1725   }
1726}
1727
1728
1729/* --------------------- RMs --------------------- */
1730
1731/* Similarly, calculate an expression into an X86RM operand.  As with
1732   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1733
1734static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
1735{
1736   X86RM* rm = iselIntExpr_RM_wrk(env, e);
1737   /* sanity checks ... */
1738   switch (rm->tag) {
1739      case Xrm_Reg:
1740         vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
1741         vassert(hregIsVirtual(rm->Xrm.Reg.reg));
1742         return rm;
1743      case Xrm_Mem:
1744         vassert(sane_AMode(rm->Xrm.Mem.am));
1745         return rm;
1746      default:
1747         vpanic("iselIntExpr_RM: unknown x86 RM tag");
1748   }
1749}
1750
1751/* DO NOT CALL THIS DIRECTLY ! */
1752static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
1753{
1754   IRType ty = typeOfIRExpr(env->type_env,e);
1755   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1756
1757   /* special case: 32-bit GET */
1758   if (e->tag == Iex_Get && ty == Ity_I32) {
1759      return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
1760                                   hregX86_EBP()));
1761   }
1762
1763   /* special case: load from memory */
1764
1765   /* default case: calculate into a register and return that */
1766   {
1767      HReg r = iselIntExpr_R ( env, e );
1768      return X86RM_Reg(r);
1769   }
1770}
1771
1772
1773/* --------------------- CONDCODE --------------------- */
1774
1775/* Generate code to evaluated a bit-typed expression, returning the
1776   condition code which would correspond when the expression would
1777   notionally have returned 1. */
1778
1779static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
1780{
1781   /* Uh, there's nothing we can sanity check here, unfortunately. */
1782   return iselCondCode_wrk(env,e);
1783}
1784
1785/* DO NOT CALL THIS DIRECTLY ! */
1786static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
1787{
1788   MatchInfo mi;
1789
1790   vassert(e);
1791   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
1792
1793   /* var */
1794   if (e->tag == Iex_RdTmp) {
1795      HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
1796      /* Test32 doesn't modify r32; so this is OK. */
1797      addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
1798      return Xcc_NZ;
1799   }
1800
1801   /* Constant 1:Bit */
1802   if (e->tag == Iex_Const) {
1803      HReg r;
1804      vassert(e->Iex.Const.con->tag == Ico_U1);
1805      vassert(e->Iex.Const.con->Ico.U1 == True
1806              || e->Iex.Const.con->Ico.U1 == False);
1807      r = newVRegI(env);
1808      addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
1809      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
1810      return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
1811   }
1812
1813   /* Not1(e) */
1814   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
1815      /* Generate code for the arg, and negate the test condition */
1816      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
1817   }
1818
1819   /* --- patterns rooted at: 32to1 --- */
1820
1821   if (e->tag == Iex_Unop
1822       && e->Iex.Unop.op == Iop_32to1) {
1823      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1824      addInstr(env, X86Instr_Test32(1,rm));
1825      return Xcc_NZ;
1826   }
1827
1828   /* --- patterns rooted at: CmpNEZ8 --- */
1829
1830   /* CmpNEZ8(x) */
1831   if (e->tag == Iex_Unop
1832       && e->Iex.Unop.op == Iop_CmpNEZ8) {
1833      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1834      addInstr(env, X86Instr_Test32(0xFF,rm));
1835      return Xcc_NZ;
1836   }
1837
1838   /* --- patterns rooted at: CmpNEZ16 --- */
1839
1840   /* CmpNEZ16(x) */
1841   if (e->tag == Iex_Unop
1842       && e->Iex.Unop.op == Iop_CmpNEZ16) {
1843      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1844      addInstr(env, X86Instr_Test32(0xFFFF,rm));
1845      return Xcc_NZ;
1846   }
1847
1848   /* --- patterns rooted at: CmpNEZ32 --- */
1849
1850   /* CmpNEZ32(And32(x,y)) */
1851   {
1852      DECLARE_PATTERN(p_CmpNEZ32_And32);
1853      DEFINE_PATTERN(p_CmpNEZ32_And32,
1854                     unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
1855      if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
1856         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1857         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1858         HReg    tmp  = newVRegI(env);
1859         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1860         addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
1861         return Xcc_NZ;
1862      }
1863   }
1864
1865   /* CmpNEZ32(Or32(x,y)) */
1866   {
1867      DECLARE_PATTERN(p_CmpNEZ32_Or32);
1868      DEFINE_PATTERN(p_CmpNEZ32_Or32,
1869                     unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
1870      if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
1871         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1872         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1873         HReg    tmp  = newVRegI(env);
1874         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1875         addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
1876         return Xcc_NZ;
1877      }
1878   }
1879
1880   /* CmpNEZ32(GET(..):I32) */
1881   if (e->tag == Iex_Unop
1882       && e->Iex.Unop.op == Iop_CmpNEZ32
1883       && e->Iex.Unop.arg->tag == Iex_Get) {
1884      X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1885                                 hregX86_EBP());
1886      addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
1887      return Xcc_NZ;
1888   }
1889
1890   /* CmpNEZ32(x) */
1891   if (e->tag == Iex_Unop
1892       && e->Iex.Unop.op == Iop_CmpNEZ32) {
1893      HReg    r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
1894      X86RMI* rmi2 = X86RMI_Imm(0);
1895      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
1896      return Xcc_NZ;
1897   }
1898
1899   /* --- patterns rooted at: CmpNEZ64 --- */
1900
1901   /* CmpNEZ64(Or64(x,y)) */
1902   {
1903      DECLARE_PATTERN(p_CmpNEZ64_Or64);
1904      DEFINE_PATTERN(p_CmpNEZ64_Or64,
1905                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
1906      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
1907         HReg    hi1, lo1, hi2, lo2;
1908         HReg    tmp  = newVRegI(env);
1909         iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
1910         addInstr(env, mk_iMOVsd_RR(hi1, tmp));
1911         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
1912         iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
1913         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
1914         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
1915         return Xcc_NZ;
1916      }
1917   }
1918
1919   /* CmpNEZ64(x) */
1920   if (e->tag == Iex_Unop
1921       && e->Iex.Unop.op == Iop_CmpNEZ64) {
1922      HReg hi, lo;
1923      HReg tmp = newVRegI(env);
1924      iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
1925      addInstr(env, mk_iMOVsd_RR(hi, tmp));
1926      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
1927      return Xcc_NZ;
1928   }
1929
1930   /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */
1931
1932   /* CmpEQ8 / CmpNE8 */
1933   if (e->tag == Iex_Binop
1934       && (e->Iex.Binop.op == Iop_CmpEQ8
1935           || e->Iex.Binop.op == Iop_CmpNE8
1936           || e->Iex.Binop.op == Iop_CasCmpEQ8
1937           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
1938      if (isZeroU8(e->Iex.Binop.arg2)) {
1939         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1940         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
1941         switch (e->Iex.Binop.op) {
1942            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1943            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1944            default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
1945         }
1946      } else {
1947         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1948         X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1949         HReg    r    = newVRegI(env);
1950         addInstr(env, mk_iMOVsd_RR(r1,r));
1951         addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1952         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
1953         switch (e->Iex.Binop.op) {
1954            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1955            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1956            default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
1957         }
1958      }
1959   }
1960
1961   /* CmpEQ16 / CmpNE16 */
1962   if (e->tag == Iex_Binop
1963       && (e->Iex.Binop.op == Iop_CmpEQ16
1964           || e->Iex.Binop.op == Iop_CmpNE16
1965           || e->Iex.Binop.op == Iop_CasCmpEQ16
1966           || e->Iex.Binop.op == Iop_CasCmpNE16
1967           || e->Iex.Binop.op == Iop_ExpCmpNE16)) {
1968      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1969      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1970      HReg    r    = newVRegI(env);
1971      addInstr(env, mk_iMOVsd_RR(r1,r));
1972      addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1973      addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
1974      switch (e->Iex.Binop.op) {
1975         case Iop_CmpEQ16: case Iop_CasCmpEQ16:
1976            return Xcc_Z;
1977         case Iop_CmpNE16: case Iop_CasCmpNE16: case Iop_ExpCmpNE16:
1978            return Xcc_NZ;
1979         default:
1980            vpanic("iselCondCode(x86): CmpXX16");
1981      }
1982   }
1983
1984   /* CmpNE32(ccall, 32-bit constant) (--smc-check=all optimisation).
1985      Saves a "movl %eax, %tmp" compared to the default route. */
1986   if (e->tag == Iex_Binop
1987       && e->Iex.Binop.op == Iop_CmpNE32
1988       && e->Iex.Binop.arg1->tag == Iex_CCall
1989       && e->Iex.Binop.arg2->tag == Iex_Const) {
1990      IRExpr* cal = e->Iex.Binop.arg1;
1991      IRExpr* con = e->Iex.Binop.arg2;
1992      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
1993      vassert(cal->Iex.CCall.retty == Ity_I32); /* else ill-typed IR */
1994      vassert(con->Iex.Const.con->tag == Ico_U32);
1995      /* Marshal args, do the call. */
1996      UInt   addToSp = 0;
1997      RetLoc rloc    = mk_RetLoc_INVALID();
1998      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1999                    cal->Iex.CCall.cee,
2000                    cal->Iex.CCall.retty, cal->Iex.CCall.args );
2001      vassert(is_sane_RetLoc(rloc));
2002      vassert(rloc.pri == RLPri_Int);
2003      vassert(addToSp == 0);
2004      /* */
2005      addInstr(env, X86Instr_Alu32R(Xalu_CMP,
2006                                    X86RMI_Imm(con->Iex.Const.con->Ico.U32),
2007                                    hregX86_EAX()));
2008      return Xcc_NZ;
2009   }
2010
2011   /* Cmp*32*(x,y) */
2012   if (e->tag == Iex_Binop
2013       && (e->Iex.Binop.op == Iop_CmpEQ32
2014           || e->Iex.Binop.op == Iop_CmpNE32
2015           || e->Iex.Binop.op == Iop_CmpLT32S
2016           || e->Iex.Binop.op == Iop_CmpLT32U
2017           || e->Iex.Binop.op == Iop_CmpLE32S
2018           || e->Iex.Binop.op == Iop_CmpLE32U
2019           || e->Iex.Binop.op == Iop_CasCmpEQ32
2020           || e->Iex.Binop.op == Iop_CasCmpNE32
2021           || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2022      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2023      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2024      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
2025      switch (e->Iex.Binop.op) {
2026         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
2027         case Iop_CmpNE32:
2028         case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Xcc_NZ;
2029         case Iop_CmpLT32S: return Xcc_L;
2030         case Iop_CmpLT32U: return Xcc_B;
2031         case Iop_CmpLE32S: return Xcc_LE;
2032         case Iop_CmpLE32U: return Xcc_BE;
2033         default: vpanic("iselCondCode(x86): CmpXX32");
2034      }
2035   }
2036
2037   /* CmpNE64 */
2038   if (e->tag == Iex_Binop
2039       && (e->Iex.Binop.op == Iop_CmpNE64
2040           || e->Iex.Binop.op == Iop_CmpEQ64)) {
2041      HReg hi1, hi2, lo1, lo2;
2042      HReg tHi = newVRegI(env);
2043      HReg tLo = newVRegI(env);
2044      iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
2045      iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
2046      addInstr(env, mk_iMOVsd_RR(hi1, tHi));
2047      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
2048      addInstr(env, mk_iMOVsd_RR(lo1, tLo));
2049      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
2050      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
2051      switch (e->Iex.Binop.op) {
2052         case Iop_CmpNE64: return Xcc_NZ;
2053         case Iop_CmpEQ64: return Xcc_Z;
2054         default: vpanic("iselCondCode(x86): CmpXX64");
2055      }
2056   }
2057
2058   ppIRExpr(e);
2059   vpanic("iselCondCode");
2060}
2061
2062
2063/*---------------------------------------------------------*/
2064/*--- ISEL: Integer expressions (64 bit)                ---*/
2065/*---------------------------------------------------------*/
2066
2067/* Compute a 64-bit value into a register pair, which is returned as
2068   the first two parameters.  As with iselIntExpr_R, these may be
2069   either real or virtual regs; in any case they must not be changed
2070   by subsequent code emitted by the caller.  */
2071
2072static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
2073{
2074   iselInt64Expr_wrk(rHi, rLo, env, e);
2075#  if 0
2076   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2077#  endif
2078   vassert(hregClass(*rHi) == HRcInt32);
2079   vassert(hregIsVirtual(*rHi));
2080   vassert(hregClass(*rLo) == HRcInt32);
2081   vassert(hregIsVirtual(*rLo));
2082}
2083
2084/* DO NOT CALL THIS DIRECTLY ! */
2085static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
2086{
2087   MatchInfo mi;
2088   HWord fn = 0; /* helper fn for most SIMD64 stuff */
2089   vassert(e);
2090   vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
2091
2092   /* 64-bit literal */
2093   if (e->tag == Iex_Const) {
2094      ULong w64 = e->Iex.Const.con->Ico.U64;
2095      UInt  wHi = toUInt(w64 >> 32);
2096      UInt  wLo = toUInt(w64);
2097      HReg  tLo = newVRegI(env);
2098      HReg  tHi = newVRegI(env);
2099      vassert(e->Iex.Const.con->tag == Ico_U64);
2100      if (wLo == wHi) {
2101         /* Save a precious Int register in this special case. */
2102         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
2103         *rHi = tLo;
2104         *rLo = tLo;
2105      } else {
2106         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
2107         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
2108         *rHi = tHi;
2109         *rLo = tLo;
2110      }
2111      return;
2112   }
2113
2114   /* read 64-bit IRTemp */
2115   if (e->tag == Iex_RdTmp) {
2116      lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
2117      return;
2118   }
2119
2120   /* 64-bit load */
2121   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2122      HReg     tLo, tHi;
2123      X86AMode *am0, *am4;
2124      vassert(e->Iex.Load.ty == Ity_I64);
2125      tLo = newVRegI(env);
2126      tHi = newVRegI(env);
2127      am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
2128      am4 = advance4(am0);
2129      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
2130      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2131      *rHi = tHi;
2132      *rLo = tLo;
2133      return;
2134   }
2135
2136   /* 64-bit GET */
2137   if (e->tag == Iex_Get) {
2138      X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
2139      X86AMode* am4 = advance4(am);
2140      HReg tLo = newVRegI(env);
2141      HReg tHi = newVRegI(env);
2142      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2143      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2144      *rHi = tHi;
2145      *rLo = tLo;
2146      return;
2147   }
2148
2149   /* 64-bit GETI */
2150   if (e->tag == Iex_GetI) {
2151      X86AMode* am
2152         = genGuestArrayOffset( env, e->Iex.GetI.descr,
2153                                     e->Iex.GetI.ix, e->Iex.GetI.bias );
2154      X86AMode* am4 = advance4(am);
2155      HReg tLo = newVRegI(env);
2156      HReg tHi = newVRegI(env);
2157      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2158      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2159      *rHi = tHi;
2160      *rLo = tLo;
2161      return;
2162   }
2163
2164   /* 64-bit ITE: ITE(g, expr, expr) */ // VFD
2165   if (e->tag == Iex_ITE) {
2166      HReg e0Lo, e0Hi, e1Lo, e1Hi;
2167      HReg tLo = newVRegI(env);
2168      HReg tHi = newVRegI(env);
2169      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.ITE.iffalse);
2170      iselInt64Expr(&e1Hi, &e1Lo, env, e->Iex.ITE.iftrue);
2171      addInstr(env, mk_iMOVsd_RR(e1Hi, tHi));
2172      addInstr(env, mk_iMOVsd_RR(e1Lo, tLo));
2173      X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
2174      /* This assumes the first cmov32 doesn't trash the condition
2175         codes, so they are still available for the second cmov32 */
2176      addInstr(env, X86Instr_CMov32(cc ^ 1, X86RM_Reg(e0Hi), tHi));
2177      addInstr(env, X86Instr_CMov32(cc ^ 1, X86RM_Reg(e0Lo), tLo));
2178      *rHi = tHi;
2179      *rLo = tLo;
2180      return;
2181   }
2182
2183   /* --------- BINARY ops --------- */
2184   if (e->tag == Iex_Binop) {
2185      switch (e->Iex.Binop.op) {
2186         /* 32 x 32 -> 64 multiply */
2187         case Iop_MullU32:
2188         case Iop_MullS32: {
2189            /* get one operand into %eax, and the other into a R/M.
2190               Need to make an educated guess about which is better in
2191               which. */
2192            HReg   tLo    = newVRegI(env);
2193            HReg   tHi    = newVRegI(env);
2194            Bool   syned  = toBool(e->Iex.Binop.op == Iop_MullS32);
2195            X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2196            HReg   rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2197            addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
2198            addInstr(env, X86Instr_MulL(syned, rmLeft));
2199            /* Result is now in EDX:EAX.  Tell the caller. */
2200            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2201            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2202            *rHi = tHi;
2203            *rLo = tLo;
2204            return;
2205         }
2206
2207         /* 64 x 32 -> (32(rem),32(div)) division */
2208         case Iop_DivModU64to32:
2209         case Iop_DivModS64to32: {
2210            /* Get the 64-bit operand into edx:eax, and the other into
2211               any old R/M. */
2212            HReg sHi, sLo;
2213            HReg   tLo     = newVRegI(env);
2214            HReg   tHi     = newVRegI(env);
2215            Bool   syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
2216            X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2217            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2218            addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
2219            addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
2220            addInstr(env, X86Instr_Div(syned, rmRight));
2221            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2222            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2223            *rHi = tHi;
2224            *rLo = tLo;
2225            return;
2226         }
2227
2228         /* Or64/And64/Xor64 */
2229         case Iop_Or64:
2230         case Iop_And64:
2231         case Iop_Xor64: {
2232            HReg xLo, xHi, yLo, yHi;
2233            HReg tLo = newVRegI(env);
2234            HReg tHi = newVRegI(env);
2235            X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
2236                          : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
2237                          : Xalu_XOR;
2238            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2239            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2240            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2241            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
2242            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2243            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
2244            *rHi = tHi;
2245            *rLo = tLo;
2246            return;
2247         }
2248
2249         /* Add64/Sub64 */
2250         case Iop_Add64:
2251            if (e->Iex.Binop.arg2->tag == Iex_Const) {
2252               /* special case Add64(e, const) */
2253               ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
2254               UInt  wHi = toUInt(w64 >> 32);
2255               UInt  wLo = toUInt(w64);
2256               HReg  tLo = newVRegI(env);
2257               HReg  tHi = newVRegI(env);
2258               HReg  xLo, xHi;
2259               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
2260               iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2261               addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2262               addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2263               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
2264               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
2265               *rHi = tHi;
2266               *rLo = tLo;
2267               return;
2268            }
2269            /* else fall through to the generic case */
2270         case Iop_Sub64: {
2271            HReg xLo, xHi, yLo, yHi;
2272            HReg tLo = newVRegI(env);
2273            HReg tHi = newVRegI(env);
2274            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2275            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2276            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2277            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2278            if (e->Iex.Binop.op==Iop_Add64) {
2279               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
2280               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
2281            } else {
2282               addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2283               addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2284            }
2285            *rHi = tHi;
2286            *rLo = tLo;
2287            return;
2288         }
2289
2290         /* 32HLto64(e1,e2) */
2291         case Iop_32HLto64:
2292            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2293            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2294            return;
2295
2296         /* 64-bit shifts */
2297         case Iop_Shl64: {
2298            /* We use the same ingenious scheme as gcc.  Put the value
2299               to be shifted into %hi:%lo, and the shift amount into
2300               %cl.  Then (dsts on right, a la ATT syntax):
2301
2302               shldl %cl, %lo, %hi   -- make %hi be right for the
2303                                     -- shift amt %cl % 32
2304               shll  %cl, %lo        -- make %lo be right for the
2305                                     -- shift amt %cl % 32
2306
2307               Now, if (shift amount % 64) is in the range 32 .. 63,
2308               we have to do a fixup, which puts the result low half
2309               into the result high half, and zeroes the low half:
2310
2311               testl $32, %ecx
2312
2313               cmovnz %lo, %hi
2314               movl $0, %tmp         -- sigh; need yet another reg
2315               cmovnz %tmp, %lo
2316            */
2317            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2318            tLo = newVRegI(env);
2319            tHi = newVRegI(env);
2320            tTemp = newVRegI(env);
2321            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2322            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2323            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2324            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2325            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2326            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2327               and those regs are legitimately modifiable. */
2328            addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
2329            addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
2330            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2331            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
2332            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2333            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
2334            *rHi = tHi;
2335            *rLo = tLo;
2336            return;
2337         }
2338
2339         case Iop_Shr64: {
2340            /* We use the same ingenious scheme as gcc.  Put the value
2341               to be shifted into %hi:%lo, and the shift amount into
2342               %cl.  Then:
2343
2344               shrdl %cl, %hi, %lo   -- make %lo be right for the
2345                                     -- shift amt %cl % 32
2346               shrl  %cl, %hi        -- make %hi be right for the
2347                                     -- shift amt %cl % 32
2348
2349               Now, if (shift amount % 64) is in the range 32 .. 63,
2350               we have to do a fixup, which puts the result high half
2351               into the result low half, and zeroes the high half:
2352
2353               testl $32, %ecx
2354
2355               cmovnz %hi, %lo
2356               movl $0, %tmp         -- sigh; need yet another reg
2357               cmovnz %tmp, %hi
2358            */
2359            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2360            tLo = newVRegI(env);
2361            tHi = newVRegI(env);
2362            tTemp = newVRegI(env);
2363            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2364            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2365            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2366            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2367            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2368            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2369               and those regs are legitimately modifiable. */
2370            addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
2371            addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
2372            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2373            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
2374            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2375            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
2376            *rHi = tHi;
2377            *rLo = tLo;
2378            return;
2379         }
2380
2381         /* F64 -> I64 */
2382         /* Sigh, this is an almost exact copy of the F64 -> I32/I16
2383            case.  Unfortunately I see no easy way to avoid the
2384            duplication. */
2385         case Iop_F64toI64S: {
2386            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
2387            HReg tLo = newVRegI(env);
2388            HReg tHi = newVRegI(env);
2389
2390            /* Used several times ... */
2391            /* Careful ... this sharing is only safe because
2392	       zero_esp/four_esp do not hold any registers which the
2393	       register allocator could attempt to swizzle later. */
2394            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2395            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2396
2397            /* rf now holds the value to be converted, and rrm holds
2398               the rounding mode value, encoded as per the
2399               IRRoundingMode enum.  The first thing to do is set the
2400               FPU's rounding mode accordingly. */
2401
2402            /* Create a space for the format conversion. */
2403            /* subl $8, %esp */
2404            sub_from_esp(env, 8);
2405
2406            /* Set host rounding mode */
2407            set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2408
2409            /* gistll %rf, 0(%esp) */
2410            addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
2411
2412            /* movl 0(%esp), %dstLo */
2413            /* movl 4(%esp), %dstHi */
2414            addInstr(env, X86Instr_Alu32R(
2415                             Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2416            addInstr(env, X86Instr_Alu32R(
2417                             Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2418
2419            /* Restore default FPU rounding. */
2420            set_FPU_rounding_default( env );
2421
2422            /* addl $8, %esp */
2423            add_to_esp(env, 8);
2424
2425            *rHi = tHi;
2426            *rLo = tLo;
2427            return;
2428         }
2429
2430         case Iop_Add8x8:
2431            fn = (HWord)h_generic_calc_Add8x8; goto binnish;
2432         case Iop_Add16x4:
2433            fn = (HWord)h_generic_calc_Add16x4; goto binnish;
2434         case Iop_Add32x2:
2435            fn = (HWord)h_generic_calc_Add32x2; goto binnish;
2436
2437         case Iop_Avg8Ux8:
2438            fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
2439         case Iop_Avg16Ux4:
2440            fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;
2441
2442         case Iop_CmpEQ8x8:
2443            fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
2444         case Iop_CmpEQ16x4:
2445            fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
2446         case Iop_CmpEQ32x2:
2447            fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;
2448
2449         case Iop_CmpGT8Sx8:
2450            fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
2451         case Iop_CmpGT16Sx4:
2452            fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
2453         case Iop_CmpGT32Sx2:
2454            fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;
2455
2456         case Iop_InterleaveHI8x8:
2457            fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
2458         case Iop_InterleaveLO8x8:
2459            fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
2460         case Iop_InterleaveHI16x4:
2461            fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
2462         case Iop_InterleaveLO16x4:
2463            fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
2464         case Iop_InterleaveHI32x2:
2465            fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
2466         case Iop_InterleaveLO32x2:
2467            fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
2468         case Iop_CatOddLanes16x4:
2469            fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
2470         case Iop_CatEvenLanes16x4:
2471            fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
2472         case Iop_Perm8x8:
2473            fn = (HWord)h_generic_calc_Perm8x8; goto binnish;
2474
2475         case Iop_Max8Ux8:
2476            fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
2477         case Iop_Max16Sx4:
2478            fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
2479         case Iop_Min8Ux8:
2480            fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
2481         case Iop_Min16Sx4:
2482            fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;
2483
2484         case Iop_Mul16x4:
2485            fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
2486         case Iop_Mul32x2:
2487            fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
2488         case Iop_MulHi16Sx4:
2489            fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
2490         case Iop_MulHi16Ux4:
2491            fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;
2492
2493         case Iop_QAdd8Sx8:
2494            fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
2495         case Iop_QAdd16Sx4:
2496            fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
2497         case Iop_QAdd8Ux8:
2498            fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
2499         case Iop_QAdd16Ux4:
2500            fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;
2501
2502         case Iop_QNarrowBin32Sto16Sx4:
2503            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; goto binnish;
2504         case Iop_QNarrowBin16Sto8Sx8:
2505            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; goto binnish;
2506         case Iop_QNarrowBin16Sto8Ux8:
2507            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; goto binnish;
2508         case Iop_NarrowBin16to8x8:
2509            fn = (HWord)h_generic_calc_NarrowBin16to8x8; goto binnish;
2510         case Iop_NarrowBin32to16x4:
2511            fn = (HWord)h_generic_calc_NarrowBin32to16x4; goto binnish;
2512
2513         case Iop_QSub8Sx8:
2514            fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
2515         case Iop_QSub16Sx4:
2516            fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
2517         case Iop_QSub8Ux8:
2518            fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
2519         case Iop_QSub16Ux4:
2520            fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;
2521
2522         case Iop_Sub8x8:
2523            fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
2524         case Iop_Sub16x4:
2525            fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
2526         case Iop_Sub32x2:
2527            fn = (HWord)h_generic_calc_Sub32x2; goto binnish;
2528
2529         binnish: {
2530            /* Note: the following assumes all helpers are of
2531               signature
2532                  ULong fn ( ULong, ULong ), and they are
2533               not marked as regparm functions.
2534            */
2535            HReg xLo, xHi, yLo, yHi;
2536            HReg tLo = newVRegI(env);
2537            HReg tHi = newVRegI(env);
2538            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2539            addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
2540            addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
2541            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2542            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2543            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2544            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2545                                         0, mk_RetLoc_simple(RLPri_2Int) ));
2546            add_to_esp(env, 4*4);
2547            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2548            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2549            *rHi = tHi;
2550            *rLo = tLo;
2551            return;
2552         }
2553
2554         case Iop_ShlN32x2:
2555            fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
2556         case Iop_ShlN16x4:
2557            fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
2558         case Iop_ShlN8x8:
2559            fn = (HWord)h_generic_calc_ShlN8x8;  goto shifty;
2560         case Iop_ShrN32x2:
2561            fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
2562         case Iop_ShrN16x4:
2563            fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
2564         case Iop_SarN32x2:
2565            fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
2566         case Iop_SarN16x4:
2567            fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
2568         case Iop_SarN8x8:
2569            fn = (HWord)h_generic_calc_SarN8x8;  goto shifty;
2570         shifty: {
2571            /* Note: the following assumes all helpers are of
2572               signature
2573                  ULong fn ( ULong, UInt ), and they are
2574               not marked as regparm functions.
2575            */
2576            HReg xLo, xHi;
2577            HReg tLo = newVRegI(env);
2578            HReg tHi = newVRegI(env);
2579            X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2580            addInstr(env, X86Instr_Push(y));
2581            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2582            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2583            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2584            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2585                                         0, mk_RetLoc_simple(RLPri_2Int) ));
2586            add_to_esp(env, 3*4);
2587            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2588            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2589            *rHi = tHi;
2590            *rLo = tLo;
2591            return;
2592         }
2593
2594         default:
2595            break;
2596      }
2597   } /* if (e->tag == Iex_Binop) */
2598
2599
2600   /* --------- UNARY ops --------- */
2601   if (e->tag == Iex_Unop) {
2602      switch (e->Iex.Unop.op) {
2603
2604         /* 32Sto64(e) */
2605         case Iop_32Sto64: {
2606            HReg tLo = newVRegI(env);
2607            HReg tHi = newVRegI(env);
2608            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2609            addInstr(env, mk_iMOVsd_RR(src,tHi));
2610            addInstr(env, mk_iMOVsd_RR(src,tLo));
2611            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
2612            *rHi = tHi;
2613            *rLo = tLo;
2614            return;
2615         }
2616
2617         /* 32Uto64(e) */
2618         case Iop_32Uto64: {
2619            HReg tLo = newVRegI(env);
2620            HReg tHi = newVRegI(env);
2621            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2622            addInstr(env, mk_iMOVsd_RR(src,tLo));
2623            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2624            *rHi = tHi;
2625            *rLo = tLo;
2626            return;
2627         }
2628
2629         /* 16Uto64(e) */
2630         case Iop_16Uto64: {
2631            HReg tLo = newVRegI(env);
2632            HReg tHi = newVRegI(env);
2633            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2634            addInstr(env, mk_iMOVsd_RR(src,tLo));
2635            addInstr(env, X86Instr_Alu32R(Xalu_AND,
2636                                          X86RMI_Imm(0xFFFF), tLo));
2637            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2638            *rHi = tHi;
2639            *rLo = tLo;
2640            return;
2641         }
2642
2643         /* V128{HI}to64 */
2644         case Iop_V128HIto64:
2645         case Iop_V128to64: {
2646            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
2647            HReg tLo = newVRegI(env);
2648            HReg tHi = newVRegI(env);
2649            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
2650            X86AMode* esp0  = X86AMode_IR(0,     hregX86_ESP());
2651            X86AMode* espLO = X86AMode_IR(off,   hregX86_ESP());
2652            X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
2653            sub_from_esp(env, 16);
2654            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
2655            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2656                                           X86RMI_Mem(espLO), tLo ));
2657            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2658                                           X86RMI_Mem(espHI), tHi ));
2659            add_to_esp(env, 16);
2660            *rHi = tHi;
2661            *rLo = tLo;
2662            return;
2663         }
2664
2665         /* could do better than this, but for now ... */
2666         case Iop_1Sto64: {
2667            HReg tLo = newVRegI(env);
2668            HReg tHi = newVRegI(env);
2669            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
2670            addInstr(env, X86Instr_Set32(cond,tLo));
2671            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
2672            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
2673            addInstr(env, mk_iMOVsd_RR(tLo, tHi));
2674            *rHi = tHi;
2675            *rLo = tLo;
2676            return;
2677         }
2678
2679         /* Not64(e) */
2680         case Iop_Not64: {
2681            HReg tLo = newVRegI(env);
2682            HReg tHi = newVRegI(env);
2683            HReg sHi, sLo;
2684            iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
2685            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2686            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2687            addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
2688            addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
2689            *rHi = tHi;
2690            *rLo = tLo;
2691            return;
2692         }
2693
2694         /* Left64(e) */
2695         case Iop_Left64: {
2696            HReg yLo, yHi;
2697            HReg tLo = newVRegI(env);
2698            HReg tHi = newVRegI(env);
2699            /* yHi:yLo = arg */
2700            iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
2701            /* tLo = 0 - yLo, and set carry */
2702            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
2703            addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2704            /* tHi = 0 - yHi - carry */
2705            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2706            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2707            /* So now we have tHi:tLo = -arg.  To finish off, or 'arg'
2708               back in, so as to give the final result
2709               tHi:tLo = arg | -arg. */
2710            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
2711            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
2712            *rHi = tHi;
2713            *rLo = tLo;
2714            return;
2715         }
2716
2717         /* --- patterns rooted at: CmpwNEZ64 --- */
2718
2719         /* CmpwNEZ64(e) */
2720         case Iop_CmpwNEZ64: {
2721
2722         DECLARE_PATTERN(p_CmpwNEZ64_Or64);
2723         DEFINE_PATTERN(p_CmpwNEZ64_Or64,
2724                        unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
2725         if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
2726            /* CmpwNEZ64(Or64(x,y)) */
2727            HReg xHi,xLo,yHi,yLo;
2728            HReg xBoth = newVRegI(env);
2729            HReg merged = newVRegI(env);
2730            HReg tmp2 = newVRegI(env);
2731
2732            iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
2733            addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
2734            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2735                                          X86RMI_Reg(xLo),xBoth));
2736
2737            iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
2738            addInstr(env, mk_iMOVsd_RR(yHi,merged));
2739            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2740                                          X86RMI_Reg(yLo),merged));
2741            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2742                                             X86RMI_Reg(xBoth),merged));
2743
2744            /* tmp2 = (merged | -merged) >>s 31 */
2745            addInstr(env, mk_iMOVsd_RR(merged,tmp2));
2746            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2747            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2748                                          X86RMI_Reg(merged), tmp2));
2749            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2750            *rHi = tmp2;
2751            *rLo = tmp2;
2752            return;
2753         } else {
2754            /* CmpwNEZ64(e) */
2755            HReg srcLo, srcHi;
2756            HReg tmp1  = newVRegI(env);
2757            HReg tmp2  = newVRegI(env);
2758            /* srcHi:srcLo = arg */
2759            iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
2760            /* tmp1 = srcHi | srcLo */
2761            addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
2762            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2763                                          X86RMI_Reg(srcLo), tmp1));
2764            /* tmp2 = (tmp1 | -tmp1) >>s 31 */
2765            addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
2766            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2767            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2768                                          X86RMI_Reg(tmp1), tmp2));
2769            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2770            *rHi = tmp2;
2771            *rLo = tmp2;
2772            return;
2773         }
2774         }
2775
2776         /* ReinterpF64asI64(e) */
2777         /* Given an IEEE754 double, produce an I64 with the same bit
2778            pattern. */
2779         case Iop_ReinterpF64asI64: {
2780            HReg rf   = iselDblExpr(env, e->Iex.Unop.arg);
2781            HReg tLo  = newVRegI(env);
2782            HReg tHi  = newVRegI(env);
2783            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2784            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2785            /* paranoia */
2786            set_FPU_rounding_default(env);
2787            /* subl $8, %esp */
2788            sub_from_esp(env, 8);
2789            /* gstD %rf, 0(%esp) */
2790            addInstr(env,
2791                     X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
2792            /* movl 0(%esp), %tLo */
2793            addInstr(env,
2794                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2795            /* movl 4(%esp), %tHi */
2796            addInstr(env,
2797                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2798            /* addl $8, %esp */
2799            add_to_esp(env, 8);
2800            *rHi = tHi;
2801            *rLo = tLo;
2802            return;
2803         }
2804
2805         case Iop_CmpNEZ32x2:
2806            fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
2807         case Iop_CmpNEZ16x4:
2808            fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
2809         case Iop_CmpNEZ8x8:
2810            fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
2811         unish: {
2812            /* Note: the following assumes all helpers are of
2813               signature
2814                  ULong fn ( ULong ), and they are
2815               not marked as regparm functions.
2816            */
2817            HReg xLo, xHi;
2818            HReg tLo = newVRegI(env);
2819            HReg tHi = newVRegI(env);
2820            iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
2821            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2822            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2823            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2824                                         0, mk_RetLoc_simple(RLPri_2Int) ));
2825            add_to_esp(env, 2*4);
2826            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2827            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2828            *rHi = tHi;
2829            *rLo = tLo;
2830            return;
2831         }
2832
2833         default:
2834            break;
2835      }
2836   } /* if (e->tag == Iex_Unop) */
2837
2838
2839   /* --------- CCALL --------- */
2840   if (e->tag == Iex_CCall) {
2841      HReg tLo = newVRegI(env);
2842      HReg tHi = newVRegI(env);
2843
2844      /* Marshal args, do the call, clear stack. */
2845      UInt   addToSp = 0;
2846      RetLoc rloc    = mk_RetLoc_INVALID();
2847      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2848                    e->Iex.CCall.cee,
2849                    e->Iex.CCall.retty, e->Iex.CCall.args );
2850      vassert(is_sane_RetLoc(rloc));
2851      vassert(rloc.pri == RLPri_2Int);
2852      vassert(addToSp == 0);
2853      /* */
2854
2855      addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2856      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2857      *rHi = tHi;
2858      *rLo = tLo;
2859      return;
2860   }
2861
2862   ppIRExpr(e);
2863   vpanic("iselInt64Expr");
2864}
2865
2866
2867/*---------------------------------------------------------*/
2868/*--- ISEL: Floating point expressions (32 bit)         ---*/
2869/*---------------------------------------------------------*/
2870
2871/* Nothing interesting here; really just wrappers for
2872   64-bit stuff. */
2873
2874static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
2875{
2876   HReg r = iselFltExpr_wrk( env, e );
2877#  if 0
2878   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2879#  endif
2880   vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
2881   vassert(hregIsVirtual(r));
2882   return r;
2883}
2884
2885/* DO NOT CALL THIS DIRECTLY */
2886static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
2887{
2888   IRType ty = typeOfIRExpr(env->type_env,e);
2889   vassert(ty == Ity_F32);
2890
2891   if (e->tag == Iex_RdTmp) {
2892      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2893   }
2894
2895   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2896      X86AMode* am;
2897      HReg res = newVRegF(env);
2898      vassert(e->Iex.Load.ty == Ity_F32);
2899      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2900      addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
2901      return res;
2902   }
2903
2904   if (e->tag == Iex_Binop
2905       && e->Iex.Binop.op == Iop_F64toF32) {
2906      /* Although the result is still held in a standard FPU register,
2907         we need to round it to reflect the loss of accuracy/range
2908         entailed in casting it to a 32-bit float. */
2909      HReg dst = newVRegF(env);
2910      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2911      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2912      addInstr(env, X86Instr_Fp64to32(src,dst));
2913      set_FPU_rounding_default( env );
2914      return dst;
2915   }
2916
2917   if (e->tag == Iex_Get) {
2918      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
2919                                  hregX86_EBP() );
2920      HReg res = newVRegF(env);
2921      addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
2922      return res;
2923   }
2924
2925   if (e->tag == Iex_Unop
2926       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2927       /* Given an I32, produce an IEEE754 float with the same bit
2928          pattern. */
2929      HReg    dst = newVRegF(env);
2930      X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
2931      /* paranoia */
2932      addInstr(env, X86Instr_Push(rmi));
2933      addInstr(env, X86Instr_FpLdSt(
2934                       True/*load*/, 4, dst,
2935                       X86AMode_IR(0, hregX86_ESP())));
2936      add_to_esp(env, 4);
2937      return dst;
2938   }
2939
2940   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2941      HReg rf  = iselFltExpr(env, e->Iex.Binop.arg2);
2942      HReg dst = newVRegF(env);
2943
2944      /* rf now holds the value to be rounded.  The first thing to do
2945         is set the FPU's rounding mode accordingly. */
2946
2947      /* Set host rounding mode */
2948      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2949
2950      /* grndint %rf, %dst */
2951      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
2952
2953      /* Restore default FPU rounding. */
2954      set_FPU_rounding_default( env );
2955
2956      return dst;
2957   }
2958
2959   ppIRExpr(e);
2960   vpanic("iselFltExpr_wrk");
2961}
2962
2963
2964/*---------------------------------------------------------*/
2965/*--- ISEL: Floating point expressions (64 bit)         ---*/
2966/*---------------------------------------------------------*/
2967
2968/* Compute a 64-bit floating point value into a register, the identity
2969   of which is returned.  As with iselIntExpr_R, the reg may be either
2970   real or virtual; in any case it must not be changed by subsequent
2971   code emitted by the caller.  */
2972
2973/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2974
2975    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2976    ----                  ---------   -----------   -----------
2977    signalling NaN        u           2047 (max)    .0uuuuu---u
2978                                                    (with at least
2979                                                     one 1 bit)
2980    quiet NaN             u           2047 (max)    .1uuuuu---u
2981
2982    negative infinity     1           2047 (max)    .000000---0
2983
2984    positive infinity     0           2047 (max)    .000000---0
2985
2986    negative zero         1           0             .000000---0
2987
2988    positive zero         0           0             .000000---0
2989*/
2990
2991static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
2992{
2993   HReg r = iselDblExpr_wrk( env, e );
2994#  if 0
2995   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2996#  endif
2997   vassert(hregClass(r) == HRcFlt64);
2998   vassert(hregIsVirtual(r));
2999   return r;
3000}
3001
3002/* DO NOT CALL THIS DIRECTLY */
3003static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
3004{
3005   IRType ty = typeOfIRExpr(env->type_env,e);
3006   vassert(e);
3007   vassert(ty == Ity_F64);
3008
3009   if (e->tag == Iex_RdTmp) {
3010      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3011   }
3012
3013   if (e->tag == Iex_Const) {
3014      union { UInt u32x2[2]; ULong u64; Double f64; } u;
3015      HReg freg = newVRegF(env);
3016      vassert(sizeof(u) == 8);
3017      vassert(sizeof(u.u64) == 8);
3018      vassert(sizeof(u.f64) == 8);
3019      vassert(sizeof(u.u32x2) == 8);
3020
3021      if (e->Iex.Const.con->tag == Ico_F64) {
3022         u.f64 = e->Iex.Const.con->Ico.F64;
3023      }
3024      else if (e->Iex.Const.con->tag == Ico_F64i) {
3025         u.u64 = e->Iex.Const.con->Ico.F64i;
3026      }
3027      else
3028         vpanic("iselDblExpr(x86): const");
3029
3030      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
3031      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
3032      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
3033                                    X86AMode_IR(0, hregX86_ESP())));
3034      add_to_esp(env, 8);
3035      return freg;
3036   }
3037
3038   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3039      X86AMode* am;
3040      HReg res = newVRegF(env);
3041      vassert(e->Iex.Load.ty == Ity_F64);
3042      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3043      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
3044      return res;
3045   }
3046
3047   if (e->tag == Iex_Get) {
3048      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
3049                                  hregX86_EBP() );
3050      HReg res = newVRegF(env);
3051      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
3052      return res;
3053   }
3054
3055   if (e->tag == Iex_GetI) {
3056      X86AMode* am
3057         = genGuestArrayOffset(
3058              env, e->Iex.GetI.descr,
3059                   e->Iex.GetI.ix, e->Iex.GetI.bias );
3060      HReg res = newVRegF(env);
3061      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
3062      return res;
3063   }
3064
3065   if (e->tag == Iex_Triop) {
3066      X86FpOp fpop = Xfp_INVALID;
3067      IRTriop *triop = e->Iex.Triop.details;
3068      switch (triop->op) {
3069         case Iop_AddF64:    fpop = Xfp_ADD; break;
3070         case Iop_SubF64:    fpop = Xfp_SUB; break;
3071         case Iop_MulF64:    fpop = Xfp_MUL; break;
3072         case Iop_DivF64:    fpop = Xfp_DIV; break;
3073         case Iop_ScaleF64:  fpop = Xfp_SCALE; break;
3074         case Iop_Yl2xF64:   fpop = Xfp_YL2X; break;
3075         case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
3076         case Iop_AtanF64:   fpop = Xfp_ATAN; break;
3077         case Iop_PRemF64:   fpop = Xfp_PREM; break;
3078         case Iop_PRem1F64:  fpop = Xfp_PREM1; break;
3079         default: break;
3080      }
3081      if (fpop != Xfp_INVALID) {
3082         HReg res  = newVRegF(env);
3083         HReg srcL = iselDblExpr(env, triop->arg2);
3084         HReg srcR = iselDblExpr(env, triop->arg3);
3085         /* XXXROUNDINGFIXME */
3086         /* set roundingmode here */
3087         addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
3088	 if (fpop != Xfp_ADD && fpop != Xfp_SUB
3089	     && fpop != Xfp_MUL && fpop != Xfp_DIV)
3090            roundToF64(env, res);
3091         return res;
3092      }
3093   }
3094
3095   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
3096      HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
3097      HReg dst = newVRegF(env);
3098
3099      /* rf now holds the value to be rounded.  The first thing to do
3100         is set the FPU's rounding mode accordingly. */
3101
3102      /* Set host rounding mode */
3103      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3104
3105      /* grndint %rf, %dst */
3106      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
3107
3108      /* Restore default FPU rounding. */
3109      set_FPU_rounding_default( env );
3110
3111      return dst;
3112   }
3113
3114   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3115      HReg dst = newVRegF(env);
3116      HReg rHi,rLo;
3117      iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
3118      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3119      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3120
3121      /* Set host rounding mode */
3122      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3123
3124      addInstr(env, X86Instr_FpLdStI(
3125                       True/*load*/, 8, dst,
3126                       X86AMode_IR(0, hregX86_ESP())));
3127
3128      /* Restore default FPU rounding. */
3129      set_FPU_rounding_default( env );
3130
3131      add_to_esp(env, 8);
3132      return dst;
3133   }
3134
3135   if (e->tag == Iex_Binop) {
3136      X86FpOp fpop = Xfp_INVALID;
3137      switch (e->Iex.Binop.op) {
3138         case Iop_SinF64:  fpop = Xfp_SIN; break;
3139         case Iop_CosF64:  fpop = Xfp_COS; break;
3140         case Iop_TanF64:  fpop = Xfp_TAN; break;
3141         case Iop_2xm1F64: fpop = Xfp_2XM1; break;
3142         case Iop_SqrtF64: fpop = Xfp_SQRT; break;
3143         default: break;
3144      }
3145      if (fpop != Xfp_INVALID) {
3146         HReg res = newVRegF(env);
3147         HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
3148         /* XXXROUNDINGFIXME */
3149         /* set roundingmode here */
3150         /* Note that X86Instr_FpUnary(Xfp_TAN,..) sets the condition
3151            codes.  I don't think that matters, since this insn
3152            selector never generates such an instruction intervening
3153            between an flag-setting instruction and a flag-using
3154            instruction. */
3155         addInstr(env, X86Instr_FpUnary(fpop,src,res));
3156	 if (fpop != Xfp_SQRT
3157             && fpop != Xfp_NEG && fpop != Xfp_ABS)
3158            roundToF64(env, res);
3159         return res;
3160      }
3161   }
3162
3163   if (e->tag == Iex_Unop) {
3164      X86FpOp fpop = Xfp_INVALID;
3165      switch (e->Iex.Unop.op) {
3166         case Iop_NegF64:  fpop = Xfp_NEG; break;
3167         case Iop_AbsF64:  fpop = Xfp_ABS; break;
3168         default: break;
3169      }
3170      if (fpop != Xfp_INVALID) {
3171         HReg res = newVRegF(env);
3172         HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3173         addInstr(env, X86Instr_FpUnary(fpop,src,res));
3174         /* No need to do roundToF64(env,res) for Xfp_NEG or Xfp_ABS,
3175            but might need to do that for other unary ops. */
3176         return res;
3177      }
3178   }
3179
3180   if (e->tag == Iex_Unop) {
3181      switch (e->Iex.Unop.op) {
3182         case Iop_I32StoF64: {
3183            HReg dst = newVRegF(env);
3184            HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3185            addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3186            set_FPU_rounding_default(env);
3187            addInstr(env, X86Instr_FpLdStI(
3188                             True/*load*/, 4, dst,
3189                             X86AMode_IR(0, hregX86_ESP())));
3190	    add_to_esp(env, 4);
3191            return dst;
3192         }
3193         case Iop_ReinterpI64asF64: {
3194            /* Given an I64, produce an IEEE754 double with the same
3195               bit pattern. */
3196            HReg dst = newVRegF(env);
3197            HReg rHi, rLo;
3198	    iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
3199            /* paranoia */
3200            set_FPU_rounding_default(env);
3201            addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3202            addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3203            addInstr(env, X86Instr_FpLdSt(
3204                             True/*load*/, 8, dst,
3205                             X86AMode_IR(0, hregX86_ESP())));
3206	    add_to_esp(env, 8);
3207            return dst;
3208	 }
3209         case Iop_F32toF64: {
3210            /* this is a no-op */
3211            HReg res = iselFltExpr(env, e->Iex.Unop.arg);
3212            return res;
3213	 }
3214         default:
3215            break;
3216      }
3217   }
3218
3219   /* --------- MULTIPLEX --------- */
3220   if (e->tag == Iex_ITE) { // VFD
3221     if (ty == Ity_F64
3222         && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
3223        HReg r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
3224        HReg r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
3225        HReg dst = newVRegF(env);
3226        addInstr(env, X86Instr_FpUnary(Xfp_MOV,r1,dst));
3227        X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3228        addInstr(env, X86Instr_FpCMov(cc ^ 1, r0, dst));
3229        return dst;
3230      }
3231   }
3232
3233   ppIRExpr(e);
3234   vpanic("iselDblExpr_wrk");
3235}
3236
3237
3238/*---------------------------------------------------------*/
3239/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3240/*---------------------------------------------------------*/
3241
3242static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
3243{
3244   HReg r = iselVecExpr_wrk( env, e );
3245#  if 0
3246   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3247#  endif
3248   vassert(hregClass(r) == HRcVec128);
3249   vassert(hregIsVirtual(r));
3250   return r;
3251}
3252
3253
3254/* DO NOT CALL THIS DIRECTLY */
3255static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
3256{
3257
3258#  define REQUIRE_SSE1                                    \
3259      do { if (env->hwcaps == 0/*baseline, no sse*/       \
3260               ||  env->hwcaps == VEX_HWCAPS_X86_MMXEXT /*Integer SSE*/) \
3261              goto vec_fail;                              \
3262      } while (0)
3263
3264#  define REQUIRE_SSE2                                    \
3265      do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2))  \
3266              goto vec_fail;                              \
3267      } while (0)
3268
3269#  define SSE2_OR_ABOVE                                   \
3270       (env->hwcaps & VEX_HWCAPS_X86_SSE2)
3271
3272   HWord     fn = 0; /* address of helper fn, if required */
3273   MatchInfo mi;
3274   Bool      arg1isEReg = False;
3275   X86SseOp  op = Xsse_INVALID;
3276   IRType    ty = typeOfIRExpr(env->type_env,e);
3277   vassert(e);
3278   vassert(ty == Ity_V128);
3279
3280   REQUIRE_SSE1;
3281
3282   if (e->tag == Iex_RdTmp) {
3283      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3284   }
3285
3286   if (e->tag == Iex_Get) {
3287      HReg dst = newVRegV(env);
3288      addInstr(env, X86Instr_SseLdSt(
3289                       True/*load*/,
3290                       dst,
3291                       X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
3292                    )
3293              );
3294      return dst;
3295   }
3296
3297   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3298      HReg      dst = newVRegV(env);
3299      X86AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3300      addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
3301      return dst;
3302   }
3303
3304   if (e->tag == Iex_Const) {
3305      HReg dst = newVRegV(env);
3306      vassert(e->Iex.Const.con->tag == Ico_V128);
3307      addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
3308      return dst;
3309   }
3310
3311   if (e->tag == Iex_Unop) {
3312
3313   if (SSE2_OR_ABOVE) {
3314      /* 64UtoV128(LDle:I64(addr)) */
3315      DECLARE_PATTERN(p_zwiden_load64);
3316      DEFINE_PATTERN(p_zwiden_load64,
3317                     unop(Iop_64UtoV128,
3318                          IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
3319      if (matchIRExpr(&mi, p_zwiden_load64, e)) {
3320         X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
3321         HReg dst = newVRegV(env);
3322         addInstr(env, X86Instr_SseLdzLO(8, dst, am));
3323         return dst;
3324      }
3325   }
3326
3327   switch (e->Iex.Unop.op) {
3328
3329      case Iop_NotV128: {
3330         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3331         return do_sse_Not128(env, arg);
3332      }
3333
3334      case Iop_CmpNEZ64x2: {
3335         /* We can use SSE2 instructions for this. */
3336         /* Ideally, we want to do a 64Ix2 comparison against zero of
3337            the operand.  Problem is no such insn exists.  Solution
3338            therefore is to do a 32Ix4 comparison instead, and bitwise-
3339            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3340            let the not'd result of this initial comparison be a:b:c:d.
3341            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3342            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3343            giving the required result.
3344
3345            The required selection sequence is 2,3,0,1, which
3346            according to Intel's documentation means the pshufd
3347            literal value is 0xB1, that is,
3348            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3349         */
3350         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3351         HReg tmp  = newVRegV(env);
3352         HReg dst  = newVRegV(env);
3353         REQUIRE_SSE2;
3354         addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
3355         addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
3356         tmp = do_sse_Not128(env, tmp);
3357         addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
3358         addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
3359         return dst;
3360      }
3361
3362      case Iop_CmpNEZ32x4: {
3363         /* Sigh, we have to generate lousy code since this has to
3364            work on SSE1 hosts */
3365         /* basically, the idea is: for each lane:
3366               movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
3367               sbbl %r, %r               (now %r = 1Sto32(CF))
3368               movl %r, lane
3369         */
3370         Int       i;
3371         X86AMode* am;
3372         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3373         HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
3374         HReg      dst  = newVRegV(env);
3375         HReg      r32  = newVRegI(env);
3376         sub_from_esp(env, 16);
3377         addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
3378         for (i = 0; i < 4; i++) {
3379            am = X86AMode_IR(i*4, hregX86_ESP());
3380            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
3381            addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
3382            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
3383            addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
3384         }
3385         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3386         add_to_esp(env, 16);
3387         return dst;
3388      }
3389
3390      case Iop_CmpNEZ8x16:
3391      case Iop_CmpNEZ16x8: {
3392         /* We can use SSE2 instructions for this. */
3393         HReg arg;
3394         HReg vec0 = newVRegV(env);
3395         HReg vec1 = newVRegV(env);
3396         HReg dst  = newVRegV(env);
3397         X86SseOp cmpOp
3398            = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
3399                                             : Xsse_CMPEQ8;
3400         REQUIRE_SSE2;
3401         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
3402         addInstr(env, mk_vMOVsd_RR(vec0, vec1));
3403         addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
3404         /* defer arg computation to here so as to give CMPEQF as long
3405            as possible to complete */
3406         arg = iselVecExpr(env, e->Iex.Unop.arg);
3407         /* vec0 is all 0s; vec1 is all 1s */
3408         addInstr(env, mk_vMOVsd_RR(arg, dst));
3409         /* 16x8 or 8x16 comparison == */
3410         addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
3411         /* invert result */
3412         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
3413         return dst;
3414      }
3415
3416      case Iop_RecipEst32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
3417      case Iop_RSqrtEst32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
3418      do_32Fx4_unary:
3419      {
3420         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3421         HReg dst = newVRegV(env);
3422         addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
3423         return dst;
3424      }
3425
3426      case Iop_RecipEst32F0x4: op = Xsse_RCPF;   goto do_32F0x4_unary;
3427      case Iop_RSqrtEst32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
3428      case Iop_Sqrt32F0x4:     op = Xsse_SQRTF;  goto do_32F0x4_unary;
3429      do_32F0x4_unary:
3430      {
3431         /* A bit subtle.  We have to copy the arg to the result
3432            register first, because actually doing the SSE scalar insn
3433            leaves the upper 3/4 of the destination register
3434            unchanged.  Whereas the required semantics of these
3435            primops is that the upper 3/4 is simply copied in from the
3436            argument. */
3437         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3438         HReg dst = newVRegV(env);
3439         addInstr(env, mk_vMOVsd_RR(arg, dst));
3440         addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
3441         return dst;
3442      }
3443
3444      case Iop_Sqrt64F0x2:  op = Xsse_SQRTF;  goto do_64F0x2_unary;
3445      do_64F0x2_unary:
3446      {
3447         /* A bit subtle.  We have to copy the arg to the result
3448            register first, because actually doing the SSE scalar insn
3449            leaves the upper half of the destination register
3450            unchanged.  Whereas the required semantics of these
3451            primops is that the upper half is simply copied in from the
3452            argument. */
3453         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3454         HReg dst = newVRegV(env);
3455         REQUIRE_SSE2;
3456         addInstr(env, mk_vMOVsd_RR(arg, dst));
3457         addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
3458         return dst;
3459      }
3460
3461      case Iop_32UtoV128: {
3462         HReg      dst  = newVRegV(env);
3463         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3464         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3465         addInstr(env, X86Instr_Push(rmi));
3466	 addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
3467         add_to_esp(env, 4);
3468         return dst;
3469      }
3470
3471      case Iop_64UtoV128: {
3472         HReg      rHi, rLo;
3473         HReg      dst  = newVRegV(env);
3474         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3475         iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
3476         addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3477         addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3478	 addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
3479         add_to_esp(env, 8);
3480         return dst;
3481      }
3482
3483      default:
3484         break;
3485   } /* switch (e->Iex.Unop.op) */
3486   } /* if (e->tag == Iex_Unop) */
3487
3488   if (e->tag == Iex_Binop) {
3489   switch (e->Iex.Binop.op) {
3490
3491      case Iop_Sqrt64Fx2:
3492         REQUIRE_SSE2;
3493         /* fallthrough */
3494      case Iop_Sqrt32Fx4: {
3495         /* :: (rmode, vec) -> vec */
3496         HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3497         HReg dst = newVRegV(env);
3498         /* XXXROUNDINGFIXME */
3499         /* set roundingmode here */
3500         addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3501                           ? X86Instr_Sse64Fx2 : X86Instr_Sse32Fx4)
3502                       (Xsse_SQRTF, arg, dst));
3503         return dst;
3504      }
3505
3506      case Iop_SetV128lo32: {
3507         HReg dst = newVRegV(env);
3508         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3509         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3510         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3511         sub_from_esp(env, 16);
3512         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3513         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
3514         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3515         add_to_esp(env, 16);
3516         return dst;
3517      }
3518
3519      case Iop_SetV128lo64: {
3520         HReg dst = newVRegV(env);
3521         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3522         HReg srcIhi, srcIlo;
3523         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3524         X86AMode* esp4 = advance4(esp0);
3525         iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
3526         sub_from_esp(env, 16);
3527         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3528         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
3529         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
3530         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3531         add_to_esp(env, 16);
3532         return dst;
3533      }
3534
3535      case Iop_64HLtoV128: {
3536         HReg r3, r2, r1, r0;
3537         X86AMode* esp0  = X86AMode_IR(0, hregX86_ESP());
3538         X86AMode* esp4  = advance4(esp0);
3539         X86AMode* esp8  = advance4(esp4);
3540         X86AMode* esp12 = advance4(esp8);
3541         HReg dst = newVRegV(env);
3542	 /* do this via the stack (easy, convenient, etc) */
3543         sub_from_esp(env, 16);
3544         /* Do the less significant 64 bits */
3545         iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
3546         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
3547         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
3548         /* Do the more significant 64 bits */
3549         iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
3550         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
3551         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
3552	 /* Fetch result back from stack. */
3553         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3554         add_to_esp(env, 16);
3555         return dst;
3556      }
3557
3558      case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
3559      case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
3560      case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
3561      case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
3562      case Iop_Max32Fx4:   op = Xsse_MAXF;   goto do_32Fx4;
3563      case Iop_Min32Fx4:   op = Xsse_MINF;   goto do_32Fx4;
3564      do_32Fx4:
3565      {
3566         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3567         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3568         HReg dst = newVRegV(env);
3569         addInstr(env, mk_vMOVsd_RR(argL, dst));
3570         addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
3571         return dst;
3572      }
3573
3574      case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
3575      case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
3576      case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
3577      case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
3578      case Iop_Max64Fx2:   op = Xsse_MAXF;   goto do_64Fx2;
3579      case Iop_Min64Fx2:   op = Xsse_MINF;   goto do_64Fx2;
3580      do_64Fx2:
3581      {
3582         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3583         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3584         HReg dst = newVRegV(env);
3585         REQUIRE_SSE2;
3586         addInstr(env, mk_vMOVsd_RR(argL, dst));
3587         addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
3588         return dst;
3589      }
3590
3591      case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
3592      case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
3593      case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
3594      case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
3595      case Iop_Add32F0x4:   op = Xsse_ADDF;   goto do_32F0x4;
3596      case Iop_Div32F0x4:   op = Xsse_DIVF;   goto do_32F0x4;
3597      case Iop_Max32F0x4:   op = Xsse_MAXF;   goto do_32F0x4;
3598      case Iop_Min32F0x4:   op = Xsse_MINF;   goto do_32F0x4;
3599      case Iop_Mul32F0x4:   op = Xsse_MULF;   goto do_32F0x4;
3600      case Iop_Sub32F0x4:   op = Xsse_SUBF;   goto do_32F0x4;
3601      do_32F0x4: {
3602         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3603         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3604         HReg dst = newVRegV(env);
3605         addInstr(env, mk_vMOVsd_RR(argL, dst));
3606         addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
3607         return dst;
3608      }
3609
3610      case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
3611      case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
3612      case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
3613      case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
3614      case Iop_Add64F0x2:   op = Xsse_ADDF;   goto do_64F0x2;
3615      case Iop_Div64F0x2:   op = Xsse_DIVF;   goto do_64F0x2;
3616      case Iop_Max64F0x2:   op = Xsse_MAXF;   goto do_64F0x2;
3617      case Iop_Min64F0x2:   op = Xsse_MINF;   goto do_64F0x2;
3618      case Iop_Mul64F0x2:   op = Xsse_MULF;   goto do_64F0x2;
3619      case Iop_Sub64F0x2:   op = Xsse_SUBF;   goto do_64F0x2;
3620      do_64F0x2: {
3621         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3622         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3623         HReg dst = newVRegV(env);
3624         REQUIRE_SSE2;
3625         addInstr(env, mk_vMOVsd_RR(argL, dst));
3626         addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
3627         return dst;
3628      }
3629
3630      case Iop_QNarrowBin32Sto16Sx8:
3631         op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3632      case Iop_QNarrowBin16Sto8Sx16:
3633         op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3634      case Iop_QNarrowBin16Sto8Ux16:
3635         op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3636
3637      case Iop_InterleaveHI8x16:
3638         op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3639      case Iop_InterleaveHI16x8:
3640         op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3641      case Iop_InterleaveHI32x4:
3642         op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3643      case Iop_InterleaveHI64x2:
3644         op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3645
3646      case Iop_InterleaveLO8x16:
3647         op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3648      case Iop_InterleaveLO16x8:
3649         op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3650      case Iop_InterleaveLO32x4:
3651         op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3652      case Iop_InterleaveLO64x2:
3653         op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3654
3655      case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
3656      case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
3657      case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
3658      case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
3659      case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
3660      case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
3661      case Iop_Add64x2:    op = Xsse_ADD64;    goto do_SseReRg;
3662      case Iop_QAdd8Sx16:  op = Xsse_QADD8S;   goto do_SseReRg;
3663      case Iop_QAdd16Sx8:  op = Xsse_QADD16S;  goto do_SseReRg;
3664      case Iop_QAdd8Ux16:  op = Xsse_QADD8U;   goto do_SseReRg;
3665      case Iop_QAdd16Ux8:  op = Xsse_QADD16U;  goto do_SseReRg;
3666      case Iop_Avg8Ux16:   op = Xsse_AVG8U;    goto do_SseReRg;
3667      case Iop_Avg16Ux8:   op = Xsse_AVG16U;   goto do_SseReRg;
3668      case Iop_CmpEQ8x16:  op = Xsse_CMPEQ8;   goto do_SseReRg;
3669      case Iop_CmpEQ16x8:  op = Xsse_CMPEQ16;  goto do_SseReRg;
3670      case Iop_CmpEQ32x4:  op = Xsse_CMPEQ32;  goto do_SseReRg;
3671      case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S;  goto do_SseReRg;
3672      case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
3673      case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
3674      case Iop_Max16Sx8:   op = Xsse_MAX16S;   goto do_SseReRg;
3675      case Iop_Max8Ux16:   op = Xsse_MAX8U;    goto do_SseReRg;
3676      case Iop_Min16Sx8:   op = Xsse_MIN16S;   goto do_SseReRg;
3677      case Iop_Min8Ux16:   op = Xsse_MIN8U;    goto do_SseReRg;
3678      case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
3679      case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
3680      case Iop_Mul16x8:    op = Xsse_MUL16;    goto do_SseReRg;
3681      case Iop_Sub8x16:    op = Xsse_SUB8;     goto do_SseReRg;
3682      case Iop_Sub16x8:    op = Xsse_SUB16;    goto do_SseReRg;
3683      case Iop_Sub32x4:    op = Xsse_SUB32;    goto do_SseReRg;
3684      case Iop_Sub64x2:    op = Xsse_SUB64;    goto do_SseReRg;
3685      case Iop_QSub8Sx16:  op = Xsse_QSUB8S;   goto do_SseReRg;
3686      case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
3687      case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
3688      case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
3689      do_SseReRg: {
3690         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3691         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3692         HReg dst = newVRegV(env);
3693         if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
3694            REQUIRE_SSE2;
3695         if (arg1isEReg) {
3696            addInstr(env, mk_vMOVsd_RR(arg2, dst));
3697            addInstr(env, X86Instr_SseReRg(op, arg1, dst));
3698         } else {
3699            addInstr(env, mk_vMOVsd_RR(arg1, dst));
3700            addInstr(env, X86Instr_SseReRg(op, arg2, dst));
3701         }
3702         return dst;
3703      }
3704
3705      case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
3706      case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
3707      case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
3708      case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
3709      case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
3710      case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
3711      case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
3712      case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
3713      do_SseShift: {
3714         HReg      greg = iselVecExpr(env, e->Iex.Binop.arg1);
3715         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3716         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3717         HReg      ereg = newVRegV(env);
3718         HReg      dst  = newVRegV(env);
3719         REQUIRE_SSE2;
3720         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3721         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3722         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3723         addInstr(env, X86Instr_Push(rmi));
3724         addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
3725	 addInstr(env, mk_vMOVsd_RR(greg, dst));
3726         addInstr(env, X86Instr_SseReRg(op, ereg, dst));
3727         add_to_esp(env, 16);
3728         return dst;
3729      }
3730
3731      case Iop_NarrowBin32to16x8:
3732         fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3733         goto do_SseAssistedBinary;
3734      case Iop_NarrowBin16to8x16:
3735         fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3736         goto do_SseAssistedBinary;
3737      do_SseAssistedBinary: {
3738         /* As with the amd64 case (where this is copied from) we
3739            generate pretty bad code. */
3740         vassert(fn != 0);
3741         HReg dst = newVRegV(env);
3742         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3743         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3744         HReg argp = newVRegI(env);
3745         /* subl $112, %esp         -- make a space */
3746         sub_from_esp(env, 112);
3747         /* leal 48(%esp), %r_argp  -- point into it */
3748         addInstr(env, X86Instr_Lea32(X86AMode_IR(48, hregX86_ESP()),
3749                                      argp));
3750         /* andl $-16, %r_argp      -- 16-align the pointer */
3751         addInstr(env, X86Instr_Alu32R(Xalu_AND,
3752                                       X86RMI_Imm( ~(UInt)15 ),
3753                                       argp));
3754         /* Prepare 3 arg regs:
3755            leal  0(%r_argp), %eax
3756            leal 16(%r_argp), %edx
3757            leal 32(%r_argp), %ecx
3758         */
3759         addInstr(env, X86Instr_Lea32(X86AMode_IR(0, argp),
3760                                      hregX86_EAX()));
3761         addInstr(env, X86Instr_Lea32(X86AMode_IR(16, argp),
3762                                      hregX86_EDX()));
3763         addInstr(env, X86Instr_Lea32(X86AMode_IR(32, argp),
3764                                      hregX86_ECX()));
3765         /* Store the two args, at (%edx) and (%ecx):
3766            movupd  %argL, 0(%edx)
3767            movupd  %argR, 0(%ecx)
3768         */
3769         addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argL,
3770                                        X86AMode_IR(0, hregX86_EDX())));
3771         addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argR,
3772                                        X86AMode_IR(0, hregX86_ECX())));
3773         /* call the helper */
3774         addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
3775                                      3, mk_RetLoc_simple(RLPri_None) ));
3776         /* fetch the result from memory, using %r_argp, which the
3777            register allocator will keep alive across the call. */
3778         addInstr(env, X86Instr_SseLdSt(True/*isLoad*/, dst,
3779                                        X86AMode_IR(0, argp)));
3780         /* and finally, clear the space */
3781         add_to_esp(env, 112);
3782         return dst;
3783      }
3784
3785      default:
3786         break;
3787   } /* switch (e->Iex.Binop.op) */
3788   } /* if (e->tag == Iex_Binop) */
3789
3790
3791   if (e->tag == Iex_Triop) {
3792   IRTriop *triop = e->Iex.Triop.details;
3793   switch (triop->op) {
3794
3795      case Iop_Add32Fx4: op = Xsse_ADDF; goto do_32Fx4_w_rm;
3796      case Iop_Sub32Fx4: op = Xsse_SUBF; goto do_32Fx4_w_rm;
3797      case Iop_Mul32Fx4: op = Xsse_MULF; goto do_32Fx4_w_rm;
3798      case Iop_Div32Fx4: op = Xsse_DIVF; goto do_32Fx4_w_rm;
3799      do_32Fx4_w_rm:
3800      {
3801         HReg argL = iselVecExpr(env, triop->arg2);
3802         HReg argR = iselVecExpr(env, triop->arg3);
3803         HReg dst = newVRegV(env);
3804         addInstr(env, mk_vMOVsd_RR(argL, dst));
3805         /* XXXROUNDINGFIXME */
3806         /* set roundingmode here */
3807         addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
3808         return dst;
3809      }
3810
3811      case Iop_Add64Fx2: op = Xsse_ADDF; goto do_64Fx2_w_rm;
3812      case Iop_Sub64Fx2: op = Xsse_SUBF; goto do_64Fx2_w_rm;
3813      case Iop_Mul64Fx2: op = Xsse_MULF; goto do_64Fx2_w_rm;
3814      case Iop_Div64Fx2: op = Xsse_DIVF; goto do_64Fx2_w_rm;
3815      do_64Fx2_w_rm:
3816      {
3817         HReg argL = iselVecExpr(env, triop->arg2);
3818         HReg argR = iselVecExpr(env, triop->arg3);
3819         HReg dst = newVRegV(env);
3820         REQUIRE_SSE2;
3821         addInstr(env, mk_vMOVsd_RR(argL, dst));
3822         /* XXXROUNDINGFIXME */
3823         /* set roundingmode here */
3824         addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
3825         return dst;
3826      }
3827
3828      default:
3829         break;
3830   } /* switch (triop->op) */
3831   } /* if (e->tag == Iex_Triop) */
3832
3833
3834   if (e->tag == Iex_ITE) { // VFD
3835      HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
3836      HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
3837      HReg dst = newVRegV(env);
3838      addInstr(env, mk_vMOVsd_RR(r1,dst));
3839      X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3840      addInstr(env, X86Instr_SseCMov(cc ^ 1, r0, dst));
3841      return dst;
3842   }
3843
3844   vec_fail:
3845   vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
3846              LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
3847   ppIRExpr(e);
3848   vpanic("iselVecExpr_wrk");
3849
3850#  undef REQUIRE_SSE1
3851#  undef REQUIRE_SSE2
3852#  undef SSE2_OR_ABOVE
3853}
3854
3855
3856/*---------------------------------------------------------*/
3857/*--- ISEL: Statements                                  ---*/
3858/*---------------------------------------------------------*/
3859
3860static void iselStmt ( ISelEnv* env, IRStmt* stmt )
3861{
3862   if (vex_traceflags & VEX_TRACE_VCODE) {
3863      vex_printf("\n-- ");
3864      ppIRStmt(stmt);
3865      vex_printf("\n");
3866   }
3867
3868   switch (stmt->tag) {
3869
3870   /* --------- STORE --------- */
3871   case Ist_Store: {
3872      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
3873      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
3874      IREndness end   = stmt->Ist.Store.end;
3875
3876      if (tya != Ity_I32 || end != Iend_LE)
3877         goto stmt_fail;
3878
3879      if (tyd == Ity_I32) {
3880         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3881         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
3882         addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
3883         return;
3884      }
3885      if (tyd == Ity_I8 || tyd == Ity_I16) {
3886         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3887         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
3888         addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
3889                                       r,am ));
3890         return;
3891      }
3892      if (tyd == Ity_F64) {
3893         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3894         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
3895         addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
3896         return;
3897      }
3898      if (tyd == Ity_F32) {
3899         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3900         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
3901         addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
3902         return;
3903      }
3904      if (tyd == Ity_I64) {
3905         HReg vHi, vLo, rA;
3906         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
3907         rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
3908         addInstr(env, X86Instr_Alu32M(
3909                          Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
3910         addInstr(env, X86Instr_Alu32M(
3911                          Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
3912         return;
3913      }
3914      if (tyd == Ity_V128) {
3915         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3916         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
3917         addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
3918         return;
3919      }
3920      break;
3921   }
3922
3923   /* --------- PUT --------- */
3924   case Ist_Put: {
3925      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
3926      if (ty == Ity_I32) {
3927         /* We're going to write to memory, so compute the RHS into an
3928            X86RI. */
3929         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
3930         addInstr(env,
3931                  X86Instr_Alu32M(
3932                     Xalu_MOV,
3933                     ri,
3934                     X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
3935                 ));
3936         return;
3937      }
3938      if (ty == Ity_I8 || ty == Ity_I16) {
3939         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
3940         addInstr(env, X86Instr_Store(
3941                          toUChar(ty==Ity_I8 ? 1 : 2),
3942                          r,
3943                          X86AMode_IR(stmt->Ist.Put.offset,
3944                                      hregX86_EBP())));
3945         return;
3946      }
3947      if (ty == Ity_I64) {
3948         HReg vHi, vLo;
3949         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3950         X86AMode* am4 = advance4(am);
3951         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
3952         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
3953         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
3954         return;
3955      }
3956      if (ty == Ity_V128) {
3957         HReg      vec = iselVecExpr(env, stmt->Ist.Put.data);
3958         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3959         addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
3960         return;
3961      }
3962      if (ty == Ity_F32) {
3963         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
3964         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3965         set_FPU_rounding_default(env); /* paranoia */
3966         addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
3967         return;
3968      }
3969      if (ty == Ity_F64) {
3970         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
3971         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3972         set_FPU_rounding_default(env); /* paranoia */
3973         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
3974         return;
3975      }
3976      break;
3977   }
3978
3979   /* --------- Indexed PUT --------- */
3980   case Ist_PutI: {
3981      IRPutI *puti = stmt->Ist.PutI.details;
3982
3983      X86AMode* am
3984         = genGuestArrayOffset(
3985              env, puti->descr,
3986                   puti->ix, puti->bias );
3987
3988      IRType ty = typeOfIRExpr(env->type_env, puti->data);
3989      if (ty == Ity_F64) {
3990         HReg val = iselDblExpr(env, puti->data);
3991         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
3992         return;
3993      }
3994      if (ty == Ity_I8) {
3995         HReg r = iselIntExpr_R(env, puti->data);
3996         addInstr(env, X86Instr_Store( 1, r, am ));
3997         return;
3998      }
3999      if (ty == Ity_I32) {
4000         HReg r = iselIntExpr_R(env, puti->data);
4001         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
4002         return;
4003      }
4004      if (ty == Ity_I64) {
4005         HReg rHi, rLo;
4006         X86AMode* am4 = advance4(am);
4007         iselInt64Expr(&rHi, &rLo, env, puti->data);
4008         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
4009         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
4010         return;
4011      }
4012      break;
4013   }
4014
4015   /* --------- TMP --------- */
4016   case Ist_WrTmp: {
4017      IRTemp tmp = stmt->Ist.WrTmp.tmp;
4018      IRType ty = typeOfIRTemp(env->type_env, tmp);
4019
4020      /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
4021         compute it into an AMode and then use LEA.  This usually
4022         produces fewer instructions, often because (for memcheck
4023         created IR) we get t = address-expression, (t is later used
4024         twice) and so doing this naturally turns address-expression
4025         back into an X86 amode. */
4026      if (ty == Ity_I32
4027          && stmt->Ist.WrTmp.data->tag == Iex_Binop
4028          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
4029         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4030         HReg dst = lookupIRTemp(env, tmp);
4031         if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
4032            /* Hmm, iselIntExpr_AMode wimped out and just computed the
4033               value into a register.  Just emit a normal reg-reg move
4034               so reg-alloc can coalesce it away in the usual way. */
4035            HReg src = am->Xam.IR.reg;
4036            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
4037         } else {
4038            addInstr(env, X86Instr_Lea32(am,dst));
4039         }
4040         return;
4041      }
4042
4043      if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
4044         X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4045         HReg dst = lookupIRTemp(env, tmp);
4046         addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
4047         return;
4048      }
4049      if (ty == Ity_I64) {
4050         HReg rHi, rLo, dstHi, dstLo;
4051         iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4052         lookupIRTemp64( &dstHi, &dstLo, env, tmp);
4053         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4054         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4055         return;
4056      }
4057      if (ty == Ity_I1) {
4058         X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4059         HReg dst = lookupIRTemp(env, tmp);
4060         addInstr(env, X86Instr_Set32(cond, dst));
4061         return;
4062      }
4063      if (ty == Ity_F64) {
4064         HReg dst = lookupIRTemp(env, tmp);
4065         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4066         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
4067         return;
4068      }
4069      if (ty == Ity_F32) {
4070         HReg dst = lookupIRTemp(env, tmp);
4071         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4072         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
4073         return;
4074      }
4075      if (ty == Ity_V128) {
4076         HReg dst = lookupIRTemp(env, tmp);
4077         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4078         addInstr(env, mk_vMOVsd_RR(src,dst));
4079         return;
4080      }
4081      break;
4082   }
4083
4084   /* --------- Call to DIRTY helper --------- */
4085   case Ist_Dirty: {
4086      IRDirty* d = stmt->Ist.Dirty.details;
4087
4088      /* Figure out the return type, if any. */
4089      IRType retty = Ity_INVALID;
4090      if (d->tmp != IRTemp_INVALID)
4091         retty = typeOfIRTemp(env->type_env, d->tmp);
4092
4093      Bool retty_ok = False;
4094      switch (retty) {
4095         case Ity_INVALID: /* function doesn't return anything */
4096         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4097         case Ity_V128:
4098            retty_ok = True; break;
4099         default:
4100            break;
4101      }
4102      if (!retty_ok)
4103         break; /* will go to stmt_fail: */
4104
4105      /* Marshal args, do the call, and set the return value to
4106         0x555..555 if this is a conditional call that returns a value
4107         and the call is skipped. */
4108      UInt   addToSp = 0;
4109      RetLoc rloc    = mk_RetLoc_INVALID();
4110      doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4111      vassert(is_sane_RetLoc(rloc));
4112
4113      /* Now figure out what to do with the returned value, if any. */
4114      switch (retty) {
4115         case Ity_INVALID: {
4116            /* No return value.  Nothing to do. */
4117            vassert(d->tmp == IRTemp_INVALID);
4118            vassert(rloc.pri == RLPri_None);
4119            vassert(addToSp == 0);
4120            return;
4121         }
4122         case Ity_I32: case Ity_I16: case Ity_I8: {
4123            /* The returned value is in %eax.  Park it in the register
4124               associated with tmp. */
4125            vassert(rloc.pri == RLPri_Int);
4126            vassert(addToSp == 0);
4127            HReg dst = lookupIRTemp(env, d->tmp);
4128            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
4129            return;
4130         }
4131         case Ity_I64: {
4132            /* The returned value is in %edx:%eax.  Park it in the
4133               register-pair associated with tmp. */
4134            vassert(rloc.pri == RLPri_2Int);
4135            vassert(addToSp == 0);
4136            HReg dstHi, dstLo;
4137            lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
4138            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
4139            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
4140            return;
4141         }
4142         case Ity_V128: {
4143            /* The returned value is on the stack, and *retloc tells
4144               us where.  Fish it off the stack and then move the
4145               stack pointer upwards to clear it, as directed by
4146               doHelperCall. */
4147            vassert(rloc.pri == RLPri_V128SpRel);
4148            vassert(addToSp >= 16);
4149            HReg      dst = lookupIRTemp(env, d->tmp);
4150            X86AMode* am  = X86AMode_IR(rloc.spOff, hregX86_ESP());
4151            addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
4152            add_to_esp(env, addToSp);
4153            return;
4154         }
4155         default:
4156            /*NOTREACHED*/
4157            vassert(0);
4158      }
4159      break;
4160   }
4161
4162   /* --------- MEM FENCE --------- */
4163   case Ist_MBE:
4164      switch (stmt->Ist.MBE.event) {
4165         case Imbe_Fence:
4166            addInstr(env, X86Instr_MFence(env->hwcaps));
4167            return;
4168         default:
4169            break;
4170      }
4171      break;
4172
4173   /* --------- ACAS --------- */
4174   case Ist_CAS:
4175      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4176         /* "normal" singleton CAS */
4177         UChar  sz;
4178         IRCAS* cas = stmt->Ist.CAS.details;
4179         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4180         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
4181         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
4182         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4183         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4184         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4185         vassert(cas->expdHi == NULL);
4186         vassert(cas->dataHi == NULL);
4187         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4188         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4189         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4190         switch (ty) {
4191            case Ity_I32: sz = 4; break;
4192            case Ity_I16: sz = 2; break;
4193            case Ity_I8:  sz = 1; break;
4194            default: goto unhandled_cas;
4195         }
4196         addInstr(env, X86Instr_ACAS(am, sz));
4197         addInstr(env,
4198                  X86Instr_CMov32(Xcc_NZ,
4199                                  X86RM_Reg(hregX86_EAX()), rOldLo));
4200         return;
4201      } else {
4202         /* double CAS */
4203         IRCAS* cas = stmt->Ist.CAS.details;
4204         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4205         /* only 32-bit allowed in this case */
4206         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
4207         /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
4208         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
4209         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4210         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4211         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4212         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4213         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4214         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4215         if (ty != Ity_I32)
4216            goto unhandled_cas;
4217         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4218         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4219         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
4220         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4221         addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
4222         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4223         addInstr(env, X86Instr_DACAS(am));
4224         addInstr(env,
4225                  X86Instr_CMov32(Xcc_NZ,
4226                                  X86RM_Reg(hregX86_EDX()), rOldHi));
4227         addInstr(env,
4228                  X86Instr_CMov32(Xcc_NZ,
4229                                  X86RM_Reg(hregX86_EAX()), rOldLo));
4230         return;
4231      }
4232      unhandled_cas:
4233      break;
4234
4235   /* --------- INSTR MARK --------- */
4236   /* Doesn't generate any executable code ... */
4237   case Ist_IMark:
4238       return;
4239
4240   /* --------- NO-OP --------- */
4241   /* Fairly self-explanatory, wouldn't you say? */
4242   case Ist_NoOp:
4243       return;
4244
4245   /* --------- EXIT --------- */
4246   case Ist_Exit: {
4247      if (stmt->Ist.Exit.dst->tag != Ico_U32)
4248         vpanic("iselStmt(x86): Ist_Exit: dst is not a 32-bit value");
4249
4250      X86CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
4251      X86AMode*   amEIP = X86AMode_IR(stmt->Ist.Exit.offsIP,
4252                                      hregX86_EBP());
4253
4254      /* Case: boring transfer to known address */
4255      if (stmt->Ist.Exit.jk == Ijk_Boring) {
4256         if (env->chainingAllowed) {
4257            /* .. almost always true .. */
4258            /* Skip the event check at the dst if this is a forwards
4259               edge. */
4260            Bool toFastEP
4261               = ((Addr32)stmt->Ist.Exit.dst->Ico.U32) > env->max_ga;
4262            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4263            addInstr(env, X86Instr_XDirect(stmt->Ist.Exit.dst->Ico.U32,
4264                                           amEIP, cc, toFastEP));
4265         } else {
4266            /* .. very occasionally .. */
4267            /* We can't use chaining, so ask for an assisted transfer,
4268               as that's the only alternative that is allowable. */
4269            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4270            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, Ijk_Boring));
4271         }
4272         return;
4273      }
4274
4275      /* Case: assisted transfer to arbitrary address */
4276      switch (stmt->Ist.Exit.jk) {
4277         /* Keep this list in sync with that in iselNext below */
4278         case Ijk_ClientReq:
4279         case Ijk_EmWarn:
4280         case Ijk_MapFail:
4281         case Ijk_NoDecode:
4282         case Ijk_NoRedir:
4283         case Ijk_SigSEGV:
4284         case Ijk_SigTRAP:
4285         case Ijk_Sys_int128:
4286         case Ijk_Sys_int129:
4287         case Ijk_Sys_int130:
4288         case Ijk_Sys_int145:
4289         case Ijk_Sys_int210:
4290         case Ijk_Sys_syscall:
4291         case Ijk_Sys_sysenter:
4292         case Ijk_InvalICache:
4293         case Ijk_Yield:
4294         {
4295            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4296            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, stmt->Ist.Exit.jk));
4297            return;
4298         }
4299         default:
4300            break;
4301      }
4302
4303      /* Do we ever expect to see any other kind? */
4304      goto stmt_fail;
4305   }
4306
4307   default: break;
4308   }
4309  stmt_fail:
4310   ppIRStmt(stmt);
4311   vpanic("iselStmt");
4312}
4313
4314
4315/*---------------------------------------------------------*/
4316/*--- ISEL: Basic block terminators (Nexts)             ---*/
4317/*---------------------------------------------------------*/
4318
4319static void iselNext ( ISelEnv* env,
4320                       IRExpr* next, IRJumpKind jk, Int offsIP )
4321{
4322   if (vex_traceflags & VEX_TRACE_VCODE) {
4323      vex_printf( "\n-- PUT(%d) = ", offsIP);
4324      ppIRExpr( next );
4325      vex_printf( "; exit-");
4326      ppIRJumpKind(jk);
4327      vex_printf( "\n");
4328   }
4329
4330   /* Case: boring transfer to known address */
4331   if (next->tag == Iex_Const) {
4332      IRConst* cdst = next->Iex.Const.con;
4333      vassert(cdst->tag == Ico_U32);
4334      if (jk == Ijk_Boring || jk == Ijk_Call) {
4335         /* Boring transfer to known address */
4336         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4337         if (env->chainingAllowed) {
4338            /* .. almost always true .. */
4339            /* Skip the event check at the dst if this is a forwards
4340               edge. */
4341            Bool toFastEP
4342               = ((Addr32)cdst->Ico.U32) > env->max_ga;
4343            if (0) vex_printf("%s", toFastEP ? "X" : ".");
4344            addInstr(env, X86Instr_XDirect(cdst->Ico.U32,
4345                                           amEIP, Xcc_ALWAYS,
4346                                           toFastEP));
4347         } else {
4348            /* .. very occasionally .. */
4349            /* We can't use chaining, so ask for an assisted transfer,
4350               as that's the only alternative that is allowable. */
4351            HReg r = iselIntExpr_R(env, next);
4352            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4353                                             Ijk_Boring));
4354         }
4355         return;
4356      }
4357   }
4358
4359   /* Case: call/return (==boring) transfer to any address */
4360   switch (jk) {
4361      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4362         HReg      r     = iselIntExpr_R(env, next);
4363         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4364         if (env->chainingAllowed) {
4365            addInstr(env, X86Instr_XIndir(r, amEIP, Xcc_ALWAYS));
4366         } else {
4367            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4368                                               Ijk_Boring));
4369         }
4370         return;
4371      }
4372      default:
4373         break;
4374   }
4375
4376   /* Case: assisted transfer to arbitrary address */
4377   switch (jk) {
4378      /* Keep this list in sync with that for Ist_Exit above */
4379      case Ijk_ClientReq:
4380      case Ijk_EmWarn:
4381      case Ijk_MapFail:
4382      case Ijk_NoDecode:
4383      case Ijk_NoRedir:
4384      case Ijk_SigSEGV:
4385      case Ijk_SigTRAP:
4386      case Ijk_Sys_int128:
4387      case Ijk_Sys_int129:
4388      case Ijk_Sys_int130:
4389      case Ijk_Sys_int145:
4390      case Ijk_Sys_int210:
4391      case Ijk_Sys_syscall:
4392      case Ijk_Sys_sysenter:
4393      case Ijk_InvalICache:
4394      case Ijk_Yield:
4395      {
4396         HReg      r     = iselIntExpr_R(env, next);
4397         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4398         addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS, jk));
4399         return;
4400      }
4401      default:
4402         break;
4403   }
4404
4405   vex_printf( "\n-- PUT(%d) = ", offsIP);
4406   ppIRExpr( next );
4407   vex_printf( "; exit-");
4408   ppIRJumpKind(jk);
4409   vex_printf( "\n");
4410   vassert(0); // are we expecting any other kind?
4411}
4412
4413
4414/*---------------------------------------------------------*/
4415/*--- Insn selector top-level                           ---*/
4416/*---------------------------------------------------------*/
4417
4418/* Translate an entire SB to x86 code. */
4419
4420HInstrArray* iselSB_X86 ( const IRSB* bb,
4421                          VexArch      arch_host,
4422                          const VexArchInfo* archinfo_host,
4423                          const VexAbiInfo*  vbi/*UNUSED*/,
4424                          Int offs_Host_EvC_Counter,
4425                          Int offs_Host_EvC_FailAddr,
4426                          Bool chainingAllowed,
4427                          Bool addProfInc,
4428                          Addr max_ga )
4429{
4430   Int      i, j;
4431   HReg     hreg, hregHI;
4432   ISelEnv* env;
4433   UInt     hwcaps_host = archinfo_host->hwcaps;
4434   X86AMode *amCounter, *amFailAddr;
4435
4436   /* sanity ... */
4437   vassert(arch_host == VexArchX86);
4438   vassert(0 == (hwcaps_host
4439                 & ~(VEX_HWCAPS_X86_MMXEXT
4440                     | VEX_HWCAPS_X86_SSE1
4441                     | VEX_HWCAPS_X86_SSE2
4442                     | VEX_HWCAPS_X86_SSE3
4443                     | VEX_HWCAPS_X86_LZCNT)));
4444
4445   /* Check that the host's endianness is as expected. */
4446   vassert(archinfo_host->endness == VexEndnessLE);
4447
4448   /* Make up an initial environment to use. */
4449   env = LibVEX_Alloc_inline(sizeof(ISelEnv));
4450   env->vreg_ctr = 0;
4451
4452   /* Set up output code array. */
4453   env->code = newHInstrArray();
4454
4455   /* Copy BB's type env. */
4456   env->type_env = bb->tyenv;
4457
4458   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
4459      change as we go along. */
4460   env->n_vregmap = bb->tyenv->types_used;
4461   env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4462   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4463
4464   /* and finally ... */
4465   env->chainingAllowed = chainingAllowed;
4466   env->hwcaps          = hwcaps_host;
4467   env->max_ga          = max_ga;
4468
4469   /* For each IR temporary, allocate a suitably-kinded virtual
4470      register. */
4471   j = 0;
4472   for (i = 0; i < env->n_vregmap; i++) {
4473      hregHI = hreg = INVALID_HREG;
4474      switch (bb->tyenv->types[i]) {
4475         case Ity_I1:
4476         case Ity_I8:
4477         case Ity_I16:
4478         case Ity_I32:  hreg   = mkHReg(True, HRcInt32,  0, j++); break;
4479         case Ity_I64:  hreg   = mkHReg(True, HRcInt32,  0, j++);
4480                        hregHI = mkHReg(True, HRcInt32,  0, j++); break;
4481         case Ity_F32:
4482         case Ity_F64:  hreg   = mkHReg(True, HRcFlt64,  0, j++); break;
4483         case Ity_V128: hreg   = mkHReg(True, HRcVec128, 0, j++); break;
4484         default: ppIRType(bb->tyenv->types[i]);
4485                  vpanic("iselBB: IRTemp type");
4486      }
4487      env->vregmap[i]   = hreg;
4488      env->vregmapHI[i] = hregHI;
4489   }
4490   env->vreg_ctr = j;
4491
4492   /* The very first instruction must be an event check. */
4493   amCounter  = X86AMode_IR(offs_Host_EvC_Counter,  hregX86_EBP());
4494   amFailAddr = X86AMode_IR(offs_Host_EvC_FailAddr, hregX86_EBP());
4495   addInstr(env, X86Instr_EvCheck(amCounter, amFailAddr));
4496
4497   /* Possibly a block counter increment (for profiling).  At this
4498      point we don't know the address of the counter, so just pretend
4499      it is zero.  It will have to be patched later, but before this
4500      translation is used, by a call to LibVEX_patchProfCtr. */
4501   if (addProfInc) {
4502      addInstr(env, X86Instr_ProfInc());
4503   }
4504
4505   /* Ok, finally we can iterate over the statements. */
4506   for (i = 0; i < bb->stmts_used; i++)
4507      iselStmt(env, bb->stmts[i]);
4508
4509   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
4510
4511   /* record the number of vregs we used. */
4512   env->code->n_vregs = env->vreg_ctr;
4513   return env->code;
4514}
4515
4516
4517/*---------------------------------------------------------------*/
4518/*--- end                                     host_x86_isel.c ---*/
4519/*---------------------------------------------------------------*/
4520