1
2/*---------------------------------------------------------------*/
3/*--- begin                                   host_x86_isel.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2012 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_ir.h"
38#include "libvex.h"
39
40#include "ir_match.h"
41#include "main_util.h"
42#include "main_globals.h"
43#include "host_generic_regs.h"
44#include "host_generic_simd64.h"
45#include "host_generic_simd128.h"
46#include "host_x86_defs.h"
47
48/* TODO 21 Apr 2005:
49
50   -- (Really an assembler issue) don't emit CMov32 as a cmov
51      insn, since that's expensive on P4 and conditional branch
52      is cheaper if (as we expect) the condition is highly predictable
53
54   -- preserve xmm registers across function calls (by declaring them
55      as trashed by call insns)
56
57   -- preserve x87 ST stack discipline across function calls.  Sigh.
58
59   -- Check doHelperCall: if a call is conditional, we cannot safely
60      compute any regparm args directly to registers.  Hence, the
61      fast-regparm marshalling should be restricted to unconditional
62      calls only.
63*/
64
65/*---------------------------------------------------------*/
66/*--- x87 control word stuff                            ---*/
67/*---------------------------------------------------------*/
68
69/* Vex-generated code expects to run with the FPU set as follows: all
70   exceptions masked, round-to-nearest, precision = 53 bits.  This
71   corresponds to a FPU control word value of 0x027F.
72
73   Similarly the SSE control word (%mxcsr) should be 0x1F80.
74
75   %fpucw and %mxcsr should have these values on entry to
76   Vex-generated code, and should those values should be
77   unchanged at exit.
78*/
79
80#define DEFAULT_FPUCW 0x027F
81
82/* debugging only, do not use */
83/* define DEFAULT_FPUCW 0x037F */
84
85
86/*---------------------------------------------------------*/
87/*--- misc helpers                                      ---*/
88/*---------------------------------------------------------*/
89
90/* These are duplicated in guest-x86/toIR.c */
91static IRExpr* unop ( IROp op, IRExpr* a )
92{
93   return IRExpr_Unop(op, a);
94}
95
96static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
97{
98   return IRExpr_Binop(op, a1, a2);
99}
100
101static IRExpr* bind ( Int binder )
102{
103   return IRExpr_Binder(binder);
104}
105
106static Bool isZeroU8 ( IRExpr* e )
107{
108   return e->tag == Iex_Const
109          && e->Iex.Const.con->tag == Ico_U8
110          && e->Iex.Const.con->Ico.U8 == 0;
111}
112
113static Bool isZeroU32 ( IRExpr* e )
114{
115   return e->tag == Iex_Const
116          && e->Iex.Const.con->tag == Ico_U32
117          && e->Iex.Const.con->Ico.U32 == 0;
118}
119
120static Bool isZeroU64 ( IRExpr* e )
121{
122   return e->tag == Iex_Const
123          && e->Iex.Const.con->tag == Ico_U64
124          && e->Iex.Const.con->Ico.U64 == 0ULL;
125}
126
127
128/*---------------------------------------------------------*/
129/*--- ISelEnv                                           ---*/
130/*---------------------------------------------------------*/
131
132/* This carries around:
133
134   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
135     might encounter.  This is computed before insn selection starts,
136     and does not change.
137
138   - A mapping from IRTemp to HReg.  This tells the insn selector
139     which virtual register(s) are associated with each IRTemp
140     temporary.  This is computed before insn selection starts, and
141     does not change.  We expect this mapping to map precisely the
142     same set of IRTemps as the type mapping does.
143
144        - vregmap   holds the primary register for the IRTemp.
145        - vregmapHI is only used for 64-bit integer-typed
146             IRTemps.  It holds the identity of a second
147             32-bit virtual HReg, which holds the high half
148             of the value.
149
150   - The code array, that is, the insns selected so far.
151
152   - A counter, for generating new virtual registers.
153
154   - The host subarchitecture we are selecting insns for.
155     This is set at the start and does not change.
156
157   - A Bool for indicating whether we may generate chain-me
158     instructions for control flow transfers, or whether we must use
159     XAssisted.
160
161   - The maximum guest address of any guest insn in this block.
162     Actually, the address of the highest-addressed byte from any insn
163     in this block.  Is set at the start and does not change.  This is
164     used for detecting jumps which are definitely forward-edges from
165     this block, and therefore can be made (chained) to the fast entry
166     point of the destination, thereby avoiding the destination's
167     event check.
168
169   Note, this is all (well, mostly) host-independent.
170*/
171
172typedef
173   struct {
174      /* Constant -- are set at the start and do not change. */
175      IRTypeEnv*   type_env;
176
177      HReg*        vregmap;
178      HReg*        vregmapHI;
179      Int          n_vregmap;
180
181      UInt         hwcaps;
182
183      Bool         chainingAllowed;
184      Addr64       max_ga;
185
186      /* These are modified as we go along. */
187      HInstrArray* code;
188      Int          vreg_ctr;
189   }
190   ISelEnv;
191
192
193static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
194{
195   vassert(tmp >= 0);
196   vassert(tmp < env->n_vregmap);
197   return env->vregmap[tmp];
198}
199
200static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
201{
202   vassert(tmp >= 0);
203   vassert(tmp < env->n_vregmap);
204   vassert(env->vregmapHI[tmp] != INVALID_HREG);
205   *vrLO = env->vregmap[tmp];
206   *vrHI = env->vregmapHI[tmp];
207}
208
209static void addInstr ( ISelEnv* env, X86Instr* instr )
210{
211   addHInstr(env->code, instr);
212   if (vex_traceflags & VEX_TRACE_VCODE) {
213      ppX86Instr(instr, False);
214      vex_printf("\n");
215   }
216}
217
218static HReg newVRegI ( ISelEnv* env )
219{
220   HReg reg = mkHReg(env->vreg_ctr, HRcInt32, True/*virtual reg*/);
221   env->vreg_ctr++;
222   return reg;
223}
224
225static HReg newVRegF ( ISelEnv* env )
226{
227   HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
228   env->vreg_ctr++;
229   return reg;
230}
231
232static HReg newVRegV ( ISelEnv* env )
233{
234   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
235   env->vreg_ctr++;
236   return reg;
237}
238
239
240/*---------------------------------------------------------*/
241/*--- ISEL: Forward declarations                        ---*/
242/*---------------------------------------------------------*/
243
244/* These are organised as iselXXX and iselXXX_wrk pairs.  The
245   iselXXX_wrk do the real work, but are not to be called directly.
246   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
247   checks that all returned registers are virtual.  You should not
248   call the _wrk version directly.
249*/
250static X86RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
251static X86RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
252
253static X86RI*      iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
254static X86RI*      iselIntExpr_RI     ( ISelEnv* env, IRExpr* e );
255
256static X86RM*      iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
257static X86RM*      iselIntExpr_RM     ( ISelEnv* env, IRExpr* e );
258
259static HReg        iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
260static HReg        iselIntExpr_R     ( ISelEnv* env, IRExpr* e );
261
262static X86AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
263static X86AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
264
265static void        iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
266                                       ISelEnv* env, IRExpr* e );
267static void        iselInt64Expr     ( HReg* rHi, HReg* rLo,
268                                       ISelEnv* env, IRExpr* e );
269
270static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
271static X86CondCode iselCondCode     ( ISelEnv* env, IRExpr* e );
272
273static HReg        iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
274static HReg        iselDblExpr     ( ISelEnv* env, IRExpr* e );
275
276static HReg        iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
277static HReg        iselFltExpr     ( ISelEnv* env, IRExpr* e );
278
279static HReg        iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
280static HReg        iselVecExpr     ( ISelEnv* env, IRExpr* e );
281
282
283/*---------------------------------------------------------*/
284/*--- ISEL: Misc helpers                                ---*/
285/*---------------------------------------------------------*/
286
287/* Make a int reg-reg move. */
288
289static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
290{
291   vassert(hregClass(src) == HRcInt32);
292   vassert(hregClass(dst) == HRcInt32);
293   return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
294}
295
296
297/* Make a vector reg-reg move. */
298
299static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
300{
301   vassert(hregClass(src) == HRcVec128);
302   vassert(hregClass(dst) == HRcVec128);
303   return X86Instr_SseReRg(Xsse_MOV, src, dst);
304}
305
306/* Advance/retreat %esp by n. */
307
308static void add_to_esp ( ISelEnv* env, Int n )
309{
310   vassert(n > 0 && n < 256 && (n%4) == 0);
311   addInstr(env,
312            X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
313}
314
315static void sub_from_esp ( ISelEnv* env, Int n )
316{
317   vassert(n > 0 && n < 256 && (n%4) == 0);
318   addInstr(env,
319            X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
320}
321
322
323/* Given an amode, return one which references 4 bytes further
324   along. */
325
326static X86AMode* advance4 ( X86AMode* am )
327{
328   X86AMode* am4 = dopyX86AMode(am);
329   switch (am4->tag) {
330      case Xam_IRRS:
331         am4->Xam.IRRS.imm += 4; break;
332      case Xam_IR:
333         am4->Xam.IR.imm += 4; break;
334      default:
335         vpanic("advance4(x86,host)");
336   }
337   return am4;
338}
339
340
341/* Push an arg onto the host stack, in preparation for a call to a
342   helper function of some kind.  Returns the number of 32-bit words
343   pushed. */
344
345static Int pushArg ( ISelEnv* env, IRExpr* arg )
346{
347   IRType arg_ty = typeOfIRExpr(env->type_env, arg);
348   if (arg_ty == Ity_I32) {
349      addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
350      return 1;
351   } else
352   if (arg_ty == Ity_I64) {
353      HReg rHi, rLo;
354      iselInt64Expr(&rHi, &rLo, env, arg);
355      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
356      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
357      return 2;
358   }
359   ppIRExpr(arg);
360   vpanic("pushArg(x86): can't handle arg of this type");
361}
362
363
364/* Complete the call to a helper function, by calling the
365   helper and clearing the args off the stack. */
366
367static
368void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
369                              IRCallee* cee, Int n_arg_ws )
370{
371   /* Complication.  Need to decide which reg to use as the fn address
372      pointer, in a way that doesn't trash regparm-passed
373      parameters. */
374   vassert(sizeof(void*) == 4);
375
376   addInstr(env, X86Instr_Call( cc, toUInt(Ptr_to_ULong(cee->addr)),
377                                    cee->regparms));
378   if (n_arg_ws > 0)
379      add_to_esp(env, 4*n_arg_ws);
380}
381
382
383/* Used only in doHelperCall.  See big comment in doHelperCall re
384   handling of regparm args.  This function figures out whether
385   evaluation of an expression might require use of a fixed register.
386   If in doubt return True (safe but suboptimal).
387*/
388static
389Bool mightRequireFixedRegs ( IRExpr* e )
390{
391   switch (e->tag) {
392      case Iex_RdTmp: case Iex_Const: case Iex_Get:
393         return False;
394      default:
395         return True;
396   }
397}
398
399
400/* Do a complete function call.  guard is a Ity_Bit expression
401   indicating whether or not the call happens.  If guard==NULL, the
402   call is unconditional. */
403
404static
405void doHelperCall ( ISelEnv* env,
406                    Bool passBBP,
407                    IRExpr* guard, IRCallee* cee, IRExpr** args )
408{
409   X86CondCode cc;
410   HReg        argregs[3];
411   HReg        tmpregs[3];
412   Bool        danger;
413   Int         not_done_yet, n_args, n_arg_ws, stack_limit,
414               i, argreg, argregX;
415
416   /* Marshal args for a call, do the call, and clear the stack.
417      Complexities to consider:
418
419      * if passBBP is True, %ebp (the baseblock pointer) is to be
420        passed as the first arg.
421
422      * If the callee claims regparmness of 1, 2 or 3, we must pass the
423        first 1, 2 or 3 args in registers (EAX, EDX, and ECX
424        respectively).  To keep things relatively simple, only args of
425        type I32 may be passed as regparms -- just bomb out if anything
426        else turns up.  Clearly this depends on the front ends not
427        trying to pass any other types as regparms.
428   */
429
430   /* 16 Nov 2004: the regparm handling is complicated by the
431      following problem.
432
433      Consider a call two a function with two regparm parameters:
434      f(e1,e2).  We need to compute e1 into %eax and e2 into %edx.
435      Suppose code is first generated to compute e1 into %eax.  Then,
436      code is generated to compute e2 into %edx.  Unfortunately, if
437      the latter code sequence uses %eax, it will trash the value of
438      e1 computed by the former sequence.  This could happen if (for
439      example) e2 itself involved a function call.  In the code below,
440      args are evaluated right-to-left, not left-to-right, but the
441      principle and the problem are the same.
442
443      One solution is to compute all regparm-bound args into vregs
444      first, and once they are all done, move them to the relevant
445      real regs.  This always gives correct code, but it also gives
446      a bunch of vreg-to-rreg moves which are usually redundant but
447      are hard for the register allocator to get rid of.
448
449      A compromise is to first examine all regparm'd argument
450      expressions.  If they are all so simple that it is clear
451      they will be evaluated without use of any fixed registers,
452      use the old compute-directly-to-fixed-target scheme.  If not,
453      be safe and use the via-vregs scheme.
454
455      Note this requires being able to examine an expression and
456      determine whether or not evaluation of it might use a fixed
457      register.  That requires knowledge of how the rest of this
458      insn selector works.  Currently just the following 3 are
459      regarded as safe -- hopefully they cover the majority of
460      arguments in practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
461   */
462   vassert(cee->regparms >= 0 && cee->regparms <= 3);
463
464   n_args = n_arg_ws = 0;
465   while (args[n_args]) n_args++;
466
467   not_done_yet = n_args;
468   if (passBBP)
469      not_done_yet++;
470
471   stack_limit = cee->regparms;
472   if (cee->regparms > 0 && passBBP) stack_limit--;
473
474   /* ------ BEGIN marshall all arguments ------ */
475
476   /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
477   for (i = n_args-1; i >= stack_limit; i--) {
478      n_arg_ws += pushArg(env, args[i]);
479      not_done_yet--;
480   }
481
482   /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
483      registers. */
484
485   if (cee->regparms > 0) {
486
487      /* ------ BEGIN deal with regparms ------ */
488
489      /* deal with regparms, not forgetting %ebp if needed. */
490      argregs[0] = hregX86_EAX();
491      argregs[1] = hregX86_EDX();
492      argregs[2] = hregX86_ECX();
493      tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;
494
495      argreg = cee->regparms;
496
497      /* In keeping with big comment above, detect potential danger
498         and use the via-vregs scheme if needed. */
499      danger = False;
500      for (i = stack_limit-1; i >= 0; i--) {
501         if (mightRequireFixedRegs(args[i])) {
502            danger = True;
503            break;
504         }
505      }
506
507      if (danger) {
508
509         /* Move via temporaries */
510         argregX = argreg;
511         for (i = stack_limit-1; i >= 0; i--) {
512
513            if (0) {
514               vex_printf("x86 host: register param is complex: ");
515               ppIRExpr(args[i]);
516               vex_printf("\n");
517            }
518
519            argreg--;
520            vassert(argreg >= 0);
521            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
522            tmpregs[argreg] = iselIntExpr_R(env, args[i]);
523            not_done_yet--;
524         }
525         for (i = stack_limit-1; i >= 0; i--) {
526            argregX--;
527            vassert(argregX >= 0);
528            addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
529         }
530
531      } else {
532         /* It's safe to compute all regparm args directly into their
533            target registers. */
534         for (i = stack_limit-1; i >= 0; i--) {
535            argreg--;
536            vassert(argreg >= 0);
537            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
538            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
539                                          iselIntExpr_RMI(env, args[i]),
540                                          argregs[argreg]));
541            not_done_yet--;
542         }
543
544      }
545
546      /* Not forgetting %ebp if needed. */
547      if (passBBP) {
548         vassert(argreg == 1);
549         addInstr(env, mk_iMOVsd_RR( hregX86_EBP(), argregs[0]));
550         not_done_yet--;
551      }
552
553      /* ------ END deal with regparms ------ */
554
555   } else {
556
557      /* No regparms.  Heave %ebp on the stack if needed. */
558      if (passBBP) {
559         addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
560         n_arg_ws++;
561         not_done_yet--;
562      }
563
564   }
565
566   vassert(not_done_yet == 0);
567
568   /* ------ END marshall all arguments ------ */
569
570   /* Now we can compute the condition.  We can't do it earlier
571      because the argument computations could trash the condition
572      codes.  Be a bit clever to handle the common case where the
573      guard is 1:Bit. */
574   cc = Xcc_ALWAYS;
575   if (guard) {
576      if (guard->tag == Iex_Const
577          && guard->Iex.Const.con->tag == Ico_U1
578          && guard->Iex.Const.con->Ico.U1 == True) {
579         /* unconditional -- do nothing */
580      } else {
581         cc = iselCondCode( env, guard );
582      }
583   }
584
585   /* call the helper, and get the args off the stack afterwards. */
586   callHelperAndClearArgs( env, cc, cee, n_arg_ws );
587}
588
589
590/* Given a guest-state array descriptor, an index expression and a
591   bias, generate an X86AMode holding the relevant guest state
592   offset. */
593
594static
595X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
596                                IRExpr* off, Int bias )
597{
598   HReg tmp, roff;
599   Int  elemSz = sizeofIRType(descr->elemTy);
600   Int  nElems = descr->nElems;
601   Int  shift  = 0;
602
603   /* throw out any cases not generated by an x86 front end.  In
604      theory there might be a day where we need to handle them -- if
605      we ever run non-x86-guest on x86 host. */
606
607   if (nElems != 8)
608      vpanic("genGuestArrayOffset(x86 host)(1)");
609
610   switch (elemSz) {
611      case 1:  shift = 0; break;
612      case 4:  shift = 2; break;
613      case 8:  shift = 3; break;
614      default: vpanic("genGuestArrayOffset(x86 host)(2)");
615   }
616
617   /* Compute off into a reg, %off.  Then return:
618
619         movl %off, %tmp
620         addl $bias, %tmp  (if bias != 0)
621         andl %tmp, 7
622         ... base(%ebp, %tmp, shift) ...
623   */
624   tmp  = newVRegI(env);
625   roff = iselIntExpr_R(env, off);
626   addInstr(env, mk_iMOVsd_RR(roff, tmp));
627   if (bias != 0) {
628      addInstr(env,
629               X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
630   }
631   addInstr(env,
632            X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
633   return
634      X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
635}
636
637
638/* Mess with the FPU's rounding mode: set to the default rounding mode
639   (DEFAULT_FPUCW). */
640static
641void set_FPU_rounding_default ( ISelEnv* env )
642{
643   /* pushl $DEFAULT_FPUCW
644      fldcw 0(%esp)
645      addl $4, %esp
646   */
647   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
648   addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
649   addInstr(env, X86Instr_FpLdCW(zero_esp));
650   add_to_esp(env, 4);
651}
652
653
654/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
655   expression denoting a value in the range 0 .. 3, indicating a round
656   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
657   the same rounding.
658*/
659static
660void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
661{
662   HReg rrm  = iselIntExpr_R(env, mode);
663   HReg rrm2 = newVRegI(env);
664   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
665
666   /* movl  %rrm, %rrm2
667      andl  $3, %rrm2   -- shouldn't be needed; paranoia
668      shll  $10, %rrm2
669      orl   $DEFAULT_FPUCW, %rrm2
670      pushl %rrm2
671      fldcw 0(%esp)
672      addl  $4, %esp
673   */
674   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
675   addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
676   addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
677   addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
678   addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
679   addInstr(env, X86Instr_FpLdCW(zero_esp));
680   add_to_esp(env, 4);
681}
682
683
684/* Generate !src into a new vector register, and be sure that the code
685   is SSE1 compatible.  Amazing that Intel doesn't offer a less crappy
686   way to do this.
687*/
688static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
689{
690   HReg dst = newVRegV(env);
691   /* Set dst to zero.  If dst contains a NaN then all hell might
692      break loose after the comparison.  So, first zero it. */
693   addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
694   /* And now make it all 1s ... */
695   addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
696   /* Finally, xor 'src' into it. */
697   addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
698   /* Doesn't that just totally suck? */
699   return dst;
700}
701
702
703/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
704   after most non-simple FPU operations (simple = +, -, *, / and
705   sqrt).
706
707   This could be done a lot more efficiently if needed, by loading
708   zero and adding it to the value to be rounded (fldz ; faddp?).
709*/
710static void roundToF64 ( ISelEnv* env, HReg reg )
711{
712   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
713   sub_from_esp(env, 8);
714   addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
715   addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
716   add_to_esp(env, 8);
717}
718
719
720/*---------------------------------------------------------*/
721/*--- ISEL: Integer expressions (32/16/8 bit)           ---*/
722/*---------------------------------------------------------*/
723
724/* Select insns for an integer-typed expression, and add them to the
725   code list.  Return a reg holding the result.  This reg will be a
726   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
727   want to modify it, ask for a new vreg, copy it in there, and modify
728   the copy.  The register allocator will do its best to map both
729   vregs to the same real register, so the copies will often disappear
730   later in the game.
731
732   This should handle expressions of 32, 16 and 8-bit type.  All
733   results are returned in a 32-bit register.  For 16- and 8-bit
734   expressions, the upper 16/24 bits are arbitrary, so you should mask
735   or sign extend partial values if necessary.
736*/
737
738static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
739{
740   HReg r = iselIntExpr_R_wrk(env, e);
741   /* sanity checks ... */
742#  if 0
743   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
744#  endif
745   vassert(hregClass(r) == HRcInt32);
746   vassert(hregIsVirtual(r));
747   return r;
748}
749
750/* DO NOT CALL THIS DIRECTLY ! */
751static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
752{
753   MatchInfo mi;
754
755   IRType ty = typeOfIRExpr(env->type_env,e);
756   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
757
758   switch (e->tag) {
759
760   /* --------- TEMP --------- */
761   case Iex_RdTmp: {
762      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
763   }
764
765   /* --------- LOAD --------- */
766   case Iex_Load: {
767      HReg dst = newVRegI(env);
768      X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
769
770      /* We can't handle big-endian loads, nor load-linked. */
771      if (e->Iex.Load.end != Iend_LE)
772         goto irreducible;
773
774      if (ty == Ity_I32) {
775         addInstr(env, X86Instr_Alu32R(Xalu_MOV,
776                                       X86RMI_Mem(amode), dst) );
777         return dst;
778      }
779      if (ty == Ity_I16) {
780         addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
781         return dst;
782      }
783      if (ty == Ity_I8) {
784         addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
785         return dst;
786      }
787      break;
788   }
789
790   /* --------- TERNARY OP --------- */
791   case Iex_Triop: {
792      IRTriop *triop = e->Iex.Triop.details;
793      /* C3210 flags following FPU partial remainder (fprem), both
794         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
795      if (triop->op == Iop_PRemC3210F64
796          || triop->op == Iop_PRem1C3210F64) {
797         HReg junk = newVRegF(env);
798         HReg dst  = newVRegI(env);
799         HReg srcL = iselDblExpr(env, triop->arg2);
800         HReg srcR = iselDblExpr(env, triop->arg3);
801         /* XXXROUNDINGFIXME */
802         /* set roundingmode here */
803         addInstr(env, X86Instr_FpBinary(
804                           e->Iex.Binop.op==Iop_PRemC3210F64
805                              ? Xfp_PREM : Xfp_PREM1,
806                           srcL,srcR,junk
807                 ));
808         /* The previous pseudo-insn will have left the FPU's C3210
809            flags set correctly.  So bag them. */
810         addInstr(env, X86Instr_FpStSW_AX());
811         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
812         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
813         return dst;
814      }
815
816      break;
817   }
818
819   /* --------- BINARY OP --------- */
820   case Iex_Binop: {
821      X86AluOp   aluOp;
822      X86ShiftOp shOp;
823
824      /* Pattern: Sub32(0,x) */
825      if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
826         HReg dst = newVRegI(env);
827         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
828         addInstr(env, mk_iMOVsd_RR(reg,dst));
829         addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
830         return dst;
831      }
832
833      /* Is it an addition or logical style op? */
834      switch (e->Iex.Binop.op) {
835         case Iop_Add8: case Iop_Add16: case Iop_Add32:
836            aluOp = Xalu_ADD; break;
837         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
838            aluOp = Xalu_SUB; break;
839         case Iop_And8: case Iop_And16: case Iop_And32:
840            aluOp = Xalu_AND; break;
841         case Iop_Or8: case Iop_Or16: case Iop_Or32:
842            aluOp = Xalu_OR; break;
843         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
844            aluOp = Xalu_XOR; break;
845         case Iop_Mul16: case Iop_Mul32:
846            aluOp = Xalu_MUL; break;
847         default:
848            aluOp = Xalu_INVALID; break;
849      }
850      /* For commutative ops we assume any literal
851         values are on the second operand. */
852      if (aluOp != Xalu_INVALID) {
853         HReg dst    = newVRegI(env);
854         HReg reg    = iselIntExpr_R(env, e->Iex.Binop.arg1);
855         X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
856         addInstr(env, mk_iMOVsd_RR(reg,dst));
857         addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
858         return dst;
859      }
860      /* Could do better here; forcing the first arg into a reg
861         isn't always clever.
862         -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
863                        LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
864                        t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
865            movl 0xFFFFFFA0(%vr41),%vr107
866            movl 0xFFFFFFA4(%vr41),%vr108
867            movl %vr107,%vr106
868            xorl %vr108,%vr106
869            movl 0xFFFFFFA8(%vr41),%vr109
870            movl %vr106,%vr105
871            andl %vr109,%vr105
872            movl 0xFFFFFFA0(%vr41),%vr110
873            movl %vr105,%vr104
874            xorl %vr110,%vr104
875            movl %vr104,%vr70
876      */
877
878      /* Perhaps a shift op? */
879      switch (e->Iex.Binop.op) {
880         case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
881            shOp = Xsh_SHL; break;
882         case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
883            shOp = Xsh_SHR; break;
884         case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
885            shOp = Xsh_SAR; break;
886         default:
887            shOp = Xsh_INVALID; break;
888      }
889      if (shOp != Xsh_INVALID) {
890         HReg dst = newVRegI(env);
891
892         /* regL = the value to be shifted */
893         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
894         addInstr(env, mk_iMOVsd_RR(regL,dst));
895
896         /* Do any necessary widening for 16/8 bit operands */
897         switch (e->Iex.Binop.op) {
898            case Iop_Shr8:
899               addInstr(env, X86Instr_Alu32R(
900                                Xalu_AND, X86RMI_Imm(0xFF), dst));
901               break;
902            case Iop_Shr16:
903               addInstr(env, X86Instr_Alu32R(
904                                Xalu_AND, X86RMI_Imm(0xFFFF), dst));
905               break;
906            case Iop_Sar8:
907               addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
908               addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
909               break;
910            case Iop_Sar16:
911               addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
912               addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
913               break;
914            default: break;
915         }
916
917         /* Now consider the shift amount.  If it's a literal, we
918            can do a much better job than the general case. */
919         if (e->Iex.Binop.arg2->tag == Iex_Const) {
920            /* assert that the IR is well-typed */
921            Int nshift;
922            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
923            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
924	    vassert(nshift >= 0);
925	    if (nshift > 0)
926               /* Can't allow nshift==0 since that means %cl */
927               addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
928         } else {
929            /* General case; we have to force the amount into %cl. */
930            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
931            addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
932            addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
933         }
934         return dst;
935      }
936
937      /* Handle misc other ops. */
938
939      if (e->Iex.Binop.op == Iop_Max32U) {
940         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
941         HReg dst  = newVRegI(env);
942         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
943         addInstr(env, mk_iMOVsd_RR(src1,dst));
944         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
945         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
946         return dst;
947      }
948
949      if (e->Iex.Binop.op == Iop_8HLto16) {
950         HReg hi8  = newVRegI(env);
951         HReg lo8  = newVRegI(env);
952         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
953         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
954         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
955         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
956         addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
957         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
958         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
959         return hi8;
960      }
961
962      if (e->Iex.Binop.op == Iop_16HLto32) {
963         HReg hi16  = newVRegI(env);
964         HReg lo16  = newVRegI(env);
965         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
966         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
967         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
968         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
969         addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
970         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
971         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
972         return hi16;
973      }
974
975      if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
976          || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
977         HReg a16   = newVRegI(env);
978         HReg b16   = newVRegI(env);
979         HReg a16s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
980         HReg b16s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
981         Int  shift = (e->Iex.Binop.op == Iop_MullS8
982                       || e->Iex.Binop.op == Iop_MullU8)
983                         ? 24 : 16;
984         X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
985                              || e->Iex.Binop.op == Iop_MullS16)
986                                ? Xsh_SAR : Xsh_SHR;
987
988         addInstr(env, mk_iMOVsd_RR(a16s, a16));
989         addInstr(env, mk_iMOVsd_RR(b16s, b16));
990         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
991         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
992         addInstr(env, X86Instr_Sh32(shr_op,  shift, a16));
993         addInstr(env, X86Instr_Sh32(shr_op,  shift, b16));
994         addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
995         return b16;
996      }
997
998      if (e->Iex.Binop.op == Iop_CmpF64) {
999         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1000         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1001         HReg dst = newVRegI(env);
1002         addInstr(env, X86Instr_FpCmp(fL,fR,dst));
1003         /* shift this right 8 bits so as to conform to CmpF64
1004            definition. */
1005         addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
1006         return dst;
1007      }
1008
1009      if (e->Iex.Binop.op == Iop_F64toI32S
1010          || e->Iex.Binop.op == Iop_F64toI16S) {
1011         Int  sz  = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
1012         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1013         HReg dst = newVRegI(env);
1014
1015         /* Used several times ... */
1016         X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1017
1018	 /* rf now holds the value to be converted, and rrm holds the
1019	    rounding mode value, encoded as per the IRRoundingMode
1020	    enum.  The first thing to do is set the FPU's rounding
1021	    mode accordingly. */
1022
1023         /* Create a space for the format conversion. */
1024         /* subl $4, %esp */
1025         sub_from_esp(env, 4);
1026
1027	 /* Set host rounding mode */
1028	 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
1029
1030         /* gistw/l %rf, 0(%esp) */
1031         addInstr(env, X86Instr_FpLdStI(False/*store*/,
1032                                        toUChar(sz), rf, zero_esp));
1033
1034         if (sz == 2) {
1035            /* movzwl 0(%esp), %dst */
1036            addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
1037         } else {
1038            /* movl 0(%esp), %dst */
1039            vassert(sz == 4);
1040            addInstr(env, X86Instr_Alu32R(
1041                             Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1042         }
1043
1044	 /* Restore default FPU rounding. */
1045         set_FPU_rounding_default( env );
1046
1047         /* addl $4, %esp */
1048	 add_to_esp(env, 4);
1049         return dst;
1050      }
1051
1052      break;
1053   }
1054
1055   /* --------- UNARY OP --------- */
1056   case Iex_Unop: {
1057
1058      /* 1Uto8(32to1(expr32)) */
1059      if (e->Iex.Unop.op == Iop_1Uto8) {
1060         DECLARE_PATTERN(p_32to1_then_1Uto8);
1061         DEFINE_PATTERN(p_32to1_then_1Uto8,
1062                        unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
1063         if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
1064            IRExpr* expr32 = mi.bindee[0];
1065            HReg dst = newVRegI(env);
1066            HReg src = iselIntExpr_R(env, expr32);
1067            addInstr(env, mk_iMOVsd_RR(src,dst) );
1068            addInstr(env, X86Instr_Alu32R(Xalu_AND,
1069                                          X86RMI_Imm(1), dst));
1070            return dst;
1071         }
1072      }
1073
1074      /* 8Uto32(LDle(expr32)) */
1075      if (e->Iex.Unop.op == Iop_8Uto32) {
1076         DECLARE_PATTERN(p_LDle8_then_8Uto32);
1077         DEFINE_PATTERN(p_LDle8_then_8Uto32,
1078                        unop(Iop_8Uto32,
1079                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1080         if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
1081            HReg dst = newVRegI(env);
1082            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1083            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1084            return dst;
1085         }
1086      }
1087
1088      /* 8Sto32(LDle(expr32)) */
1089      if (e->Iex.Unop.op == Iop_8Sto32) {
1090         DECLARE_PATTERN(p_LDle8_then_8Sto32);
1091         DEFINE_PATTERN(p_LDle8_then_8Sto32,
1092                        unop(Iop_8Sto32,
1093                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1094         if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
1095            HReg dst = newVRegI(env);
1096            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1097            addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
1098            return dst;
1099         }
1100      }
1101
1102      /* 16Uto32(LDle(expr32)) */
1103      if (e->Iex.Unop.op == Iop_16Uto32) {
1104         DECLARE_PATTERN(p_LDle16_then_16Uto32);
1105         DEFINE_PATTERN(p_LDle16_then_16Uto32,
1106                        unop(Iop_16Uto32,
1107                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1108         if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
1109            HReg dst = newVRegI(env);
1110            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1111            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1112            return dst;
1113         }
1114      }
1115
1116      /* 8Uto32(GET:I8) */
1117      if (e->Iex.Unop.op == Iop_8Uto32) {
1118         if (e->Iex.Unop.arg->tag == Iex_Get) {
1119            HReg      dst;
1120            X86AMode* amode;
1121            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
1122            dst = newVRegI(env);
1123            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1124                                hregX86_EBP());
1125            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1126            return dst;
1127         }
1128      }
1129
1130      /* 16to32(GET:I16) */
1131      if (e->Iex.Unop.op == Iop_16Uto32) {
1132         if (e->Iex.Unop.arg->tag == Iex_Get) {
1133            HReg      dst;
1134            X86AMode* amode;
1135            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
1136            dst = newVRegI(env);
1137            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1138                                hregX86_EBP());
1139            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1140            return dst;
1141         }
1142      }
1143
1144      switch (e->Iex.Unop.op) {
1145         case Iop_8Uto16:
1146         case Iop_8Uto32:
1147         case Iop_16Uto32: {
1148            HReg dst = newVRegI(env);
1149            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1150            UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
1151            addInstr(env, mk_iMOVsd_RR(src,dst) );
1152            addInstr(env, X86Instr_Alu32R(Xalu_AND,
1153                                          X86RMI_Imm(mask), dst));
1154            return dst;
1155         }
1156         case Iop_8Sto16:
1157         case Iop_8Sto32:
1158         case Iop_16Sto32: {
1159            HReg dst = newVRegI(env);
1160            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1161            UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
1162            addInstr(env, mk_iMOVsd_RR(src,dst) );
1163            addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
1164            addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
1165            return dst;
1166         }
1167	 case Iop_Not8:
1168	 case Iop_Not16:
1169         case Iop_Not32: {
1170            HReg dst = newVRegI(env);
1171            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1172            addInstr(env, mk_iMOVsd_RR(src,dst) );
1173            addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
1174            return dst;
1175         }
1176         case Iop_64HIto32: {
1177            HReg rHi, rLo;
1178            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1179            return rHi; /* and abandon rLo .. poor wee thing :-) */
1180         }
1181         case Iop_64to32: {
1182            HReg rHi, rLo;
1183            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1184            return rLo; /* similar stupid comment to the above ... */
1185         }
1186         case Iop_16HIto8:
1187         case Iop_32HIto16: {
1188            HReg dst  = newVRegI(env);
1189            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1190            Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
1191            addInstr(env, mk_iMOVsd_RR(src,dst) );
1192            addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
1193            return dst;
1194         }
1195         case Iop_1Uto32:
1196         case Iop_1Uto8: {
1197            HReg dst         = newVRegI(env);
1198            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1199            addInstr(env, X86Instr_Set32(cond,dst));
1200            return dst;
1201         }
1202         case Iop_1Sto8:
1203         case Iop_1Sto16:
1204         case Iop_1Sto32: {
1205            /* could do better than this, but for now ... */
1206            HReg dst         = newVRegI(env);
1207            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1208            addInstr(env, X86Instr_Set32(cond,dst));
1209            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
1210            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1211            return dst;
1212         }
1213         case Iop_Ctz32: {
1214            /* Count trailing zeroes, implemented by x86 'bsfl' */
1215            HReg dst = newVRegI(env);
1216            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1217            addInstr(env, X86Instr_Bsfr32(True,src,dst));
1218            return dst;
1219         }
1220         case Iop_Clz32: {
1221            /* Count leading zeroes.  Do 'bsrl' to establish the index
1222               of the highest set bit, and subtract that value from
1223               31. */
1224            HReg tmp = newVRegI(env);
1225            HReg dst = newVRegI(env);
1226            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1227            addInstr(env, X86Instr_Bsfr32(False,src,tmp));
1228            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
1229                                          X86RMI_Imm(31), dst));
1230            addInstr(env, X86Instr_Alu32R(Xalu_SUB,
1231                                          X86RMI_Reg(tmp), dst));
1232            return dst;
1233         }
1234
1235         case Iop_CmpwNEZ32: {
1236            HReg dst = newVRegI(env);
1237            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1238            addInstr(env, mk_iMOVsd_RR(src,dst));
1239            addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
1240            addInstr(env, X86Instr_Alu32R(Xalu_OR,
1241                                          X86RMI_Reg(src), dst));
1242            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1243            return dst;
1244         }
1245         case Iop_Left8:
1246         case Iop_Left16:
1247         case Iop_Left32: {
1248            HReg dst = newVRegI(env);
1249            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1250            addInstr(env, mk_iMOVsd_RR(src, dst));
1251            addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
1252            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
1253            return dst;
1254         }
1255
1256         case Iop_V128to32: {
1257            HReg      dst  = newVRegI(env);
1258            HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
1259            X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
1260            sub_from_esp(env, 16);
1261            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
1262            addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
1263            add_to_esp(env, 16);
1264            return dst;
1265         }
1266
1267         /* ReinterpF32asI32(e) */
1268         /* Given an IEEE754 single, produce an I32 with the same bit
1269            pattern.  Keep stack 8-aligned even though only using 4
1270            bytes. */
1271         case Iop_ReinterpF32asI32: {
1272            HReg rf   = iselFltExpr(env, e->Iex.Unop.arg);
1273            HReg dst  = newVRegI(env);
1274            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1275            /* paranoia */
1276            set_FPU_rounding_default(env);
1277            /* subl $8, %esp */
1278            sub_from_esp(env, 8);
1279            /* gstF %rf, 0(%esp) */
1280            addInstr(env,
1281                     X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
1282            /* movl 0(%esp), %dst */
1283            addInstr(env,
1284                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1285            /* addl $8, %esp */
1286            add_to_esp(env, 8);
1287            return dst;
1288         }
1289
1290         case Iop_16to8:
1291         case Iop_32to8:
1292         case Iop_32to16:
1293            /* These are no-ops. */
1294            return iselIntExpr_R(env, e->Iex.Unop.arg);
1295
1296         default:
1297            break;
1298      }
1299      break;
1300   }
1301
1302   /* --------- GET --------- */
1303   case Iex_Get: {
1304      if (ty == Ity_I32) {
1305         HReg dst = newVRegI(env);
1306         addInstr(env, X86Instr_Alu32R(
1307                          Xalu_MOV,
1308                          X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1309                                                 hregX86_EBP())),
1310                          dst));
1311         return dst;
1312      }
1313      if (ty == Ity_I8 || ty == Ity_I16) {
1314         HReg dst = newVRegI(env);
1315         addInstr(env, X86Instr_LoadEX(
1316                          toUChar(ty==Ity_I8 ? 1 : 2),
1317                          False,
1318                          X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
1319                          dst));
1320         return dst;
1321      }
1322      break;
1323   }
1324
1325   case Iex_GetI: {
1326      X86AMode* am
1327         = genGuestArrayOffset(
1328              env, e->Iex.GetI.descr,
1329                   e->Iex.GetI.ix, e->Iex.GetI.bias );
1330      HReg dst = newVRegI(env);
1331      if (ty == Ity_I8) {
1332         addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
1333         return dst;
1334      }
1335      if (ty == Ity_I32) {
1336         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
1337         return dst;
1338      }
1339      break;
1340   }
1341
1342   /* --------- CCALL --------- */
1343   case Iex_CCall: {
1344      HReg    dst = newVRegI(env);
1345      vassert(ty == e->Iex.CCall.retty);
1346
1347      /* be very restrictive for now.  Only 32/64-bit ints allowed
1348         for args, and 32 bits for return type. */
1349      if (e->Iex.CCall.retty != Ity_I32)
1350         goto irreducible;
1351
1352      /* Marshal args, do the call, clear stack. */
1353      doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
1354
1355      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1356      return dst;
1357   }
1358
1359   /* --------- LITERAL --------- */
1360   /* 32/16/8-bit literals */
1361   case Iex_Const: {
1362      X86RMI* rmi = iselIntExpr_RMI ( env, e );
1363      HReg    r   = newVRegI(env);
1364      addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
1365      return r;
1366   }
1367
1368   /* --------- MULTIPLEX --------- */
1369   case Iex_Mux0X: {
1370     if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1371         && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
1372        X86RM* r8;
1373        HReg   rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
1374        X86RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
1375        HReg   dst = newVRegI(env);
1376        addInstr(env, mk_iMOVsd_RR(rX,dst));
1377        r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
1378        addInstr(env, X86Instr_Test32(0xFF, r8));
1379        addInstr(env, X86Instr_CMov32(Xcc_Z,r0,dst));
1380        return dst;
1381      }
1382      break;
1383   }
1384
1385   default:
1386   break;
1387   } /* switch (e->tag) */
1388
1389   /* We get here if no pattern matched. */
1390  irreducible:
1391   ppIRExpr(e);
1392   vpanic("iselIntExpr_R: cannot reduce tree");
1393}
1394
1395
1396/*---------------------------------------------------------*/
1397/*--- ISEL: Integer expression auxiliaries              ---*/
1398/*---------------------------------------------------------*/
1399
1400/* --------------------- AMODEs --------------------- */
1401
1402/* Return an AMode which computes the value of the specified
1403   expression, possibly also adding insns to the code list as a
1404   result.  The expression may only be a 32-bit one.
1405*/
1406
1407static Bool sane_AMode ( X86AMode* am )
1408{
1409   switch (am->tag) {
1410      case Xam_IR:
1411         return
1412            toBool( hregClass(am->Xam.IR.reg) == HRcInt32
1413                    && (hregIsVirtual(am->Xam.IR.reg)
1414                        || am->Xam.IR.reg == hregX86_EBP()) );
1415      case Xam_IRRS:
1416         return
1417            toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
1418                    && hregIsVirtual(am->Xam.IRRS.base)
1419                    && hregClass(am->Xam.IRRS.index) == HRcInt32
1420                    && hregIsVirtual(am->Xam.IRRS.index) );
1421      default:
1422        vpanic("sane_AMode: unknown x86 amode tag");
1423   }
1424}
1425
1426static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
1427{
1428   X86AMode* am = iselIntExpr_AMode_wrk(env, e);
1429   vassert(sane_AMode(am));
1430   return am;
1431}
1432
1433/* DO NOT CALL THIS DIRECTLY ! */
1434static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
1435{
1436   IRType ty = typeOfIRExpr(env->type_env,e);
1437   vassert(ty == Ity_I32);
1438
1439   /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
1440   if (e->tag == Iex_Binop
1441       && e->Iex.Binop.op == Iop_Add32
1442       && e->Iex.Binop.arg2->tag == Iex_Const
1443       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
1444       && e->Iex.Binop.arg1->tag == Iex_Binop
1445       && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
1446       && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
1447       && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1448       && e->Iex.Binop.arg1
1449           ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1450       && e->Iex.Binop.arg1
1451           ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1452      UInt shift = e->Iex.Binop.arg1
1453                    ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1454      UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
1455      if (shift == 1 || shift == 2 || shift == 3) {
1456         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
1457         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
1458                                       ->Iex.Binop.arg2->Iex.Binop.arg1 );
1459         return X86AMode_IRRS(imm32, r1, r2, shift);
1460      }
1461   }
1462
1463   /* Add32(expr1, Shl32(expr2, imm)) */
1464   if (e->tag == Iex_Binop
1465       && e->Iex.Binop.op == Iop_Add32
1466       && e->Iex.Binop.arg2->tag == Iex_Binop
1467       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1468       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1469       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1470      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1471      if (shift == 1 || shift == 2 || shift == 3) {
1472         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1473         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1474         return X86AMode_IRRS(0, r1, r2, shift);
1475      }
1476   }
1477
1478   /* Add32(expr,i) */
1479   if (e->tag == Iex_Binop
1480       && e->Iex.Binop.op == Iop_Add32
1481       && e->Iex.Binop.arg2->tag == Iex_Const
1482       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
1483      HReg r1 = iselIntExpr_R(env,  e->Iex.Binop.arg1);
1484      return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
1485   }
1486
1487   /* Doesn't match anything in particular.  Generate it into
1488      a register and use that. */
1489   {
1490      HReg r1 = iselIntExpr_R(env, e);
1491      return X86AMode_IR(0, r1);
1492   }
1493}
1494
1495
1496/* --------------------- RMIs --------------------- */
1497
1498/* Similarly, calculate an expression into an X86RMI operand.  As with
1499   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1500
1501static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
1502{
1503   X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
1504   /* sanity checks ... */
1505   switch (rmi->tag) {
1506      case Xrmi_Imm:
1507         return rmi;
1508      case Xrmi_Reg:
1509         vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
1510         vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
1511         return rmi;
1512      case Xrmi_Mem:
1513         vassert(sane_AMode(rmi->Xrmi.Mem.am));
1514         return rmi;
1515      default:
1516         vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
1517   }
1518}
1519
1520/* DO NOT CALL THIS DIRECTLY ! */
1521static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
1522{
1523   IRType ty = typeOfIRExpr(env->type_env,e);
1524   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1525
1526   /* special case: immediate */
1527   if (e->tag == Iex_Const) {
1528      UInt u;
1529      switch (e->Iex.Const.con->tag) {
1530         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1531         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1532         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
1533         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
1534      }
1535      return X86RMI_Imm(u);
1536   }
1537
1538   /* special case: 32-bit GET */
1539   if (e->tag == Iex_Get && ty == Ity_I32) {
1540      return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1541                                    hregX86_EBP()));
1542   }
1543
1544   /* special case: 32-bit load from memory */
1545   if (e->tag == Iex_Load && ty == Ity_I32
1546       && e->Iex.Load.end == Iend_LE) {
1547      X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
1548      return X86RMI_Mem(am);
1549   }
1550
1551   /* default case: calculate into a register and return that */
1552   {
1553      HReg r = iselIntExpr_R ( env, e );
1554      return X86RMI_Reg(r);
1555   }
1556}
1557
1558
1559/* --------------------- RIs --------------------- */
1560
1561/* Calculate an expression into an X86RI operand.  As with
1562   iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
1563
1564static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
1565{
1566   X86RI* ri = iselIntExpr_RI_wrk(env, e);
1567   /* sanity checks ... */
1568   switch (ri->tag) {
1569      case Xri_Imm:
1570         return ri;
1571      case Xri_Reg:
1572         vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
1573         vassert(hregIsVirtual(ri->Xri.Reg.reg));
1574         return ri;
1575      default:
1576         vpanic("iselIntExpr_RI: unknown x86 RI tag");
1577   }
1578}
1579
1580/* DO NOT CALL THIS DIRECTLY ! */
1581static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
1582{
1583   IRType ty = typeOfIRExpr(env->type_env,e);
1584   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1585
1586   /* special case: immediate */
1587   if (e->tag == Iex_Const) {
1588      UInt u;
1589      switch (e->Iex.Const.con->tag) {
1590         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1591         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1592         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
1593         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
1594      }
1595      return X86RI_Imm(u);
1596   }
1597
1598   /* default case: calculate into a register and return that */
1599   {
1600      HReg r = iselIntExpr_R ( env, e );
1601      return X86RI_Reg(r);
1602   }
1603}
1604
1605
1606/* --------------------- RMs --------------------- */
1607
1608/* Similarly, calculate an expression into an X86RM operand.  As with
1609   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1610
1611static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
1612{
1613   X86RM* rm = iselIntExpr_RM_wrk(env, e);
1614   /* sanity checks ... */
1615   switch (rm->tag) {
1616      case Xrm_Reg:
1617         vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
1618         vassert(hregIsVirtual(rm->Xrm.Reg.reg));
1619         return rm;
1620      case Xrm_Mem:
1621         vassert(sane_AMode(rm->Xrm.Mem.am));
1622         return rm;
1623      default:
1624         vpanic("iselIntExpr_RM: unknown x86 RM tag");
1625   }
1626}
1627
1628/* DO NOT CALL THIS DIRECTLY ! */
1629static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
1630{
1631   IRType ty = typeOfIRExpr(env->type_env,e);
1632   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1633
1634   /* special case: 32-bit GET */
1635   if (e->tag == Iex_Get && ty == Ity_I32) {
1636      return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
1637                                   hregX86_EBP()));
1638   }
1639
1640   /* special case: load from memory */
1641
1642   /* default case: calculate into a register and return that */
1643   {
1644      HReg r = iselIntExpr_R ( env, e );
1645      return X86RM_Reg(r);
1646   }
1647}
1648
1649
1650/* --------------------- CONDCODE --------------------- */
1651
1652/* Generate code to evaluated a bit-typed expression, returning the
1653   condition code which would correspond when the expression would
1654   notionally have returned 1. */
1655
1656static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
1657{
1658   /* Uh, there's nothing we can sanity check here, unfortunately. */
1659   return iselCondCode_wrk(env,e);
1660}
1661
1662/* DO NOT CALL THIS DIRECTLY ! */
1663static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
1664{
1665   MatchInfo mi;
1666
1667   vassert(e);
1668   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
1669
1670   /* var */
1671   if (e->tag == Iex_RdTmp) {
1672      HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
1673      /* Test32 doesn't modify r32; so this is OK. */
1674      addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
1675      return Xcc_NZ;
1676   }
1677
1678   /* Constant 1:Bit */
1679   if (e->tag == Iex_Const) {
1680      HReg r;
1681      vassert(e->Iex.Const.con->tag == Ico_U1);
1682      vassert(e->Iex.Const.con->Ico.U1 == True
1683              || e->Iex.Const.con->Ico.U1 == False);
1684      r = newVRegI(env);
1685      addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
1686      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
1687      return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
1688   }
1689
1690   /* Not1(e) */
1691   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
1692      /* Generate code for the arg, and negate the test condition */
1693      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
1694   }
1695
1696   /* --- patterns rooted at: 32to1 --- */
1697
1698   if (e->tag == Iex_Unop
1699       && e->Iex.Unop.op == Iop_32to1) {
1700      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1701      addInstr(env, X86Instr_Test32(1,rm));
1702      return Xcc_NZ;
1703   }
1704
1705   /* --- patterns rooted at: CmpNEZ8 --- */
1706
1707   /* CmpNEZ8(x) */
1708   if (e->tag == Iex_Unop
1709       && e->Iex.Unop.op == Iop_CmpNEZ8) {
1710      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1711      addInstr(env, X86Instr_Test32(0xFF,rm));
1712      return Xcc_NZ;
1713   }
1714
1715   /* --- patterns rooted at: CmpNEZ16 --- */
1716
1717   /* CmpNEZ16(x) */
1718   if (e->tag == Iex_Unop
1719       && e->Iex.Unop.op == Iop_CmpNEZ16) {
1720      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1721      addInstr(env, X86Instr_Test32(0xFFFF,rm));
1722      return Xcc_NZ;
1723   }
1724
1725   /* --- patterns rooted at: CmpNEZ32 --- */
1726
1727   /* CmpNEZ32(And32(x,y)) */
1728   {
1729      DECLARE_PATTERN(p_CmpNEZ32_And32);
1730      DEFINE_PATTERN(p_CmpNEZ32_And32,
1731                     unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
1732      if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
1733         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1734         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1735         HReg    tmp  = newVRegI(env);
1736         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1737         addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
1738         return Xcc_NZ;
1739      }
1740   }
1741
1742   /* CmpNEZ32(Or32(x,y)) */
1743   {
1744      DECLARE_PATTERN(p_CmpNEZ32_Or32);
1745      DEFINE_PATTERN(p_CmpNEZ32_Or32,
1746                     unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
1747      if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
1748         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
1749         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1750         HReg    tmp  = newVRegI(env);
1751         addInstr(env, mk_iMOVsd_RR(r0, tmp));
1752         addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
1753         return Xcc_NZ;
1754      }
1755   }
1756
1757   /* CmpNEZ32(GET(..):I32) */
1758   if (e->tag == Iex_Unop
1759       && e->Iex.Unop.op == Iop_CmpNEZ32
1760       && e->Iex.Unop.arg->tag == Iex_Get) {
1761      X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1762                                 hregX86_EBP());
1763      addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
1764      return Xcc_NZ;
1765   }
1766
1767   /* CmpNEZ32(x) */
1768   if (e->tag == Iex_Unop
1769       && e->Iex.Unop.op == Iop_CmpNEZ32) {
1770      HReg    r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
1771      X86RMI* rmi2 = X86RMI_Imm(0);
1772      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
1773      return Xcc_NZ;
1774   }
1775
1776   /* --- patterns rooted at: CmpNEZ64 --- */
1777
1778   /* CmpNEZ64(Or64(x,y)) */
1779   {
1780      DECLARE_PATTERN(p_CmpNEZ64_Or64);
1781      DEFINE_PATTERN(p_CmpNEZ64_Or64,
1782                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
1783      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
1784         HReg    hi1, lo1, hi2, lo2;
1785         HReg    tmp  = newVRegI(env);
1786         iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
1787         addInstr(env, mk_iMOVsd_RR(hi1, tmp));
1788         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
1789         iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
1790         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
1791         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
1792         return Xcc_NZ;
1793      }
1794   }
1795
1796   /* CmpNEZ64(x) */
1797   if (e->tag == Iex_Unop
1798       && e->Iex.Unop.op == Iop_CmpNEZ64) {
1799      HReg hi, lo;
1800      HReg tmp = newVRegI(env);
1801      iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
1802      addInstr(env, mk_iMOVsd_RR(hi, tmp));
1803      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
1804      return Xcc_NZ;
1805   }
1806
1807   /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */
1808
1809   /* CmpEQ8 / CmpNE8 */
1810   if (e->tag == Iex_Binop
1811       && (e->Iex.Binop.op == Iop_CmpEQ8
1812           || e->Iex.Binop.op == Iop_CmpNE8
1813           || e->Iex.Binop.op == Iop_CasCmpEQ8
1814           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
1815      if (isZeroU8(e->Iex.Binop.arg2)) {
1816         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1817         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
1818         switch (e->Iex.Binop.op) {
1819            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1820            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1821            default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
1822         }
1823      } else {
1824         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1825         X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1826         HReg    r    = newVRegI(env);
1827         addInstr(env, mk_iMOVsd_RR(r1,r));
1828         addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1829         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
1830         switch (e->Iex.Binop.op) {
1831            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1832            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1833            default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
1834         }
1835      }
1836   }
1837
1838   /* CmpEQ16 / CmpNE16 */
1839   if (e->tag == Iex_Binop
1840       && (e->Iex.Binop.op == Iop_CmpEQ16
1841           || e->Iex.Binop.op == Iop_CmpNE16
1842           || e->Iex.Binop.op == Iop_CasCmpEQ16
1843           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
1844      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1845      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1846      HReg    r    = newVRegI(env);
1847      addInstr(env, mk_iMOVsd_RR(r1,r));
1848      addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1849      addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
1850      switch (e->Iex.Binop.op) {
1851         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Xcc_Z;
1852         case Iop_CmpNE16: case Iop_CasCmpNE16: return Xcc_NZ;
1853         default: vpanic("iselCondCode(x86): CmpXX16");
1854      }
1855   }
1856
1857   /* CmpNE32(ccall, 32-bit constant) (--smc-check=all optimisation).
1858      Saves a "movl %eax, %tmp" compared to the default route. */
1859   if (e->tag == Iex_Binop
1860       && e->Iex.Binop.op == Iop_CmpNE32
1861       && e->Iex.Binop.arg1->tag == Iex_CCall
1862       && e->Iex.Binop.arg2->tag == Iex_Const) {
1863      IRExpr* cal = e->Iex.Binop.arg1;
1864      IRExpr* con = e->Iex.Binop.arg2;
1865      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
1866      vassert(cal->Iex.CCall.retty == Ity_I32); /* else ill-typed IR */
1867      vassert(con->Iex.Const.con->tag == Ico_U32);
1868      /* Marshal args, do the call. */
1869      doHelperCall( env, False, NULL, cal->Iex.CCall.cee, cal->Iex.CCall.args );
1870      addInstr(env, X86Instr_Alu32R(Xalu_CMP,
1871                                    X86RMI_Imm(con->Iex.Const.con->Ico.U32),
1872                                    hregX86_EAX()));
1873      return Xcc_NZ;
1874   }
1875
1876   /* Cmp*32*(x,y) */
1877   if (e->tag == Iex_Binop
1878       && (e->Iex.Binop.op == Iop_CmpEQ32
1879           || e->Iex.Binop.op == Iop_CmpNE32
1880           || e->Iex.Binop.op == Iop_CmpLT32S
1881           || e->Iex.Binop.op == Iop_CmpLT32U
1882           || e->Iex.Binop.op == Iop_CmpLE32S
1883           || e->Iex.Binop.op == Iop_CmpLE32U
1884           || e->Iex.Binop.op == Iop_CasCmpEQ32
1885           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
1886      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1887      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1888      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
1889      switch (e->Iex.Binop.op) {
1890         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
1891         case Iop_CmpNE32: case Iop_CasCmpNE32: return Xcc_NZ;
1892         case Iop_CmpLT32S: return Xcc_L;
1893         case Iop_CmpLT32U: return Xcc_B;
1894         case Iop_CmpLE32S: return Xcc_LE;
1895         case Iop_CmpLE32U: return Xcc_BE;
1896         default: vpanic("iselCondCode(x86): CmpXX32");
1897      }
1898   }
1899
1900   /* CmpNE64 */
1901   if (e->tag == Iex_Binop
1902       && (e->Iex.Binop.op == Iop_CmpNE64
1903           || e->Iex.Binop.op == Iop_CmpEQ64)) {
1904      HReg hi1, hi2, lo1, lo2;
1905      HReg tHi = newVRegI(env);
1906      HReg tLo = newVRegI(env);
1907      iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
1908      iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
1909      addInstr(env, mk_iMOVsd_RR(hi1, tHi));
1910      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
1911      addInstr(env, mk_iMOVsd_RR(lo1, tLo));
1912      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
1913      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
1914      switch (e->Iex.Binop.op) {
1915         case Iop_CmpNE64: return Xcc_NZ;
1916         case Iop_CmpEQ64: return Xcc_Z;
1917         default: vpanic("iselCondCode(x86): CmpXX64");
1918      }
1919   }
1920
1921   ppIRExpr(e);
1922   vpanic("iselCondCode");
1923}
1924
1925
1926/*---------------------------------------------------------*/
1927/*--- ISEL: Integer expressions (64 bit)                ---*/
1928/*---------------------------------------------------------*/
1929
1930/* Compute a 64-bit value into a register pair, which is returned as
1931   the first two parameters.  As with iselIntExpr_R, these may be
1932   either real or virtual regs; in any case they must not be changed
1933   by subsequent code emitted by the caller.  */
1934
1935static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
1936{
1937   iselInt64Expr_wrk(rHi, rLo, env, e);
1938#  if 0
1939   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
1940#  endif
1941   vassert(hregClass(*rHi) == HRcInt32);
1942   vassert(hregIsVirtual(*rHi));
1943   vassert(hregClass(*rLo) == HRcInt32);
1944   vassert(hregIsVirtual(*rLo));
1945}
1946
1947/* DO NOT CALL THIS DIRECTLY ! */
1948static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
1949{
1950   MatchInfo mi;
1951   HWord fn = 0; /* helper fn for most SIMD64 stuff */
1952   vassert(e);
1953   vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
1954
1955   /* 64-bit literal */
1956   if (e->tag == Iex_Const) {
1957      ULong w64 = e->Iex.Const.con->Ico.U64;
1958      UInt  wHi = toUInt(w64 >> 32);
1959      UInt  wLo = toUInt(w64);
1960      HReg  tLo = newVRegI(env);
1961      HReg  tHi = newVRegI(env);
1962      vassert(e->Iex.Const.con->tag == Ico_U64);
1963      if (wLo == wHi) {
1964         /* Save a precious Int register in this special case. */
1965         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
1966         *rHi = tLo;
1967         *rLo = tLo;
1968      } else {
1969         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
1970         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
1971         *rHi = tHi;
1972         *rLo = tLo;
1973      }
1974      return;
1975   }
1976
1977   /* read 64-bit IRTemp */
1978   if (e->tag == Iex_RdTmp) {
1979      lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
1980      return;
1981   }
1982
1983   /* 64-bit load */
1984   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
1985      HReg     tLo, tHi;
1986      X86AMode *am0, *am4;
1987      vassert(e->Iex.Load.ty == Ity_I64);
1988      tLo = newVRegI(env);
1989      tHi = newVRegI(env);
1990      am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
1991      am4 = advance4(am0);
1992      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
1993      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
1994      *rHi = tHi;
1995      *rLo = tLo;
1996      return;
1997   }
1998
1999   /* 64-bit GET */
2000   if (e->tag == Iex_Get) {
2001      X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
2002      X86AMode* am4 = advance4(am);
2003      HReg tLo = newVRegI(env);
2004      HReg tHi = newVRegI(env);
2005      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2006      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2007      *rHi = tHi;
2008      *rLo = tLo;
2009      return;
2010   }
2011
2012   /* 64-bit GETI */
2013   if (e->tag == Iex_GetI) {
2014      X86AMode* am
2015         = genGuestArrayOffset( env, e->Iex.GetI.descr,
2016                                     e->Iex.GetI.ix, e->Iex.GetI.bias );
2017      X86AMode* am4 = advance4(am);
2018      HReg tLo = newVRegI(env);
2019      HReg tHi = newVRegI(env);
2020      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2021      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2022      *rHi = tHi;
2023      *rLo = tLo;
2024      return;
2025   }
2026
2027   /* 64-bit Mux0X: Mux0X(g, expr, 0:I64) */
2028   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.exprX)) {
2029      X86RM* r8;
2030      HReg e0Lo, e0Hi;
2031      HReg tLo = newVRegI(env);
2032      HReg tHi = newVRegI(env);
2033      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2034      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
2035      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
2036      addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
2037      addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
2038      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
2039      addInstr(env, X86Instr_Test32(0xFF, r8));
2040      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tHi));
2041      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tLo));
2042      add_to_esp(env, 4);
2043      *rHi = tHi;
2044      *rLo = tLo;
2045      return;
2046   }
2047   /* 64-bit Mux0X: Mux0X(g, 0:I64, expr) */
2048   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.expr0)) {
2049      X86RM* r8;
2050      HReg e0Lo, e0Hi;
2051      HReg tLo = newVRegI(env);
2052      HReg tHi = newVRegI(env);
2053      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2054      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.exprX);
2055      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
2056      addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
2057      addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
2058      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
2059      addInstr(env, X86Instr_Test32(0xFF, r8));
2060      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tHi));
2061      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tLo));
2062      add_to_esp(env, 4);
2063      *rHi = tHi;
2064      *rLo = tLo;
2065      return;
2066   }
2067
2068   /* 64-bit Mux0X: Mux0X(g, expr, expr) */
2069   if (e->tag == Iex_Mux0X) {
2070      X86RM* r8;
2071      HReg e0Lo, e0Hi, eXLo, eXHi;
2072      HReg tLo = newVRegI(env);
2073      HReg tHi = newVRegI(env);
2074      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
2075      iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
2076      addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
2077      addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
2078      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
2079      addInstr(env, X86Instr_Test32(0xFF, r8));
2080      /* This assumes the first cmov32 doesn't trash the condition
2081         codes, so they are still available for the second cmov32 */
2082      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
2083      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
2084      *rHi = tHi;
2085      *rLo = tLo;
2086      return;
2087   }
2088
2089   /* --------- BINARY ops --------- */
2090   if (e->tag == Iex_Binop) {
2091      switch (e->Iex.Binop.op) {
2092         /* 32 x 32 -> 64 multiply */
2093         case Iop_MullU32:
2094         case Iop_MullS32: {
2095            /* get one operand into %eax, and the other into a R/M.
2096               Need to make an educated guess about which is better in
2097               which. */
2098            HReg   tLo    = newVRegI(env);
2099            HReg   tHi    = newVRegI(env);
2100            Bool   syned  = toBool(e->Iex.Binop.op == Iop_MullS32);
2101            X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2102            HReg   rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2103            addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
2104            addInstr(env, X86Instr_MulL(syned, rmLeft));
2105            /* Result is now in EDX:EAX.  Tell the caller. */
2106            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2107            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2108            *rHi = tHi;
2109            *rLo = tLo;
2110            return;
2111         }
2112
2113         /* 64 x 32 -> (32(rem),32(div)) division */
2114         case Iop_DivModU64to32:
2115         case Iop_DivModS64to32: {
2116            /* Get the 64-bit operand into edx:eax, and the other into
2117               any old R/M. */
2118            HReg sHi, sLo;
2119            HReg   tLo     = newVRegI(env);
2120            HReg   tHi     = newVRegI(env);
2121            Bool   syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
2122            X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2123            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2124            addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
2125            addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
2126            addInstr(env, X86Instr_Div(syned, rmRight));
2127            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2128            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2129            *rHi = tHi;
2130            *rLo = tLo;
2131            return;
2132         }
2133
2134         /* Or64/And64/Xor64 */
2135         case Iop_Or64:
2136         case Iop_And64:
2137         case Iop_Xor64: {
2138            HReg xLo, xHi, yLo, yHi;
2139            HReg tLo = newVRegI(env);
2140            HReg tHi = newVRegI(env);
2141            X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
2142                          : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
2143                          : Xalu_XOR;
2144            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2145            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2146            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2147            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
2148            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2149            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
2150            *rHi = tHi;
2151            *rLo = tLo;
2152            return;
2153         }
2154
2155         /* Add64/Sub64 */
2156         case Iop_Add64:
2157            if (e->Iex.Binop.arg2->tag == Iex_Const) {
2158               /* special case Add64(e, const) */
2159               ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
2160               UInt  wHi = toUInt(w64 >> 32);
2161               UInt  wLo = toUInt(w64);
2162               HReg  tLo = newVRegI(env);
2163               HReg  tHi = newVRegI(env);
2164               HReg  xLo, xHi;
2165               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
2166               iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2167               addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2168               addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2169               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
2170               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
2171               *rHi = tHi;
2172               *rLo = tLo;
2173               return;
2174            }
2175            /* else fall through to the generic case */
2176         case Iop_Sub64: {
2177            HReg xLo, xHi, yLo, yHi;
2178            HReg tLo = newVRegI(env);
2179            HReg tHi = newVRegI(env);
2180            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2181            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2182            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2183            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2184            if (e->Iex.Binop.op==Iop_Add64) {
2185               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
2186               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
2187            } else {
2188               addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2189               addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2190            }
2191            *rHi = tHi;
2192            *rLo = tLo;
2193            return;
2194         }
2195
2196         /* 32HLto64(e1,e2) */
2197         case Iop_32HLto64:
2198            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2199            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2200            return;
2201
2202         /* 64-bit shifts */
2203         case Iop_Shl64: {
2204            /* We use the same ingenious scheme as gcc.  Put the value
2205               to be shifted into %hi:%lo, and the shift amount into
2206               %cl.  Then (dsts on right, a la ATT syntax):
2207
2208               shldl %cl, %lo, %hi   -- make %hi be right for the
2209                                     -- shift amt %cl % 32
2210               shll  %cl, %lo        -- make %lo be right for the
2211                                     -- shift amt %cl % 32
2212
2213               Now, if (shift amount % 64) is in the range 32 .. 63,
2214               we have to do a fixup, which puts the result low half
2215               into the result high half, and zeroes the low half:
2216
2217               testl $32, %ecx
2218
2219               cmovnz %lo, %hi
2220               movl $0, %tmp         -- sigh; need yet another reg
2221               cmovnz %tmp, %lo
2222            */
2223            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2224            tLo = newVRegI(env);
2225            tHi = newVRegI(env);
2226            tTemp = newVRegI(env);
2227            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2228            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2229            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2230            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2231            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2232            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2233               and those regs are legitimately modifiable. */
2234            addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
2235            addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
2236            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2237            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
2238            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2239            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
2240            *rHi = tHi;
2241            *rLo = tLo;
2242            return;
2243         }
2244
2245         case Iop_Shr64: {
2246            /* We use the same ingenious scheme as gcc.  Put the value
2247               to be shifted into %hi:%lo, and the shift amount into
2248               %cl.  Then:
2249
2250               shrdl %cl, %hi, %lo   -- make %lo be right for the
2251                                     -- shift amt %cl % 32
2252               shrl  %cl, %hi        -- make %hi be right for the
2253                                     -- shift amt %cl % 32
2254
2255               Now, if (shift amount % 64) is in the range 32 .. 63,
2256               we have to do a fixup, which puts the result high half
2257               into the result low half, and zeroes the high half:
2258
2259               testl $32, %ecx
2260
2261               cmovnz %hi, %lo
2262               movl $0, %tmp         -- sigh; need yet another reg
2263               cmovnz %tmp, %hi
2264            */
2265            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2266            tLo = newVRegI(env);
2267            tHi = newVRegI(env);
2268            tTemp = newVRegI(env);
2269            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2270            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2271            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2272            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2273            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2274            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2275               and those regs are legitimately modifiable. */
2276            addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
2277            addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
2278            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2279            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
2280            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2281            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
2282            *rHi = tHi;
2283            *rLo = tLo;
2284            return;
2285         }
2286
2287         /* F64 -> I64 */
2288         /* Sigh, this is an almost exact copy of the F64 -> I32/I16
2289            case.  Unfortunately I see no easy way to avoid the
2290            duplication. */
2291         case Iop_F64toI64S: {
2292            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
2293            HReg tLo = newVRegI(env);
2294            HReg tHi = newVRegI(env);
2295
2296            /* Used several times ... */
2297            /* Careful ... this sharing is only safe because
2298	       zero_esp/four_esp do not hold any registers which the
2299	       register allocator could attempt to swizzle later. */
2300            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2301            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2302
2303            /* rf now holds the value to be converted, and rrm holds
2304               the rounding mode value, encoded as per the
2305               IRRoundingMode enum.  The first thing to do is set the
2306               FPU's rounding mode accordingly. */
2307
2308            /* Create a space for the format conversion. */
2309            /* subl $8, %esp */
2310            sub_from_esp(env, 8);
2311
2312            /* Set host rounding mode */
2313            set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2314
2315            /* gistll %rf, 0(%esp) */
2316            addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
2317
2318            /* movl 0(%esp), %dstLo */
2319            /* movl 4(%esp), %dstHi */
2320            addInstr(env, X86Instr_Alu32R(
2321                             Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2322            addInstr(env, X86Instr_Alu32R(
2323                             Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2324
2325            /* Restore default FPU rounding. */
2326            set_FPU_rounding_default( env );
2327
2328            /* addl $8, %esp */
2329            add_to_esp(env, 8);
2330
2331            *rHi = tHi;
2332            *rLo = tLo;
2333            return;
2334         }
2335
2336         case Iop_Add8x8:
2337            fn = (HWord)h_generic_calc_Add8x8; goto binnish;
2338         case Iop_Add16x4:
2339            fn = (HWord)h_generic_calc_Add16x4; goto binnish;
2340         case Iop_Add32x2:
2341            fn = (HWord)h_generic_calc_Add32x2; goto binnish;
2342
2343         case Iop_Avg8Ux8:
2344            fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
2345         case Iop_Avg16Ux4:
2346            fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;
2347
2348         case Iop_CmpEQ8x8:
2349            fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
2350         case Iop_CmpEQ16x4:
2351            fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
2352         case Iop_CmpEQ32x2:
2353            fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;
2354
2355         case Iop_CmpGT8Sx8:
2356            fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
2357         case Iop_CmpGT16Sx4:
2358            fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
2359         case Iop_CmpGT32Sx2:
2360            fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;
2361
2362         case Iop_InterleaveHI8x8:
2363            fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
2364         case Iop_InterleaveLO8x8:
2365            fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
2366         case Iop_InterleaveHI16x4:
2367            fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
2368         case Iop_InterleaveLO16x4:
2369            fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
2370         case Iop_InterleaveHI32x2:
2371            fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
2372         case Iop_InterleaveLO32x2:
2373            fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
2374         case Iop_CatOddLanes16x4:
2375            fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
2376         case Iop_CatEvenLanes16x4:
2377            fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
2378         case Iop_Perm8x8:
2379            fn = (HWord)h_generic_calc_Perm8x8; goto binnish;
2380
2381         case Iop_Max8Ux8:
2382            fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
2383         case Iop_Max16Sx4:
2384            fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
2385         case Iop_Min8Ux8:
2386            fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
2387         case Iop_Min16Sx4:
2388            fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;
2389
2390         case Iop_Mul16x4:
2391            fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
2392         case Iop_Mul32x2:
2393            fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
2394         case Iop_MulHi16Sx4:
2395            fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
2396         case Iop_MulHi16Ux4:
2397            fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;
2398
2399         case Iop_QAdd8Sx8:
2400            fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
2401         case Iop_QAdd16Sx4:
2402            fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
2403         case Iop_QAdd8Ux8:
2404            fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
2405         case Iop_QAdd16Ux4:
2406            fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;
2407
2408         case Iop_QNarrowBin32Sto16Sx4:
2409            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; goto binnish;
2410         case Iop_QNarrowBin16Sto8Sx8:
2411            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; goto binnish;
2412         case Iop_QNarrowBin16Sto8Ux8:
2413            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; goto binnish;
2414         case Iop_NarrowBin16to8x8:
2415            fn = (HWord)h_generic_calc_NarrowBin16to8x8; goto binnish;
2416         case Iop_NarrowBin32to16x4:
2417            fn = (HWord)h_generic_calc_NarrowBin32to16x4; goto binnish;
2418
2419         case Iop_QSub8Sx8:
2420            fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
2421         case Iop_QSub16Sx4:
2422            fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
2423         case Iop_QSub8Ux8:
2424            fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
2425         case Iop_QSub16Ux4:
2426            fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;
2427
2428         case Iop_Sub8x8:
2429            fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
2430         case Iop_Sub16x4:
2431            fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
2432         case Iop_Sub32x2:
2433            fn = (HWord)h_generic_calc_Sub32x2; goto binnish;
2434
2435         binnish: {
2436            /* Note: the following assumes all helpers are of
2437               signature
2438                  ULong fn ( ULong, ULong ), and they are
2439               not marked as regparm functions.
2440            */
2441            HReg xLo, xHi, yLo, yHi;
2442            HReg tLo = newVRegI(env);
2443            HReg tHi = newVRegI(env);
2444            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2445            addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
2446            addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
2447            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2448            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2449            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2450            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
2451            add_to_esp(env, 4*4);
2452            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2453            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2454            *rHi = tHi;
2455            *rLo = tLo;
2456            return;
2457         }
2458
2459         case Iop_ShlN32x2:
2460            fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
2461         case Iop_ShlN16x4:
2462            fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
2463         case Iop_ShlN8x8:
2464            fn = (HWord)h_generic_calc_ShlN8x8;  goto shifty;
2465         case Iop_ShrN32x2:
2466            fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
2467         case Iop_ShrN16x4:
2468            fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
2469         case Iop_SarN32x2:
2470            fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
2471         case Iop_SarN16x4:
2472            fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
2473         case Iop_SarN8x8:
2474            fn = (HWord)h_generic_calc_SarN8x8;  goto shifty;
2475         shifty: {
2476            /* Note: the following assumes all helpers are of
2477               signature
2478                  ULong fn ( ULong, UInt ), and they are
2479               not marked as regparm functions.
2480            */
2481            HReg xLo, xHi;
2482            HReg tLo = newVRegI(env);
2483            HReg tHi = newVRegI(env);
2484            X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2485            addInstr(env, X86Instr_Push(y));
2486            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2487            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2488            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2489            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
2490            add_to_esp(env, 3*4);
2491            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2492            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2493            *rHi = tHi;
2494            *rLo = tLo;
2495            return;
2496         }
2497
2498         default:
2499            break;
2500      }
2501   } /* if (e->tag == Iex_Binop) */
2502
2503
2504   /* --------- UNARY ops --------- */
2505   if (e->tag == Iex_Unop) {
2506      switch (e->Iex.Unop.op) {
2507
2508         /* 32Sto64(e) */
2509         case Iop_32Sto64: {
2510            HReg tLo = newVRegI(env);
2511            HReg tHi = newVRegI(env);
2512            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2513            addInstr(env, mk_iMOVsd_RR(src,tHi));
2514            addInstr(env, mk_iMOVsd_RR(src,tLo));
2515            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
2516            *rHi = tHi;
2517            *rLo = tLo;
2518            return;
2519         }
2520
2521         /* 32Uto64(e) */
2522         case Iop_32Uto64: {
2523            HReg tLo = newVRegI(env);
2524            HReg tHi = newVRegI(env);
2525            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2526            addInstr(env, mk_iMOVsd_RR(src,tLo));
2527            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2528            *rHi = tHi;
2529            *rLo = tLo;
2530            return;
2531         }
2532
2533         /* 16Uto64(e) */
2534         case Iop_16Uto64: {
2535            HReg tLo = newVRegI(env);
2536            HReg tHi = newVRegI(env);
2537            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2538            addInstr(env, mk_iMOVsd_RR(src,tLo));
2539            addInstr(env, X86Instr_Alu32R(Xalu_AND,
2540                                          X86RMI_Imm(0xFFFF), tLo));
2541            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2542            *rHi = tHi;
2543            *rLo = tLo;
2544            return;
2545         }
2546
2547         /* V128{HI}to64 */
2548         case Iop_V128HIto64:
2549         case Iop_V128to64: {
2550            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
2551            HReg tLo = newVRegI(env);
2552            HReg tHi = newVRegI(env);
2553            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
2554            X86AMode* esp0  = X86AMode_IR(0,     hregX86_ESP());
2555            X86AMode* espLO = X86AMode_IR(off,   hregX86_ESP());
2556            X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
2557            sub_from_esp(env, 16);
2558            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
2559            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2560                                           X86RMI_Mem(espLO), tLo ));
2561            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2562                                           X86RMI_Mem(espHI), tHi ));
2563            add_to_esp(env, 16);
2564            *rHi = tHi;
2565            *rLo = tLo;
2566            return;
2567         }
2568
2569         /* could do better than this, but for now ... */
2570         case Iop_1Sto64: {
2571            HReg tLo = newVRegI(env);
2572            HReg tHi = newVRegI(env);
2573            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
2574            addInstr(env, X86Instr_Set32(cond,tLo));
2575            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
2576            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
2577            addInstr(env, mk_iMOVsd_RR(tLo, tHi));
2578            *rHi = tHi;
2579            *rLo = tLo;
2580            return;
2581         }
2582
2583         /* Not64(e) */
2584         case Iop_Not64: {
2585            HReg tLo = newVRegI(env);
2586            HReg tHi = newVRegI(env);
2587            HReg sHi, sLo;
2588            iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
2589            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2590            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2591            addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
2592            addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
2593            *rHi = tHi;
2594            *rLo = tLo;
2595            return;
2596         }
2597
2598         /* Left64(e) */
2599         case Iop_Left64: {
2600            HReg yLo, yHi;
2601            HReg tLo = newVRegI(env);
2602            HReg tHi = newVRegI(env);
2603            /* yHi:yLo = arg */
2604            iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
2605            /* tLo = 0 - yLo, and set carry */
2606            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
2607            addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2608            /* tHi = 0 - yHi - carry */
2609            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2610            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2611            /* So now we have tHi:tLo = -arg.  To finish off, or 'arg'
2612               back in, so as to give the final result
2613               tHi:tLo = arg | -arg. */
2614            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
2615            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
2616            *rHi = tHi;
2617            *rLo = tLo;
2618            return;
2619         }
2620
2621         /* --- patterns rooted at: CmpwNEZ64 --- */
2622
2623         /* CmpwNEZ64(e) */
2624         case Iop_CmpwNEZ64: {
2625
2626         DECLARE_PATTERN(p_CmpwNEZ64_Or64);
2627         DEFINE_PATTERN(p_CmpwNEZ64_Or64,
2628                        unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
2629         if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
2630            /* CmpwNEZ64(Or64(x,y)) */
2631            HReg xHi,xLo,yHi,yLo;
2632            HReg xBoth = newVRegI(env);
2633            HReg merged = newVRegI(env);
2634            HReg tmp2 = newVRegI(env);
2635
2636            iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
2637            addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
2638            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2639                                          X86RMI_Reg(xLo),xBoth));
2640
2641            iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
2642            addInstr(env, mk_iMOVsd_RR(yHi,merged));
2643            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2644                                          X86RMI_Reg(yLo),merged));
2645            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2646                                             X86RMI_Reg(xBoth),merged));
2647
2648            /* tmp2 = (merged | -merged) >>s 31 */
2649            addInstr(env, mk_iMOVsd_RR(merged,tmp2));
2650            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2651            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2652                                          X86RMI_Reg(merged), tmp2));
2653            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2654            *rHi = tmp2;
2655            *rLo = tmp2;
2656            return;
2657         } else {
2658            /* CmpwNEZ64(e) */
2659            HReg srcLo, srcHi;
2660            HReg tmp1  = newVRegI(env);
2661            HReg tmp2  = newVRegI(env);
2662            /* srcHi:srcLo = arg */
2663            iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
2664            /* tmp1 = srcHi | srcLo */
2665            addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
2666            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2667                                          X86RMI_Reg(srcLo), tmp1));
2668            /* tmp2 = (tmp1 | -tmp1) >>s 31 */
2669            addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
2670            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2671            addInstr(env, X86Instr_Alu32R(Xalu_OR,
2672                                          X86RMI_Reg(tmp1), tmp2));
2673            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2674            *rHi = tmp2;
2675            *rLo = tmp2;
2676            return;
2677         }
2678         }
2679
2680         /* ReinterpF64asI64(e) */
2681         /* Given an IEEE754 double, produce an I64 with the same bit
2682            pattern. */
2683         case Iop_ReinterpF64asI64: {
2684            HReg rf   = iselDblExpr(env, e->Iex.Unop.arg);
2685            HReg tLo  = newVRegI(env);
2686            HReg tHi  = newVRegI(env);
2687            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2688            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2689            /* paranoia */
2690            set_FPU_rounding_default(env);
2691            /* subl $8, %esp */
2692            sub_from_esp(env, 8);
2693            /* gstD %rf, 0(%esp) */
2694            addInstr(env,
2695                     X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
2696            /* movl 0(%esp), %tLo */
2697            addInstr(env,
2698                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2699            /* movl 4(%esp), %tHi */
2700            addInstr(env,
2701                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2702            /* addl $8, %esp */
2703            add_to_esp(env, 8);
2704            *rHi = tHi;
2705            *rLo = tLo;
2706            return;
2707         }
2708
2709         case Iop_CmpNEZ32x2:
2710            fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
2711         case Iop_CmpNEZ16x4:
2712            fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
2713         case Iop_CmpNEZ8x8:
2714            fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
2715         unish: {
2716            /* Note: the following assumes all helpers are of
2717               signature
2718                  ULong fn ( ULong ), and they are
2719               not marked as regparm functions.
2720            */
2721            HReg xLo, xHi;
2722            HReg tLo = newVRegI(env);
2723            HReg tHi = newVRegI(env);
2724            iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
2725            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2726            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2727            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
2728            add_to_esp(env, 2*4);
2729            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2730            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2731            *rHi = tHi;
2732            *rLo = tLo;
2733            return;
2734         }
2735
2736         default:
2737            break;
2738      }
2739   } /* if (e->tag == Iex_Unop) */
2740
2741
2742   /* --------- CCALL --------- */
2743   if (e->tag == Iex_CCall) {
2744      HReg tLo = newVRegI(env);
2745      HReg tHi = newVRegI(env);
2746
2747      /* Marshal args, do the call, clear stack. */
2748      doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
2749
2750      addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2751      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2752      *rHi = tHi;
2753      *rLo = tLo;
2754      return;
2755   }
2756
2757   ppIRExpr(e);
2758   vpanic("iselInt64Expr");
2759}
2760
2761
2762/*---------------------------------------------------------*/
2763/*--- ISEL: Floating point expressions (32 bit)         ---*/
2764/*---------------------------------------------------------*/
2765
2766/* Nothing interesting here; really just wrappers for
2767   64-bit stuff. */
2768
2769static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
2770{
2771   HReg r = iselFltExpr_wrk( env, e );
2772#  if 0
2773   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2774#  endif
2775   vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
2776   vassert(hregIsVirtual(r));
2777   return r;
2778}
2779
2780/* DO NOT CALL THIS DIRECTLY */
2781static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
2782{
2783   IRType ty = typeOfIRExpr(env->type_env,e);
2784   vassert(ty == Ity_F32);
2785
2786   if (e->tag == Iex_RdTmp) {
2787      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2788   }
2789
2790   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2791      X86AMode* am;
2792      HReg res = newVRegF(env);
2793      vassert(e->Iex.Load.ty == Ity_F32);
2794      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2795      addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
2796      return res;
2797   }
2798
2799   if (e->tag == Iex_Binop
2800       && e->Iex.Binop.op == Iop_F64toF32) {
2801      /* Although the result is still held in a standard FPU register,
2802         we need to round it to reflect the loss of accuracy/range
2803         entailed in casting it to a 32-bit float. */
2804      HReg dst = newVRegF(env);
2805      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2806      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2807      addInstr(env, X86Instr_Fp64to32(src,dst));
2808      set_FPU_rounding_default( env );
2809      return dst;
2810   }
2811
2812   if (e->tag == Iex_Get) {
2813      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
2814                                  hregX86_EBP() );
2815      HReg res = newVRegF(env);
2816      addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
2817      return res;
2818   }
2819
2820   if (e->tag == Iex_Unop
2821       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2822       /* Given an I32, produce an IEEE754 float with the same bit
2823          pattern. */
2824      HReg    dst = newVRegF(env);
2825      X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
2826      /* paranoia */
2827      addInstr(env, X86Instr_Push(rmi));
2828      addInstr(env, X86Instr_FpLdSt(
2829                       True/*load*/, 4, dst,
2830                       X86AMode_IR(0, hregX86_ESP())));
2831      add_to_esp(env, 4);
2832      return dst;
2833   }
2834
2835   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2836      HReg rf  = iselFltExpr(env, e->Iex.Binop.arg2);
2837      HReg dst = newVRegF(env);
2838
2839      /* rf now holds the value to be rounded.  The first thing to do
2840         is set the FPU's rounding mode accordingly. */
2841
2842      /* Set host rounding mode */
2843      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2844
2845      /* grndint %rf, %dst */
2846      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
2847
2848      /* Restore default FPU rounding. */
2849      set_FPU_rounding_default( env );
2850
2851      return dst;
2852   }
2853
2854   ppIRExpr(e);
2855   vpanic("iselFltExpr_wrk");
2856}
2857
2858
2859/*---------------------------------------------------------*/
2860/*--- ISEL: Floating point expressions (64 bit)         ---*/
2861/*---------------------------------------------------------*/
2862
2863/* Compute a 64-bit floating point value into a register, the identity
2864   of which is returned.  As with iselIntExpr_R, the reg may be either
2865   real or virtual; in any case it must not be changed by subsequent
2866   code emitted by the caller.  */
2867
2868/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2869
2870    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2871    ----                  ---------   -----------   -----------
2872    signalling NaN        u           2047 (max)    .0uuuuu---u
2873                                                    (with at least
2874                                                     one 1 bit)
2875    quiet NaN             u           2047 (max)    .1uuuuu---u
2876
2877    negative infinity     1           2047 (max)    .000000---0
2878
2879    positive infinity     0           2047 (max)    .000000---0
2880
2881    negative zero         1           0             .000000---0
2882
2883    positive zero         0           0             .000000---0
2884*/
2885
2886static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
2887{
2888   HReg r = iselDblExpr_wrk( env, e );
2889#  if 0
2890   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2891#  endif
2892   vassert(hregClass(r) == HRcFlt64);
2893   vassert(hregIsVirtual(r));
2894   return r;
2895}
2896
2897/* DO NOT CALL THIS DIRECTLY */
2898static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
2899{
2900   IRType ty = typeOfIRExpr(env->type_env,e);
2901   vassert(e);
2902   vassert(ty == Ity_F64);
2903
2904   if (e->tag == Iex_RdTmp) {
2905      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2906   }
2907
2908   if (e->tag == Iex_Const) {
2909      union { UInt u32x2[2]; ULong u64; Double f64; } u;
2910      HReg freg = newVRegF(env);
2911      vassert(sizeof(u) == 8);
2912      vassert(sizeof(u.u64) == 8);
2913      vassert(sizeof(u.f64) == 8);
2914      vassert(sizeof(u.u32x2) == 8);
2915
2916      if (e->Iex.Const.con->tag == Ico_F64) {
2917         u.f64 = e->Iex.Const.con->Ico.F64;
2918      }
2919      else if (e->Iex.Const.con->tag == Ico_F64i) {
2920         u.u64 = e->Iex.Const.con->Ico.F64i;
2921      }
2922      else
2923         vpanic("iselDblExpr(x86): const");
2924
2925      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
2926      addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
2927      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
2928                                    X86AMode_IR(0, hregX86_ESP())));
2929      add_to_esp(env, 8);
2930      return freg;
2931   }
2932
2933   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2934      X86AMode* am;
2935      HReg res = newVRegF(env);
2936      vassert(e->Iex.Load.ty == Ity_F64);
2937      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2938      addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
2939      return res;
2940   }
2941
2942   if (e->tag == Iex_Get) {
2943      X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
2944                                  hregX86_EBP() );
2945      HReg res = newVRegF(env);
2946      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
2947      return res;
2948   }
2949
2950   if (e->tag == Iex_GetI) {
2951      X86AMode* am
2952         = genGuestArrayOffset(
2953              env, e->Iex.GetI.descr,
2954                   e->Iex.GetI.ix, e->Iex.GetI.bias );
2955      HReg res = newVRegF(env);
2956      addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
2957      return res;
2958   }
2959
2960   if (e->tag == Iex_Triop) {
2961      X86FpOp fpop = Xfp_INVALID;
2962      IRTriop *triop = e->Iex.Triop.details;
2963      switch (triop->op) {
2964         case Iop_AddF64:    fpop = Xfp_ADD; break;
2965         case Iop_SubF64:    fpop = Xfp_SUB; break;
2966         case Iop_MulF64:    fpop = Xfp_MUL; break;
2967         case Iop_DivF64:    fpop = Xfp_DIV; break;
2968         case Iop_ScaleF64:  fpop = Xfp_SCALE; break;
2969         case Iop_Yl2xF64:   fpop = Xfp_YL2X; break;
2970         case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
2971         case Iop_AtanF64:   fpop = Xfp_ATAN; break;
2972         case Iop_PRemF64:   fpop = Xfp_PREM; break;
2973         case Iop_PRem1F64:  fpop = Xfp_PREM1; break;
2974         default: break;
2975      }
2976      if (fpop != Xfp_INVALID) {
2977         HReg res  = newVRegF(env);
2978         HReg srcL = iselDblExpr(env, triop->arg2);
2979         HReg srcR = iselDblExpr(env, triop->arg3);
2980         /* XXXROUNDINGFIXME */
2981         /* set roundingmode here */
2982         addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
2983	 if (fpop != Xfp_ADD && fpop != Xfp_SUB
2984	     && fpop != Xfp_MUL && fpop != Xfp_DIV)
2985            roundToF64(env, res);
2986         return res;
2987      }
2988   }
2989
2990   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2991      HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
2992      HReg dst = newVRegF(env);
2993
2994      /* rf now holds the value to be rounded.  The first thing to do
2995         is set the FPU's rounding mode accordingly. */
2996
2997      /* Set host rounding mode */
2998      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2999
3000      /* grndint %rf, %dst */
3001      addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
3002
3003      /* Restore default FPU rounding. */
3004      set_FPU_rounding_default( env );
3005
3006      return dst;
3007   }
3008
3009   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3010      HReg dst = newVRegF(env);
3011      HReg rHi,rLo;
3012      iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
3013      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3014      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3015
3016      /* Set host rounding mode */
3017      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3018
3019      addInstr(env, X86Instr_FpLdStI(
3020                       True/*load*/, 8, dst,
3021                       X86AMode_IR(0, hregX86_ESP())));
3022
3023      /* Restore default FPU rounding. */
3024      set_FPU_rounding_default( env );
3025
3026      add_to_esp(env, 8);
3027      return dst;
3028   }
3029
3030   if (e->tag == Iex_Binop) {
3031      X86FpOp fpop = Xfp_INVALID;
3032      switch (e->Iex.Binop.op) {
3033         case Iop_SinF64:  fpop = Xfp_SIN; break;
3034         case Iop_CosF64:  fpop = Xfp_COS; break;
3035         case Iop_TanF64:  fpop = Xfp_TAN; break;
3036         case Iop_2xm1F64: fpop = Xfp_2XM1; break;
3037         case Iop_SqrtF64: fpop = Xfp_SQRT; break;
3038         default: break;
3039      }
3040      if (fpop != Xfp_INVALID) {
3041         HReg res = newVRegF(env);
3042         HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
3043         /* XXXROUNDINGFIXME */
3044         /* set roundingmode here */
3045         addInstr(env, X86Instr_FpUnary(fpop,src,res));
3046	 if (fpop != Xfp_SQRT
3047             && fpop != Xfp_NEG && fpop != Xfp_ABS)
3048            roundToF64(env, res);
3049         return res;
3050      }
3051   }
3052
3053   if (e->tag == Iex_Unop) {
3054      X86FpOp fpop = Xfp_INVALID;
3055      switch (e->Iex.Unop.op) {
3056         case Iop_NegF64:  fpop = Xfp_NEG; break;
3057         case Iop_AbsF64:  fpop = Xfp_ABS; break;
3058         default: break;
3059      }
3060      if (fpop != Xfp_INVALID) {
3061         HReg res = newVRegF(env);
3062         HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3063         addInstr(env, X86Instr_FpUnary(fpop,src,res));
3064	 if (fpop != Xfp_NEG && fpop != Xfp_ABS)
3065            roundToF64(env, res);
3066         return res;
3067      }
3068   }
3069
3070   if (e->tag == Iex_Unop) {
3071      switch (e->Iex.Unop.op) {
3072         case Iop_I32StoF64: {
3073            HReg dst = newVRegF(env);
3074            HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3075            addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3076            set_FPU_rounding_default(env);
3077            addInstr(env, X86Instr_FpLdStI(
3078                             True/*load*/, 4, dst,
3079                             X86AMode_IR(0, hregX86_ESP())));
3080	    add_to_esp(env, 4);
3081            return dst;
3082         }
3083         case Iop_ReinterpI64asF64: {
3084            /* Given an I64, produce an IEEE754 double with the same
3085               bit pattern. */
3086            HReg dst = newVRegF(env);
3087            HReg rHi, rLo;
3088	    iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
3089            /* paranoia */
3090            set_FPU_rounding_default(env);
3091            addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3092            addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3093            addInstr(env, X86Instr_FpLdSt(
3094                             True/*load*/, 8, dst,
3095                             X86AMode_IR(0, hregX86_ESP())));
3096	    add_to_esp(env, 8);
3097            return dst;
3098	 }
3099         case Iop_F32toF64: {
3100            /* this is a no-op */
3101            HReg res = iselFltExpr(env, e->Iex.Unop.arg);
3102            return res;
3103	 }
3104         default:
3105            break;
3106      }
3107   }
3108
3109   /* --------- MULTIPLEX --------- */
3110   if (e->tag == Iex_Mux0X) {
3111     if (ty == Ity_F64
3112         && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
3113        X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
3114        HReg rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
3115        HReg r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
3116        HReg dst = newVRegF(env);
3117        addInstr(env, X86Instr_FpUnary(Xfp_MOV,rX,dst));
3118        addInstr(env, X86Instr_Test32(0xFF, r8));
3119        addInstr(env, X86Instr_FpCMov(Xcc_Z,r0,dst));
3120        return dst;
3121      }
3122   }
3123
3124   ppIRExpr(e);
3125   vpanic("iselDblExpr_wrk");
3126}
3127
3128
3129/*---------------------------------------------------------*/
3130/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3131/*---------------------------------------------------------*/
3132
3133static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
3134{
3135   HReg r = iselVecExpr_wrk( env, e );
3136#  if 0
3137   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3138#  endif
3139   vassert(hregClass(r) == HRcVec128);
3140   vassert(hregIsVirtual(r));
3141   return r;
3142}
3143
3144
3145/* DO NOT CALL THIS DIRECTLY */
3146static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
3147{
3148
3149#  define REQUIRE_SSE1                                    \
3150      do { if (env->hwcaps == 0/*baseline, no sse*/)      \
3151              goto vec_fail;                              \
3152      } while (0)
3153
3154#  define REQUIRE_SSE2                                    \
3155      do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2))  \
3156              goto vec_fail;                              \
3157      } while (0)
3158
3159#  define SSE2_OR_ABOVE                                   \
3160       (env->hwcaps & VEX_HWCAPS_X86_SSE2)
3161
3162   HWord     fn = 0; /* address of helper fn, if required */
3163   MatchInfo mi;
3164   Bool      arg1isEReg = False;
3165   X86SseOp  op = Xsse_INVALID;
3166   IRType    ty = typeOfIRExpr(env->type_env,e);
3167   vassert(e);
3168   vassert(ty == Ity_V128);
3169
3170   REQUIRE_SSE1;
3171
3172   if (e->tag == Iex_RdTmp) {
3173      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3174   }
3175
3176   if (e->tag == Iex_Get) {
3177      HReg dst = newVRegV(env);
3178      addInstr(env, X86Instr_SseLdSt(
3179                       True/*load*/,
3180                       dst,
3181                       X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
3182                    )
3183              );
3184      return dst;
3185   }
3186
3187   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3188      HReg      dst = newVRegV(env);
3189      X86AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3190      addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
3191      return dst;
3192   }
3193
3194   if (e->tag == Iex_Const) {
3195      HReg dst = newVRegV(env);
3196      vassert(e->Iex.Const.con->tag == Ico_V128);
3197      addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
3198      return dst;
3199   }
3200
3201   if (e->tag == Iex_Unop) {
3202
3203   if (SSE2_OR_ABOVE) {
3204      /* 64UtoV128(LDle:I64(addr)) */
3205      DECLARE_PATTERN(p_zwiden_load64);
3206      DEFINE_PATTERN(p_zwiden_load64,
3207                     unop(Iop_64UtoV128,
3208                          IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
3209      if (matchIRExpr(&mi, p_zwiden_load64, e)) {
3210         X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
3211         HReg dst = newVRegV(env);
3212         addInstr(env, X86Instr_SseLdzLO(8, dst, am));
3213         return dst;
3214      }
3215   }
3216
3217   switch (e->Iex.Unop.op) {
3218
3219      case Iop_NotV128: {
3220         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3221         return do_sse_Not128(env, arg);
3222      }
3223
3224      case Iop_CmpNEZ64x2: {
3225         /* We can use SSE2 instructions for this. */
3226         /* Ideally, we want to do a 64Ix2 comparison against zero of
3227            the operand.  Problem is no such insn exists.  Solution
3228            therefore is to do a 32Ix4 comparison instead, and bitwise-
3229            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3230            let the not'd result of this initial comparison be a:b:c:d.
3231            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3232            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3233            giving the required result.
3234
3235            The required selection sequence is 2,3,0,1, which
3236            according to Intel's documentation means the pshufd
3237            literal value is 0xB1, that is,
3238            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3239         */
3240         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3241         HReg tmp  = newVRegV(env);
3242         HReg dst  = newVRegV(env);
3243         REQUIRE_SSE2;
3244         addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
3245         addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
3246         tmp = do_sse_Not128(env, tmp);
3247         addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
3248         addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
3249         return dst;
3250      }
3251
3252      case Iop_CmpNEZ32x4: {
3253         /* Sigh, we have to generate lousy code since this has to
3254            work on SSE1 hosts */
3255         /* basically, the idea is: for each lane:
3256               movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
3257               sbbl %r, %r               (now %r = 1Sto32(CF))
3258               movl %r, lane
3259         */
3260         Int       i;
3261         X86AMode* am;
3262         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3263         HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
3264         HReg      dst  = newVRegV(env);
3265         HReg      r32  = newVRegI(env);
3266         sub_from_esp(env, 16);
3267         addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
3268         for (i = 0; i < 4; i++) {
3269            am = X86AMode_IR(i*4, hregX86_ESP());
3270            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
3271            addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
3272            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
3273            addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
3274         }
3275         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3276         add_to_esp(env, 16);
3277         return dst;
3278      }
3279
3280      case Iop_CmpNEZ8x16:
3281      case Iop_CmpNEZ16x8: {
3282         /* We can use SSE2 instructions for this. */
3283         HReg arg;
3284         HReg vec0 = newVRegV(env);
3285         HReg vec1 = newVRegV(env);
3286         HReg dst  = newVRegV(env);
3287         X86SseOp cmpOp
3288            = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
3289                                             : Xsse_CMPEQ8;
3290         REQUIRE_SSE2;
3291         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
3292         addInstr(env, mk_vMOVsd_RR(vec0, vec1));
3293         addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
3294         /* defer arg computation to here so as to give CMPEQF as long
3295            as possible to complete */
3296         arg = iselVecExpr(env, e->Iex.Unop.arg);
3297         /* vec0 is all 0s; vec1 is all 1s */
3298         addInstr(env, mk_vMOVsd_RR(arg, dst));
3299         /* 16x8 or 8x16 comparison == */
3300         addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
3301         /* invert result */
3302         addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
3303         return dst;
3304      }
3305
3306      case Iop_Recip32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
3307      case Iop_RSqrt32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
3308      case Iop_Sqrt32Fx4:  op = Xsse_SQRTF;  goto do_32Fx4_unary;
3309      do_32Fx4_unary:
3310      {
3311         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3312         HReg dst = newVRegV(env);
3313         addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
3314         return dst;
3315      }
3316
3317      case Iop_Recip64Fx2: op = Xsse_RCPF;   goto do_64Fx2_unary;
3318      case Iop_RSqrt64Fx2: op = Xsse_RSQRTF; goto do_64Fx2_unary;
3319      case Iop_Sqrt64Fx2:  op = Xsse_SQRTF;  goto do_64Fx2_unary;
3320      do_64Fx2_unary:
3321      {
3322         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3323         HReg dst = newVRegV(env);
3324         REQUIRE_SSE2;
3325         addInstr(env, X86Instr_Sse64Fx2(op, arg, dst));
3326         return dst;
3327      }
3328
3329      case Iop_Recip32F0x4: op = Xsse_RCPF;   goto do_32F0x4_unary;
3330      case Iop_RSqrt32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
3331      case Iop_Sqrt32F0x4:  op = Xsse_SQRTF;  goto do_32F0x4_unary;
3332      do_32F0x4_unary:
3333      {
3334         /* A bit subtle.  We have to copy the arg to the result
3335            register first, because actually doing the SSE scalar insn
3336            leaves the upper 3/4 of the destination register
3337            unchanged.  Whereas the required semantics of these
3338            primops is that the upper 3/4 is simply copied in from the
3339            argument. */
3340         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3341         HReg dst = newVRegV(env);
3342         addInstr(env, mk_vMOVsd_RR(arg, dst));
3343         addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
3344         return dst;
3345      }
3346
3347      case Iop_Recip64F0x2: op = Xsse_RCPF;   goto do_64F0x2_unary;
3348      case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
3349      case Iop_Sqrt64F0x2:  op = Xsse_SQRTF;  goto do_64F0x2_unary;
3350      do_64F0x2_unary:
3351      {
3352         /* A bit subtle.  We have to copy the arg to the result
3353            register first, because actually doing the SSE scalar insn
3354            leaves the upper half of the destination register
3355            unchanged.  Whereas the required semantics of these
3356            primops is that the upper half is simply copied in from the
3357            argument. */
3358         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3359         HReg dst = newVRegV(env);
3360         REQUIRE_SSE2;
3361         addInstr(env, mk_vMOVsd_RR(arg, dst));
3362         addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
3363         return dst;
3364      }
3365
3366      case Iop_32UtoV128: {
3367         HReg      dst  = newVRegV(env);
3368         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3369         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3370         addInstr(env, X86Instr_Push(rmi));
3371	 addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
3372         add_to_esp(env, 4);
3373         return dst;
3374      }
3375
3376      case Iop_64UtoV128: {
3377         HReg      rHi, rLo;
3378         HReg      dst  = newVRegV(env);
3379         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3380         iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
3381         addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3382         addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3383	 addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
3384         add_to_esp(env, 8);
3385         return dst;
3386      }
3387
3388      default:
3389         break;
3390   } /* switch (e->Iex.Unop.op) */
3391   } /* if (e->tag == Iex_Unop) */
3392
3393   if (e->tag == Iex_Binop) {
3394   switch (e->Iex.Binop.op) {
3395
3396      case Iop_SetV128lo32: {
3397         HReg dst = newVRegV(env);
3398         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3399         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3400         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3401         sub_from_esp(env, 16);
3402         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3403         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
3404         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3405         add_to_esp(env, 16);
3406         return dst;
3407      }
3408
3409      case Iop_SetV128lo64: {
3410         HReg dst = newVRegV(env);
3411         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3412         HReg srcIhi, srcIlo;
3413         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3414         X86AMode* esp4 = advance4(esp0);
3415         iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
3416         sub_from_esp(env, 16);
3417         addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3418         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
3419         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
3420         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3421         add_to_esp(env, 16);
3422         return dst;
3423      }
3424
3425      case Iop_64HLtoV128: {
3426         HReg r3, r2, r1, r0;
3427         X86AMode* esp0  = X86AMode_IR(0, hregX86_ESP());
3428         X86AMode* esp4  = advance4(esp0);
3429         X86AMode* esp8  = advance4(esp4);
3430         X86AMode* esp12 = advance4(esp8);
3431         HReg dst = newVRegV(env);
3432	 /* do this via the stack (easy, convenient, etc) */
3433         sub_from_esp(env, 16);
3434         /* Do the less significant 64 bits */
3435         iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
3436         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
3437         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
3438         /* Do the more significant 64 bits */
3439         iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
3440         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
3441         addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
3442	 /* Fetch result back from stack. */
3443         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3444         add_to_esp(env, 16);
3445         return dst;
3446      }
3447
3448      case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
3449      case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
3450      case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
3451      case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
3452      case Iop_Add32Fx4:   op = Xsse_ADDF;   goto do_32Fx4;
3453      case Iop_Div32Fx4:   op = Xsse_DIVF;   goto do_32Fx4;
3454      case Iop_Max32Fx4:   op = Xsse_MAXF;   goto do_32Fx4;
3455      case Iop_Min32Fx4:   op = Xsse_MINF;   goto do_32Fx4;
3456      case Iop_Mul32Fx4:   op = Xsse_MULF;   goto do_32Fx4;
3457      case Iop_Sub32Fx4:   op = Xsse_SUBF;   goto do_32Fx4;
3458      do_32Fx4:
3459      {
3460         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3461         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3462         HReg dst = newVRegV(env);
3463         addInstr(env, mk_vMOVsd_RR(argL, dst));
3464         addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
3465         return dst;
3466      }
3467
3468      case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
3469      case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
3470      case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
3471      case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
3472      case Iop_Add64Fx2:   op = Xsse_ADDF;   goto do_64Fx2;
3473      case Iop_Div64Fx2:   op = Xsse_DIVF;   goto do_64Fx2;
3474      case Iop_Max64Fx2:   op = Xsse_MAXF;   goto do_64Fx2;
3475      case Iop_Min64Fx2:   op = Xsse_MINF;   goto do_64Fx2;
3476      case Iop_Mul64Fx2:   op = Xsse_MULF;   goto do_64Fx2;
3477      case Iop_Sub64Fx2:   op = Xsse_SUBF;   goto do_64Fx2;
3478      do_64Fx2:
3479      {
3480         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3481         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3482         HReg dst = newVRegV(env);
3483         REQUIRE_SSE2;
3484         addInstr(env, mk_vMOVsd_RR(argL, dst));
3485         addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
3486         return dst;
3487      }
3488
3489      case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
3490      case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
3491      case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
3492      case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
3493      case Iop_Add32F0x4:   op = Xsse_ADDF;   goto do_32F0x4;
3494      case Iop_Div32F0x4:   op = Xsse_DIVF;   goto do_32F0x4;
3495      case Iop_Max32F0x4:   op = Xsse_MAXF;   goto do_32F0x4;
3496      case Iop_Min32F0x4:   op = Xsse_MINF;   goto do_32F0x4;
3497      case Iop_Mul32F0x4:   op = Xsse_MULF;   goto do_32F0x4;
3498      case Iop_Sub32F0x4:   op = Xsse_SUBF;   goto do_32F0x4;
3499      do_32F0x4: {
3500         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3501         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3502         HReg dst = newVRegV(env);
3503         addInstr(env, mk_vMOVsd_RR(argL, dst));
3504         addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
3505         return dst;
3506      }
3507
3508      case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
3509      case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
3510      case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
3511      case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
3512      case Iop_Add64F0x2:   op = Xsse_ADDF;   goto do_64F0x2;
3513      case Iop_Div64F0x2:   op = Xsse_DIVF;   goto do_64F0x2;
3514      case Iop_Max64F0x2:   op = Xsse_MAXF;   goto do_64F0x2;
3515      case Iop_Min64F0x2:   op = Xsse_MINF;   goto do_64F0x2;
3516      case Iop_Mul64F0x2:   op = Xsse_MULF;   goto do_64F0x2;
3517      case Iop_Sub64F0x2:   op = Xsse_SUBF;   goto do_64F0x2;
3518      do_64F0x2: {
3519         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3520         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3521         HReg dst = newVRegV(env);
3522         REQUIRE_SSE2;
3523         addInstr(env, mk_vMOVsd_RR(argL, dst));
3524         addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
3525         return dst;
3526      }
3527
3528      case Iop_QNarrowBin32Sto16Sx8:
3529         op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3530      case Iop_QNarrowBin16Sto8Sx16:
3531         op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3532      case Iop_QNarrowBin16Sto8Ux16:
3533         op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3534
3535      case Iop_InterleaveHI8x16:
3536         op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3537      case Iop_InterleaveHI16x8:
3538         op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3539      case Iop_InterleaveHI32x4:
3540         op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3541      case Iop_InterleaveHI64x2:
3542         op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3543
3544      case Iop_InterleaveLO8x16:
3545         op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3546      case Iop_InterleaveLO16x8:
3547         op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3548      case Iop_InterleaveLO32x4:
3549         op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3550      case Iop_InterleaveLO64x2:
3551         op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3552
3553      case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
3554      case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
3555      case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
3556      case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
3557      case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
3558      case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
3559      case Iop_Add64x2:    op = Xsse_ADD64;    goto do_SseReRg;
3560      case Iop_QAdd8Sx16:  op = Xsse_QADD8S;   goto do_SseReRg;
3561      case Iop_QAdd16Sx8:  op = Xsse_QADD16S;  goto do_SseReRg;
3562      case Iop_QAdd8Ux16:  op = Xsse_QADD8U;   goto do_SseReRg;
3563      case Iop_QAdd16Ux8:  op = Xsse_QADD16U;  goto do_SseReRg;
3564      case Iop_Avg8Ux16:   op = Xsse_AVG8U;    goto do_SseReRg;
3565      case Iop_Avg16Ux8:   op = Xsse_AVG16U;   goto do_SseReRg;
3566      case Iop_CmpEQ8x16:  op = Xsse_CMPEQ8;   goto do_SseReRg;
3567      case Iop_CmpEQ16x8:  op = Xsse_CMPEQ16;  goto do_SseReRg;
3568      case Iop_CmpEQ32x4:  op = Xsse_CMPEQ32;  goto do_SseReRg;
3569      case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S;  goto do_SseReRg;
3570      case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
3571      case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
3572      case Iop_Max16Sx8:   op = Xsse_MAX16S;   goto do_SseReRg;
3573      case Iop_Max8Ux16:   op = Xsse_MAX8U;    goto do_SseReRg;
3574      case Iop_Min16Sx8:   op = Xsse_MIN16S;   goto do_SseReRg;
3575      case Iop_Min8Ux16:   op = Xsse_MIN8U;    goto do_SseReRg;
3576      case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
3577      case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
3578      case Iop_Mul16x8:    op = Xsse_MUL16;    goto do_SseReRg;
3579      case Iop_Sub8x16:    op = Xsse_SUB8;     goto do_SseReRg;
3580      case Iop_Sub16x8:    op = Xsse_SUB16;    goto do_SseReRg;
3581      case Iop_Sub32x4:    op = Xsse_SUB32;    goto do_SseReRg;
3582      case Iop_Sub64x2:    op = Xsse_SUB64;    goto do_SseReRg;
3583      case Iop_QSub8Sx16:  op = Xsse_QSUB8S;   goto do_SseReRg;
3584      case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
3585      case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
3586      case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
3587      do_SseReRg: {
3588         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3589         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3590         HReg dst = newVRegV(env);
3591         if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
3592            REQUIRE_SSE2;
3593         if (arg1isEReg) {
3594            addInstr(env, mk_vMOVsd_RR(arg2, dst));
3595            addInstr(env, X86Instr_SseReRg(op, arg1, dst));
3596         } else {
3597            addInstr(env, mk_vMOVsd_RR(arg1, dst));
3598            addInstr(env, X86Instr_SseReRg(op, arg2, dst));
3599         }
3600         return dst;
3601      }
3602
3603      case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
3604      case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
3605      case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
3606      case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
3607      case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
3608      case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
3609      case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
3610      case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
3611      do_SseShift: {
3612         HReg      greg = iselVecExpr(env, e->Iex.Binop.arg1);
3613         X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3614         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3615         HReg      ereg = newVRegV(env);
3616         HReg      dst  = newVRegV(env);
3617         REQUIRE_SSE2;
3618         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3619         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3620         addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3621         addInstr(env, X86Instr_Push(rmi));
3622         addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
3623	 addInstr(env, mk_vMOVsd_RR(greg, dst));
3624         addInstr(env, X86Instr_SseReRg(op, ereg, dst));
3625         add_to_esp(env, 16);
3626         return dst;
3627      }
3628
3629      case Iop_NarrowBin32to16x8:
3630         fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3631         goto do_SseAssistedBinary;
3632      case Iop_NarrowBin16to8x16:
3633         fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3634         goto do_SseAssistedBinary;
3635      do_SseAssistedBinary: {
3636         /* As with the amd64 case (where this is copied from) we
3637            generate pretty bad code. */
3638         vassert(fn != 0);
3639         HReg dst = newVRegV(env);
3640         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3641         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3642         HReg argp = newVRegI(env);
3643         /* subl $112, %esp         -- make a space */
3644         sub_from_esp(env, 112);
3645         /* leal 48(%esp), %r_argp  -- point into it */
3646         addInstr(env, X86Instr_Lea32(X86AMode_IR(48, hregX86_ESP()),
3647                                      argp));
3648         /* andl $-16, %r_argp      -- 16-align the pointer */
3649         addInstr(env, X86Instr_Alu32R(Xalu_AND,
3650                                       X86RMI_Imm( ~(UInt)15 ),
3651                                       argp));
3652         /* Prepare 3 arg regs:
3653            leal  0(%r_argp), %eax
3654            leal 16(%r_argp), %edx
3655            leal 32(%r_argp), %ecx
3656         */
3657         addInstr(env, X86Instr_Lea32(X86AMode_IR(0, argp),
3658                                      hregX86_EAX()));
3659         addInstr(env, X86Instr_Lea32(X86AMode_IR(16, argp),
3660                                      hregX86_EDX()));
3661         addInstr(env, X86Instr_Lea32(X86AMode_IR(32, argp),
3662                                      hregX86_ECX()));
3663         /* Store the two args, at (%edx) and (%ecx):
3664            movupd  %argL, 0(%edx)
3665            movupd  %argR, 0(%ecx)
3666         */
3667         addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argL,
3668                                        X86AMode_IR(0, hregX86_EDX())));
3669         addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argR,
3670                                        X86AMode_IR(0, hregX86_ECX())));
3671         /* call the helper */
3672         addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn, 3 ));
3673         /* fetch the result from memory, using %r_argp, which the
3674            register allocator will keep alive across the call. */
3675         addInstr(env, X86Instr_SseLdSt(True/*isLoad*/, dst,
3676                                        X86AMode_IR(0, argp)));
3677         /* and finally, clear the space */
3678         add_to_esp(env, 112);
3679         return dst;
3680      }
3681
3682      default:
3683         break;
3684   } /* switch (e->Iex.Binop.op) */
3685   } /* if (e->tag == Iex_Binop) */
3686
3687   if (e->tag == Iex_Mux0X) {
3688      X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
3689      HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
3690      HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
3691      HReg dst = newVRegV(env);
3692      addInstr(env, mk_vMOVsd_RR(rX,dst));
3693      addInstr(env, X86Instr_Test32(0xFF, r8));
3694      addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst));
3695      return dst;
3696   }
3697
3698   vec_fail:
3699   vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
3700              LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
3701   ppIRExpr(e);
3702   vpanic("iselVecExpr_wrk");
3703
3704#  undef REQUIRE_SSE1
3705#  undef REQUIRE_SSE2
3706#  undef SSE2_OR_ABOVE
3707}
3708
3709
3710/*---------------------------------------------------------*/
3711/*--- ISEL: Statements                                  ---*/
3712/*---------------------------------------------------------*/
3713
3714static void iselStmt ( ISelEnv* env, IRStmt* stmt )
3715{
3716   if (vex_traceflags & VEX_TRACE_VCODE) {
3717      vex_printf("\n-- ");
3718      ppIRStmt(stmt);
3719      vex_printf("\n");
3720   }
3721
3722   switch (stmt->tag) {
3723
3724   /* --------- STORE --------- */
3725   case Ist_Store: {
3726      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
3727      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
3728      IREndness end   = stmt->Ist.Store.end;
3729
3730      if (tya != Ity_I32 || end != Iend_LE)
3731         goto stmt_fail;
3732
3733      if (tyd == Ity_I32) {
3734         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3735         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
3736         addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
3737         return;
3738      }
3739      if (tyd == Ity_I8 || tyd == Ity_I16) {
3740         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3741         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
3742         addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
3743                                       r,am ));
3744         return;
3745      }
3746      if (tyd == Ity_F64) {
3747         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3748         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
3749         addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
3750         return;
3751      }
3752      if (tyd == Ity_F32) {
3753         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3754         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
3755         addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
3756         return;
3757      }
3758      if (tyd == Ity_I64) {
3759         HReg vHi, vLo, rA;
3760         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
3761         rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
3762         addInstr(env, X86Instr_Alu32M(
3763                          Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
3764         addInstr(env, X86Instr_Alu32M(
3765                          Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
3766         return;
3767      }
3768      if (tyd == Ity_V128) {
3769         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3770         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
3771         addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
3772         return;
3773      }
3774      break;
3775   }
3776
3777   /* --------- PUT --------- */
3778   case Ist_Put: {
3779      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
3780      if (ty == Ity_I32) {
3781         /* We're going to write to memory, so compute the RHS into an
3782            X86RI. */
3783         X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
3784         addInstr(env,
3785                  X86Instr_Alu32M(
3786                     Xalu_MOV,
3787                     ri,
3788                     X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
3789                 ));
3790         return;
3791      }
3792      if (ty == Ity_I8 || ty == Ity_I16) {
3793         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
3794         addInstr(env, X86Instr_Store(
3795                          toUChar(ty==Ity_I8 ? 1 : 2),
3796                          r,
3797                          X86AMode_IR(stmt->Ist.Put.offset,
3798                                      hregX86_EBP())));
3799         return;
3800      }
3801      if (ty == Ity_I64) {
3802         HReg vHi, vLo;
3803         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3804         X86AMode* am4 = advance4(am);
3805         iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
3806         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
3807         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
3808         return;
3809      }
3810      if (ty == Ity_V128) {
3811         HReg      vec = iselVecExpr(env, stmt->Ist.Put.data);
3812         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3813         addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
3814         return;
3815      }
3816      if (ty == Ity_F32) {
3817         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
3818         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3819         set_FPU_rounding_default(env); /* paranoia */
3820         addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
3821         return;
3822      }
3823      if (ty == Ity_F64) {
3824         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
3825         X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
3826         set_FPU_rounding_default(env); /* paranoia */
3827         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
3828         return;
3829      }
3830      break;
3831   }
3832
3833   /* --------- Indexed PUT --------- */
3834   case Ist_PutI: {
3835      IRPutI *puti = stmt->Ist.PutI.details;
3836
3837      X86AMode* am
3838         = genGuestArrayOffset(
3839              env, puti->descr,
3840                   puti->ix, puti->bias );
3841
3842      IRType ty = typeOfIRExpr(env->type_env, puti->data);
3843      if (ty == Ity_F64) {
3844         HReg val = iselDblExpr(env, puti->data);
3845         addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
3846         return;
3847      }
3848      if (ty == Ity_I8) {
3849         HReg r = iselIntExpr_R(env, puti->data);
3850         addInstr(env, X86Instr_Store( 1, r, am ));
3851         return;
3852      }
3853      if (ty == Ity_I32) {
3854         HReg r = iselIntExpr_R(env, puti->data);
3855         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
3856         return;
3857      }
3858      if (ty == Ity_I64) {
3859         HReg rHi, rLo;
3860         X86AMode* am4 = advance4(am);
3861         iselInt64Expr(&rHi, &rLo, env, puti->data);
3862         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
3863         addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
3864         return;
3865      }
3866      break;
3867   }
3868
3869   /* --------- TMP --------- */
3870   case Ist_WrTmp: {
3871      IRTemp tmp = stmt->Ist.WrTmp.tmp;
3872      IRType ty = typeOfIRTemp(env->type_env, tmp);
3873
3874      /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
3875         compute it into an AMode and then use LEA.  This usually
3876         produces fewer instructions, often because (for memcheck
3877         created IR) we get t = address-expression, (t is later used
3878         twice) and so doing this naturally turns address-expression
3879         back into an X86 amode. */
3880      if (ty == Ity_I32
3881          && stmt->Ist.WrTmp.data->tag == Iex_Binop
3882          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
3883         X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
3884         HReg dst = lookupIRTemp(env, tmp);
3885         if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
3886            /* Hmm, iselIntExpr_AMode wimped out and just computed the
3887               value into a register.  Just emit a normal reg-reg move
3888               so reg-alloc can coalesce it away in the usual way. */
3889            HReg src = am->Xam.IR.reg;
3890            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
3891         } else {
3892            addInstr(env, X86Instr_Lea32(am,dst));
3893         }
3894         return;
3895      }
3896
3897      if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
3898         X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
3899         HReg dst = lookupIRTemp(env, tmp);
3900         addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
3901         return;
3902      }
3903      if (ty == Ity_I64) {
3904         HReg rHi, rLo, dstHi, dstLo;
3905         iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
3906         lookupIRTemp64( &dstHi, &dstLo, env, tmp);
3907         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
3908         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
3909         return;
3910      }
3911      if (ty == Ity_I1) {
3912         X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
3913         HReg dst = lookupIRTemp(env, tmp);
3914         addInstr(env, X86Instr_Set32(cond, dst));
3915         return;
3916      }
3917      if (ty == Ity_F64) {
3918         HReg dst = lookupIRTemp(env, tmp);
3919         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
3920         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
3921         return;
3922      }
3923      if (ty == Ity_F32) {
3924         HReg dst = lookupIRTemp(env, tmp);
3925         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
3926         addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
3927         return;
3928      }
3929      if (ty == Ity_V128) {
3930         HReg dst = lookupIRTemp(env, tmp);
3931         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
3932         addInstr(env, mk_vMOVsd_RR(src,dst));
3933         return;
3934      }
3935      break;
3936   }
3937
3938   /* --------- Call to DIRTY helper --------- */
3939   case Ist_Dirty: {
3940      IRType   retty;
3941      IRDirty* d = stmt->Ist.Dirty.details;
3942      Bool     passBBP = False;
3943
3944      if (d->nFxState == 0)
3945         vassert(!d->needsBBP);
3946
3947      passBBP = toBool(d->nFxState > 0 && d->needsBBP);
3948
3949      /* Marshal args, do the call, clear stack. */
3950      doHelperCall( env, passBBP, d->guard, d->cee, d->args );
3951
3952      /* Now figure out what to do with the returned value, if any. */
3953      if (d->tmp == IRTemp_INVALID)
3954         /* No return value.  Nothing to do. */
3955         return;
3956
3957      retty = typeOfIRTemp(env->type_env, d->tmp);
3958      if (retty == Ity_I64) {
3959         HReg dstHi, dstLo;
3960         /* The returned value is in %edx:%eax.  Park it in the
3961            register-pair associated with tmp. */
3962         lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
3963         addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
3964         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
3965         return;
3966      }
3967      if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
3968         /* The returned value is in %eax.  Park it in the register
3969            associated with tmp. */
3970         HReg dst = lookupIRTemp(env, d->tmp);
3971         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
3972         return;
3973      }
3974      break;
3975   }
3976
3977   /* --------- MEM FENCE --------- */
3978   case Ist_MBE:
3979      switch (stmt->Ist.MBE.event) {
3980         case Imbe_Fence:
3981            addInstr(env, X86Instr_MFence(env->hwcaps));
3982            return;
3983         default:
3984            break;
3985      }
3986      break;
3987
3988   /* --------- ACAS --------- */
3989   case Ist_CAS:
3990      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
3991         /* "normal" singleton CAS */
3992         UChar  sz;
3993         IRCAS* cas = stmt->Ist.CAS.details;
3994         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
3995         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
3996         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
3997         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
3998         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
3999         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4000         vassert(cas->expdHi == NULL);
4001         vassert(cas->dataHi == NULL);
4002         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4003         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4004         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4005         switch (ty) {
4006            case Ity_I32: sz = 4; break;
4007            case Ity_I16: sz = 2; break;
4008            case Ity_I8:  sz = 1; break;
4009            default: goto unhandled_cas;
4010         }
4011         addInstr(env, X86Instr_ACAS(am, sz));
4012         addInstr(env,
4013                  X86Instr_CMov32(Xcc_NZ,
4014                                  X86RM_Reg(hregX86_EAX()), rOldLo));
4015         return;
4016      } else {
4017         /* double CAS */
4018         IRCAS* cas = stmt->Ist.CAS.details;
4019         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4020         /* only 32-bit allowed in this case */
4021         /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
4022         /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
4023         X86AMode* am = iselIntExpr_AMode(env, cas->addr);
4024         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4025         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4026         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4027         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4028         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4029         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4030         if (ty != Ity_I32)
4031            goto unhandled_cas;
4032         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4033         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4034         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
4035         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4036         addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
4037         addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4038         addInstr(env, X86Instr_DACAS(am));
4039         addInstr(env,
4040                  X86Instr_CMov32(Xcc_NZ,
4041                                  X86RM_Reg(hregX86_EDX()), rOldHi));
4042         addInstr(env,
4043                  X86Instr_CMov32(Xcc_NZ,
4044                                  X86RM_Reg(hregX86_EAX()), rOldLo));
4045         return;
4046      }
4047      unhandled_cas:
4048      break;
4049
4050   /* --------- INSTR MARK --------- */
4051   /* Doesn't generate any executable code ... */
4052   case Ist_IMark:
4053       return;
4054
4055   /* --------- NO-OP --------- */
4056   /* Fairly self-explanatory, wouldn't you say? */
4057   case Ist_NoOp:
4058       return;
4059
4060   /* --------- EXIT --------- */
4061   case Ist_Exit: {
4062      if (stmt->Ist.Exit.dst->tag != Ico_U32)
4063         vpanic("iselStmt(x86): Ist_Exit: dst is not a 32-bit value");
4064
4065      X86CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
4066      X86AMode*   amEIP = X86AMode_IR(stmt->Ist.Exit.offsIP,
4067                                      hregX86_EBP());
4068
4069      /* Case: boring transfer to known address */
4070      if (stmt->Ist.Exit.jk == Ijk_Boring) {
4071         if (env->chainingAllowed) {
4072            /* .. almost always true .. */
4073            /* Skip the event check at the dst if this is a forwards
4074               edge. */
4075            Bool toFastEP
4076               = ((Addr32)stmt->Ist.Exit.dst->Ico.U32) > env->max_ga;
4077            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4078            addInstr(env, X86Instr_XDirect(stmt->Ist.Exit.dst->Ico.U32,
4079                                           amEIP, cc, toFastEP));
4080         } else {
4081            /* .. very occasionally .. */
4082            /* We can't use chaining, so ask for an assisted transfer,
4083               as that's the only alternative that is allowable. */
4084            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4085            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, Ijk_Boring));
4086         }
4087         return;
4088      }
4089
4090      /* Case: assisted transfer to arbitrary address */
4091      switch (stmt->Ist.Exit.jk) {
4092         /* Keep this list in sync with that in iselNext below */
4093         case Ijk_ClientReq:
4094         case Ijk_EmWarn:
4095         case Ijk_MapFail:
4096         case Ijk_NoDecode:
4097         case Ijk_NoRedir:
4098         case Ijk_SigSEGV:
4099         case Ijk_SigTRAP:
4100         case Ijk_Sys_int128:
4101         case Ijk_Sys_int129:
4102         case Ijk_Sys_int130:
4103         case Ijk_Sys_sysenter:
4104         case Ijk_TInval:
4105         case Ijk_Yield:
4106         {
4107            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4108            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, stmt->Ist.Exit.jk));
4109            return;
4110         }
4111         default:
4112            break;
4113      }
4114
4115      /* Do we ever expect to see any other kind? */
4116      goto stmt_fail;
4117   }
4118
4119   default: break;
4120   }
4121  stmt_fail:
4122   ppIRStmt(stmt);
4123   vpanic("iselStmt");
4124}
4125
4126
4127/*---------------------------------------------------------*/
4128/*--- ISEL: Basic block terminators (Nexts)             ---*/
4129/*---------------------------------------------------------*/
4130
4131static void iselNext ( ISelEnv* env,
4132                       IRExpr* next, IRJumpKind jk, Int offsIP )
4133{
4134   if (vex_traceflags & VEX_TRACE_VCODE) {
4135      vex_printf( "\n-- PUT(%d) = ", offsIP);
4136      ppIRExpr( next );
4137      vex_printf( "; exit-");
4138      ppIRJumpKind(jk);
4139      vex_printf( "\n");
4140   }
4141
4142   /* Case: boring transfer to known address */
4143   if (next->tag == Iex_Const) {
4144      IRConst* cdst = next->Iex.Const.con;
4145      vassert(cdst->tag == Ico_U32);
4146      if (jk == Ijk_Boring || jk == Ijk_Call) {
4147         /* Boring transfer to known address */
4148         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4149         if (env->chainingAllowed) {
4150            /* .. almost always true .. */
4151            /* Skip the event check at the dst if this is a forwards
4152               edge. */
4153            Bool toFastEP
4154               = ((Addr64)cdst->Ico.U32) > env->max_ga;
4155            if (0) vex_printf("%s", toFastEP ? "X" : ".");
4156            addInstr(env, X86Instr_XDirect(cdst->Ico.U32,
4157                                           amEIP, Xcc_ALWAYS,
4158                                           toFastEP));
4159         } else {
4160            /* .. very occasionally .. */
4161            /* We can't use chaining, so ask for an assisted transfer,
4162               as that's the only alternative that is allowable. */
4163            HReg r = iselIntExpr_R(env, next);
4164            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4165                                             Ijk_Boring));
4166         }
4167         return;
4168      }
4169   }
4170
4171   /* Case: call/return (==boring) transfer to any address */
4172   switch (jk) {
4173      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4174         HReg      r     = iselIntExpr_R(env, next);
4175         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4176         if (env->chainingAllowed) {
4177            addInstr(env, X86Instr_XIndir(r, amEIP, Xcc_ALWAYS));
4178         } else {
4179            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4180                                               Ijk_Boring));
4181         }
4182         return;
4183      }
4184      default:
4185         break;
4186   }
4187
4188   /* Case: assisted transfer to arbitrary address */
4189   switch (jk) {
4190      /* Keep this list in sync with that for Ist_Exit above */
4191      case Ijk_ClientReq:
4192      case Ijk_EmWarn:
4193      case Ijk_MapFail:
4194      case Ijk_NoDecode:
4195      case Ijk_NoRedir:
4196      case Ijk_SigSEGV:
4197      case Ijk_SigTRAP:
4198      case Ijk_Sys_int128:
4199      case Ijk_Sys_int129:
4200      case Ijk_Sys_int130:
4201      case Ijk_Sys_sysenter:
4202      case Ijk_TInval:
4203      case Ijk_Yield:
4204      {
4205         HReg      r     = iselIntExpr_R(env, next);
4206         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4207         addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS, jk));
4208         return;
4209      }
4210      default:
4211         break;
4212   }
4213
4214   vex_printf( "\n-- PUT(%d) = ", offsIP);
4215   ppIRExpr( next );
4216   vex_printf( "; exit-");
4217   ppIRJumpKind(jk);
4218   vex_printf( "\n");
4219   vassert(0); // are we expecting any other kind?
4220}
4221
4222
4223/*---------------------------------------------------------*/
4224/*--- Insn selector top-level                           ---*/
4225/*---------------------------------------------------------*/
4226
4227/* Translate an entire SB to x86 code. */
4228
4229HInstrArray* iselSB_X86 ( IRSB* bb,
4230                          VexArch      arch_host,
4231                          VexArchInfo* archinfo_host,
4232                          VexAbiInfo*  vbi/*UNUSED*/,
4233                          Int offs_Host_EvC_Counter,
4234                          Int offs_Host_EvC_FailAddr,
4235                          Bool chainingAllowed,
4236                          Bool addProfInc,
4237                          Addr64 max_ga )
4238{
4239   Int      i, j;
4240   HReg     hreg, hregHI;
4241   ISelEnv* env;
4242   UInt     hwcaps_host = archinfo_host->hwcaps;
4243   X86AMode *amCounter, *amFailAddr;
4244
4245   /* sanity ... */
4246   vassert(arch_host == VexArchX86);
4247   vassert(0 == (hwcaps_host
4248                 & ~(VEX_HWCAPS_X86_SSE1
4249                     | VEX_HWCAPS_X86_SSE2
4250                     | VEX_HWCAPS_X86_SSE3
4251                     | VEX_HWCAPS_X86_LZCNT)));
4252   vassert(sizeof(max_ga) == 8);
4253   vassert((max_ga >> 32) == 0);
4254
4255   /* Make up an initial environment to use. */
4256   env = LibVEX_Alloc(sizeof(ISelEnv));
4257   env->vreg_ctr = 0;
4258
4259   /* Set up output code array. */
4260   env->code = newHInstrArray();
4261
4262   /* Copy BB's type env. */
4263   env->type_env = bb->tyenv;
4264
4265   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
4266      change as we go along. */
4267   env->n_vregmap = bb->tyenv->types_used;
4268   env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4269   env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4270
4271   /* and finally ... */
4272   env->chainingAllowed = chainingAllowed;
4273   env->hwcaps          = hwcaps_host;
4274   env->max_ga          = max_ga;
4275
4276   /* For each IR temporary, allocate a suitably-kinded virtual
4277      register. */
4278   j = 0;
4279   for (i = 0; i < env->n_vregmap; i++) {
4280      hregHI = hreg = INVALID_HREG;
4281      switch (bb->tyenv->types[i]) {
4282         case Ity_I1:
4283         case Ity_I8:
4284         case Ity_I16:
4285         case Ity_I32:  hreg   = mkHReg(j++, HRcInt32, True); break;
4286         case Ity_I64:  hreg   = mkHReg(j++, HRcInt32, True);
4287                        hregHI = mkHReg(j++, HRcInt32, True); break;
4288         case Ity_F32:
4289         case Ity_F64:  hreg   = mkHReg(j++, HRcFlt64, True); break;
4290         case Ity_V128: hreg   = mkHReg(j++, HRcVec128, True); break;
4291         default: ppIRType(bb->tyenv->types[i]);
4292                  vpanic("iselBB: IRTemp type");
4293      }
4294      env->vregmap[i]   = hreg;
4295      env->vregmapHI[i] = hregHI;
4296   }
4297   env->vreg_ctr = j;
4298
4299   /* The very first instruction must be an event check. */
4300   amCounter  = X86AMode_IR(offs_Host_EvC_Counter,  hregX86_EBP());
4301   amFailAddr = X86AMode_IR(offs_Host_EvC_FailAddr, hregX86_EBP());
4302   addInstr(env, X86Instr_EvCheck(amCounter, amFailAddr));
4303
4304   /* Possibly a block counter increment (for profiling).  At this
4305      point we don't know the address of the counter, so just pretend
4306      it is zero.  It will have to be patched later, but before this
4307      translation is used, by a call to LibVEX_patchProfCtr. */
4308   if (addProfInc) {
4309      addInstr(env, X86Instr_ProfInc());
4310   }
4311
4312   /* Ok, finally we can iterate over the statements. */
4313   for (i = 0; i < bb->stmts_used; i++)
4314      iselStmt(env, bb->stmts[i]);
4315
4316   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
4317
4318   /* record the number of vregs we used. */
4319   env->code->n_vregs = env->vreg_ctr;
4320   return env->code;
4321}
4322
4323
4324/*---------------------------------------------------------------*/
4325/*--- end                                     host_x86_isel.c ---*/
4326/*---------------------------------------------------------------*/
4327