1
2/*---------------------------------------------------------------*/
3/*--- begin                                 host_amd64_isel.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2011 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_ir.h"
38#include "libvex.h"
39
40#include "ir_match.h"
41#include "main_util.h"
42#include "main_globals.h"
43#include "host_generic_regs.h"
44#include "host_generic_simd64.h"
45#include "host_generic_simd128.h"
46#include "host_amd64_defs.h"
47
48
49/*---------------------------------------------------------*/
50/*--- x87/SSE control word stuff                        ---*/
51/*---------------------------------------------------------*/
52
53/* Vex-generated code expects to run with the FPU set as follows: all
54   exceptions masked, round-to-nearest, precision = 53 bits.  This
55   corresponds to a FPU control word value of 0x027F.
56
57   Similarly the SSE control word (%mxcsr) should be 0x1F80.
58
59   %fpucw and %mxcsr should have these values on entry to
60   Vex-generated code, and should those values should be
61   unchanged at exit.
62*/
63
64#define DEFAULT_FPUCW 0x027F
65
66#define DEFAULT_MXCSR 0x1F80
67
68/* debugging only, do not use */
69/* define DEFAULT_FPUCW 0x037F */
70
71
72/*---------------------------------------------------------*/
73/*--- misc helpers                                      ---*/
74/*---------------------------------------------------------*/
75
76/* These are duplicated in guest-amd64/toIR.c */
77static IRExpr* unop ( IROp op, IRExpr* a )
78{
79   return IRExpr_Unop(op, a);
80}
81
82static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
83{
84   return IRExpr_Binop(op, a1, a2);
85}
86
87static IRExpr* bind ( Int binder )
88{
89   return IRExpr_Binder(binder);
90}
91
92
93/*---------------------------------------------------------*/
94/*--- ISelEnv                                           ---*/
95/*---------------------------------------------------------*/
96
97/* This carries around:
98
99   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
100     might encounter.  This is computed before insn selection starts,
101     and does not change.
102
103   - A mapping from IRTemp to HReg.  This tells the insn selector
104     which virtual register is associated with each IRTemp
105     temporary.  This is computed before insn selection starts, and
106     does not change.  We expect this mapping to map precisely the
107     same set of IRTemps as the type mapping does.
108
109        - vregmap   holds the primary register for the IRTemp.
110        - vregmapHI is only used for 128-bit integer-typed
111             IRTemps.  It holds the identity of a second
112             64-bit virtual HReg, which holds the high half
113             of the value.
114
115   - The code array, that is, the insns selected so far.
116
117   - A counter, for generating new virtual registers.
118
119   - The host subarchitecture we are selecting insns for.
120     This is set at the start and does not change.
121
122   Note, this is all host-independent.  (JRS 20050201: well, kinda
123   ... not completely.  Compare with ISelEnv for X86.)
124*/
125
126typedef
127   struct {
128      IRTypeEnv*   type_env;
129
130      HReg*        vregmap;
131      HReg*        vregmapHI;
132      Int          n_vregmap;
133
134      HInstrArray* code;
135
136      Int          vreg_ctr;
137
138      UInt         hwcaps;
139   }
140   ISelEnv;
141
142
143static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
144{
145   vassert(tmp >= 0);
146   vassert(tmp < env->n_vregmap);
147   return env->vregmap[tmp];
148}
149
150static void lookupIRTemp128 ( HReg* vrHI, HReg* vrLO,
151                              ISelEnv* env, IRTemp tmp )
152{
153   vassert(tmp >= 0);
154   vassert(tmp < env->n_vregmap);
155   vassert(env->vregmapHI[tmp] != INVALID_HREG);
156   *vrLO = env->vregmap[tmp];
157   *vrHI = env->vregmapHI[tmp];
158}
159
160static void addInstr ( ISelEnv* env, AMD64Instr* instr )
161{
162   addHInstr(env->code, instr);
163   if (vex_traceflags & VEX_TRACE_VCODE) {
164      ppAMD64Instr(instr, True);
165      vex_printf("\n");
166   }
167}
168
169static HReg newVRegI ( ISelEnv* env )
170{
171   HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/);
172   env->vreg_ctr++;
173   return reg;
174}
175
176//.. static HReg newVRegF ( ISelEnv* env )
177//.. {
178//..    HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
179//..    env->vreg_ctr++;
180//..    return reg;
181//.. }
182
183static HReg newVRegV ( ISelEnv* env )
184{
185   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
186   env->vreg_ctr++;
187   return reg;
188}
189
190
191/*---------------------------------------------------------*/
192/*--- ISEL: Forward declarations                        ---*/
193/*---------------------------------------------------------*/
194
195/* These are organised as iselXXX and iselXXX_wrk pairs.  The
196   iselXXX_wrk do the real work, but are not to be called directly.
197   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
198   checks that all returned registers are virtual.  You should not
199   call the _wrk version directly.
200*/
201static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
202static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
203
204static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, IRExpr* e );
205static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, IRExpr* e );
206
207static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, IRExpr* e );
208static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, IRExpr* e );
209
210static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, IRExpr* e );
211static HReg          iselIntExpr_R       ( ISelEnv* env, IRExpr* e );
212
213static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
214static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
215
216static void          iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
217                                          ISelEnv* env, IRExpr* e );
218static void          iselInt128Expr     ( HReg* rHi, HReg* rLo,
219                                          ISelEnv* env, IRExpr* e );
220
221static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, IRExpr* e );
222static AMD64CondCode iselCondCode        ( ISelEnv* env, IRExpr* e );
223
224static HReg          iselDblExpr_wrk     ( ISelEnv* env, IRExpr* e );
225static HReg          iselDblExpr         ( ISelEnv* env, IRExpr* e );
226
227static HReg          iselFltExpr_wrk     ( ISelEnv* env, IRExpr* e );
228static HReg          iselFltExpr         ( ISelEnv* env, IRExpr* e );
229
230static HReg          iselVecExpr_wrk     ( ISelEnv* env, IRExpr* e );
231static HReg          iselVecExpr         ( ISelEnv* env, IRExpr* e );
232
233
234/*---------------------------------------------------------*/
235/*--- ISEL: Misc helpers                                ---*/
236/*---------------------------------------------------------*/
237
238static Bool sane_AMode ( AMD64AMode* am )
239{
240   switch (am->tag) {
241      case Aam_IR:
242         return
243            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
244                    && (hregIsVirtual(am->Aam.IR.reg)
245                        || am->Aam.IR.reg == hregAMD64_RBP()) );
246      case Aam_IRRS:
247         return
248            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
249                    && hregIsVirtual(am->Aam.IRRS.base)
250                    && hregClass(am->Aam.IRRS.index) == HRcInt64
251                    && hregIsVirtual(am->Aam.IRRS.index) );
252      default:
253        vpanic("sane_AMode: unknown amd64 amode tag");
254   }
255}
256
257
258/* Can the lower 32 bits be signedly widened to produce the whole
259   64-bit value?  In other words, are the top 33 bits either all 0 or
260   all 1 ? */
261static Bool fitsIn32Bits ( ULong x )
262{
263   Long y0 = (Long)x;
264   Long y1 = y0;
265   y1 <<= 32;
266   y1 >>=/*s*/ 32;
267   return toBool(x == y1);
268}
269
270/* Is this a 64-bit zero expression? */
271
272static Bool isZeroU64 ( IRExpr* e )
273{
274   return e->tag == Iex_Const
275          && e->Iex.Const.con->tag == Ico_U64
276          && e->Iex.Const.con->Ico.U64 == 0ULL;
277}
278
279static Bool isZeroU32 ( IRExpr* e )
280{
281   return e->tag == Iex_Const
282          && e->Iex.Const.con->tag == Ico_U32
283          && e->Iex.Const.con->Ico.U32 == 0;
284}
285
286/* Make a int reg-reg move. */
287
288static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
289{
290   vassert(hregClass(src) == HRcInt64);
291   vassert(hregClass(dst) == HRcInt64);
292   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
293}
294
295/* Make a vector reg-reg move. */
296
297static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
298{
299   vassert(hregClass(src) == HRcVec128);
300   vassert(hregClass(dst) == HRcVec128);
301   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
302}
303
304/* Advance/retreat %rsp by n. */
305
306static void add_to_rsp ( ISelEnv* env, Int n )
307{
308   vassert(n > 0 && n < 256 && (n%8) == 0);
309   addInstr(env,
310            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
311                                        hregAMD64_RSP()));
312}
313
314static void sub_from_rsp ( ISelEnv* env, Int n )
315{
316   vassert(n > 0 && n < 256 && (n%8) == 0);
317   addInstr(env,
318            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
319                                        hregAMD64_RSP()));
320}
321
322/* Push 64-bit constants on the stack. */
323static void push_uimm64( ISelEnv* env, ULong uimm64 )
324{
325   /* If uimm64 can be expressed as the sign extension of its
326      lower 32 bits, we can do it the easy way. */
327   Long simm64 = (Long)uimm64;
328   if ( simm64 == ((simm64 << 32) >> 32) ) {
329      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
330   } else {
331      HReg tmp = newVRegI(env);
332      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
333      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
334   }
335}
336
337//.. /* Given an amode, return one which references 4 bytes further
338//..    along. */
339//..
340//.. static X86AMode* advance4 ( X86AMode* am )
341//.. {
342//..    X86AMode* am4 = dopyX86AMode(am);
343//..    switch (am4->tag) {
344//..       case Xam_IRRS:
345//..          am4->Xam.IRRS.imm += 4; break;
346//..       case Xam_IR:
347//..          am4->Xam.IR.imm += 4; break;
348//..       default:
349//..          vpanic("advance4(x86,host)");
350//..    }
351//..    return am4;
352//.. }
353//..
354//..
355//.. /* Push an arg onto the host stack, in preparation for a call to a
356//..    helper function of some kind.  Returns the number of 32-bit words
357//..    pushed. */
358//..
359//.. static Int pushArg ( ISelEnv* env, IRExpr* arg )
360//.. {
361//..    IRType arg_ty = typeOfIRExpr(env->type_env, arg);
362//..    if (arg_ty == Ity_I32) {
363//..       addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
364//..       return 1;
365//..    } else
366//..    if (arg_ty == Ity_I64) {
367//..       HReg rHi, rLo;
368//..       iselInt64Expr(&rHi, &rLo, env, arg);
369//..       addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
370//..       addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
371//..       return 2;
372//..    }
373//..    ppIRExpr(arg);
374//..    vpanic("pushArg(x86): can't handle arg of this type");
375//.. }
376
377
378/* Used only in doHelperCall.  If possible, produce a single
379   instruction which computes 'e' into 'dst'.  If not possible, return
380   NULL. */
381
382static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
383                                                    HReg     dst,
384                                                    IRExpr*  e )
385{
386   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
387
388   if (e->tag == Iex_Const) {
389      vassert(e->Iex.Const.con->tag == Ico_U64);
390      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
391         return AMD64Instr_Alu64R(
392                   Aalu_MOV,
393                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
394                   dst
395                );
396      } else {
397         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
398      }
399   }
400
401   if (e->tag == Iex_RdTmp) {
402      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
403      return mk_iMOVsd_RR(src, dst);
404   }
405
406   if (e->tag == Iex_Get) {
407      vassert(e->Iex.Get.ty == Ity_I64);
408      return AMD64Instr_Alu64R(
409                Aalu_MOV,
410                AMD64RMI_Mem(
411                   AMD64AMode_IR(e->Iex.Get.offset,
412                                 hregAMD64_RBP())),
413                dst);
414   }
415
416   if (e->tag == Iex_Unop
417       && e->Iex.Unop.op == Iop_32Uto64
418       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
419      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
420      return AMD64Instr_MovxLQ(False, src, dst);
421   }
422
423   if (0) { ppIRExpr(e); vex_printf("\n"); }
424
425   return NULL;
426}
427
428
429/* Do a complete function call.  guard is a Ity_Bit expression
430   indicating whether or not the call happens.  If guard==NULL, the
431   call is unconditional. */
432
433static
434void doHelperCall ( ISelEnv* env,
435                    Bool passBBP,
436                    IRExpr* guard, IRCallee* cee, IRExpr** args )
437{
438   AMD64CondCode cc;
439   HReg          argregs[6];
440   HReg          tmpregs[6];
441   AMD64Instr*   fastinstrs[6];
442   Int           n_args, i, argreg;
443
444   /* Marshal args for a call and do the call.
445
446      If passBBP is True, %rbp (the baseblock pointer) is to be passed
447      as the first arg.
448
449      This function only deals with a tiny set of possibilities, which
450      cover all helpers in practice.  The restrictions are that only
451      arguments in registers are supported, hence only 6x64 integer
452      bits in total can be passed.  In fact the only supported arg
453      type is I64.
454
455      Generating code which is both efficient and correct when
456      parameters are to be passed in registers is difficult, for the
457      reasons elaborated in detail in comments attached to
458      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
459      of the method described in those comments.
460
461      The problem is split into two cases: the fast scheme and the
462      slow scheme.  In the fast scheme, arguments are computed
463      directly into the target (real) registers.  This is only safe
464      when we can be sure that computation of each argument will not
465      trash any real registers set by computation of any other
466      argument.
467
468      In the slow scheme, all args are first computed into vregs, and
469      once they are all done, they are moved to the relevant real
470      regs.  This always gives correct code, but it also gives a bunch
471      of vreg-to-rreg moves which are usually redundant but are hard
472      for the register allocator to get rid of.
473
474      To decide which scheme to use, all argument expressions are
475      first examined.  If they are all so simple that it is clear they
476      will be evaluated without use of any fixed registers, use the
477      fast scheme, else use the slow scheme.  Note also that only
478      unconditional calls may use the fast scheme, since having to
479      compute a condition expression could itself trash real
480      registers.
481
482      Note this requires being able to examine an expression and
483      determine whether or not evaluation of it might use a fixed
484      register.  That requires knowledge of how the rest of this insn
485      selector works.  Currently just the following 3 are regarded as
486      safe -- hopefully they cover the majority of arguments in
487      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
488   */
489
490   /* Note that the cee->regparms field is meaningless on AMD64 host
491      (since there is only one calling convention) and so we always
492      ignore it. */
493
494   n_args = 0;
495   for (i = 0; args[i]; i++)
496      n_args++;
497
498   if (6 < n_args + (passBBP ? 1 : 0))
499      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
500
501   argregs[0] = hregAMD64_RDI();
502   argregs[1] = hregAMD64_RSI();
503   argregs[2] = hregAMD64_RDX();
504   argregs[3] = hregAMD64_RCX();
505   argregs[4] = hregAMD64_R8();
506   argregs[5] = hregAMD64_R9();
507
508   tmpregs[0] = tmpregs[1] = tmpregs[2] =
509   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
510
511   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
512   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
513
514   /* First decide which scheme (slow or fast) is to be used.  First
515      assume the fast scheme, and select slow if any contraindications
516      (wow) appear. */
517
518   if (guard) {
519      if (guard->tag == Iex_Const
520          && guard->Iex.Const.con->tag == Ico_U1
521          && guard->Iex.Const.con->Ico.U1 == True) {
522         /* unconditional */
523      } else {
524         /* Not manifestly unconditional -- be conservative. */
525         goto slowscheme;
526      }
527   }
528
529   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
530      use the slow scheme.  Because this is tentative, we can't call
531      addInstr (that is, commit to) any instructions until we're
532      handled all the arguments.  So park the resulting instructions
533      in a buffer and emit that if we're successful. */
534
535   /* FAST SCHEME */
536   argreg = 0;
537   if (passBBP) {
538      fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]);
539      argreg++;
540   }
541
542   for (i = 0; i < n_args; i++) {
543      vassert(argreg < 6);
544      vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
545      fastinstrs[argreg]
546         = iselIntExpr_single_instruction( env, argregs[argreg], args[i] );
547      if (fastinstrs[argreg] == NULL)
548         goto slowscheme;
549      argreg++;
550   }
551
552   /* Looks like we're in luck.  Emit the accumulated instructions and
553      move on to doing the call itself. */
554   vassert(argreg <= 6);
555   for (i = 0; i < argreg; i++)
556      addInstr(env, fastinstrs[i]);
557
558   /* Fast scheme only applies for unconditional calls.  Hence: */
559   cc = Acc_ALWAYS;
560
561   goto handle_call;
562
563
564   /* SLOW SCHEME; move via temporaries */
565  slowscheme:
566#if 0
567if (n_args > 0) {for (i = 0; args[i]; i++) {
568ppIRExpr(args[i]); vex_printf(" "); }
569vex_printf("\n");}
570#endif
571   argreg = 0;
572
573   if (passBBP) {
574      /* This is pretty stupid; better to move directly to rdi
575         after the rest of the args are done. */
576      tmpregs[argreg] = newVRegI(env);
577      addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg]));
578      argreg++;
579   }
580
581   for (i = 0; i < n_args; i++) {
582      vassert(argreg < 6);
583      vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
584      tmpregs[argreg] = iselIntExpr_R(env, args[i]);
585      argreg++;
586   }
587
588   /* Now we can compute the condition.  We can't do it earlier
589      because the argument computations could trash the condition
590      codes.  Be a bit clever to handle the common case where the
591      guard is 1:Bit. */
592   cc = Acc_ALWAYS;
593   if (guard) {
594      if (guard->tag == Iex_Const
595          && guard->Iex.Const.con->tag == Ico_U1
596          && guard->Iex.Const.con->Ico.U1 == True) {
597         /* unconditional -- do nothing */
598      } else {
599         cc = iselCondCode( env, guard );
600      }
601   }
602
603   /* Move the args to their final destinations. */
604   for (i = 0; i < argreg; i++) {
605      /* None of these insns, including any spill code that might
606         be generated, may alter the condition codes. */
607      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
608   }
609
610
611   /* Finally, the call itself. */
612  handle_call:
613   addInstr(env, AMD64Instr_Call(
614                    cc,
615                    Ptr_to_ULong(cee->addr),
616                    n_args + (passBBP ? 1 : 0)
617                 )
618   );
619}
620
621
622/* Given a guest-state array descriptor, an index expression and a
623   bias, generate an AMD64AMode holding the relevant guest state
624   offset. */
625
626static
627AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
628                                  IRExpr* off, Int bias )
629{
630   HReg tmp, roff;
631   Int  elemSz = sizeofIRType(descr->elemTy);
632   Int  nElems = descr->nElems;
633
634   /* Throw out any cases not generated by an amd64 front end.  In
635      theory there might be a day where we need to handle them -- if
636      we ever run non-amd64-guest on amd64 host. */
637
638   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
639      vpanic("genGuestArrayOffset(amd64 host)");
640
641   /* Compute off into a reg, %off.  Then return:
642
643         movq %off, %tmp
644         addq $bias, %tmp  (if bias != 0)
645         andq %tmp, 7
646         ... base(%rbp, %tmp, shift) ...
647   */
648   tmp  = newVRegI(env);
649   roff = iselIntExpr_R(env, off);
650   addInstr(env, mk_iMOVsd_RR(roff, tmp));
651   if (bias != 0) {
652      /* Make sure the bias is sane, in the sense that there are
653         no significant bits above bit 30 in it. */
654      vassert(-10000 < bias && bias < 10000);
655      addInstr(env,
656               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
657   }
658   addInstr(env,
659            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
660   vassert(elemSz == 1 || elemSz == 8);
661   return
662      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
663                                    elemSz==8 ? 3 : 0);
664}
665
666
667/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
668static
669void set_SSE_rounding_default ( ISelEnv* env )
670{
671   /* pushq $DEFAULT_MXCSR
672      ldmxcsr 0(%rsp)
673      addq $8, %rsp
674   */
675   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
676   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
677   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
678   add_to_rsp(env, 8);
679}
680
681/* Mess with the FPU's rounding mode: set to the default rounding mode
682   (DEFAULT_FPUCW). */
683static
684void set_FPU_rounding_default ( ISelEnv* env )
685{
686   /* movq $DEFAULT_FPUCW, -8(%rsp)
687      fldcw -8(%esp)
688   */
689   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
690   addInstr(env, AMD64Instr_Alu64M(
691                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
692   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
693}
694
695
696/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
697   expression denoting a value in the range 0 .. 3, indicating a round
698   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
699   have the same rounding.
700*/
701static
702void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
703{
704   /* Note: this sequence only makes sense because DEFAULT_MXCSR has
705      both rounding bits == 0.  If that wasn't the case, we couldn't
706      create a new rounding field simply by ORing the new value into
707      place. */
708
709   /* movq $3, %reg
710      andq [[mode]], %reg  -- shouldn't be needed; paranoia
711      shlq $13, %reg
712      orq $DEFAULT_MXCSR, %reg
713      pushq %reg
714      ldmxcsr 0(%esp)
715      addq $8, %rsp
716   */
717   HReg        reg      = newVRegI(env);
718   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
719   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
720   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
721                                   iselIntExpr_RMI(env, mode), reg));
722   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
723   addInstr(env, AMD64Instr_Alu64R(
724                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
725   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
726   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
727   add_to_rsp(env, 8);
728}
729
730
731/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
732   expression denoting a value in the range 0 .. 3, indicating a round
733   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
734   the same rounding.
735*/
736static
737void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
738{
739   HReg rrm  = iselIntExpr_R(env, mode);
740   HReg rrm2 = newVRegI(env);
741   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
742
743   /* movq  %rrm, %rrm2
744      andq  $3, %rrm2   -- shouldn't be needed; paranoia
745      shlq  $10, %rrm2
746      orq   $DEFAULT_FPUCW, %rrm2
747      movq  %rrm2, -8(%rsp)
748      fldcw -8(%esp)
749   */
750   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
751   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
752   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
753   addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
754                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
755   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
756                                   AMD64RI_Reg(rrm2), m8_rsp));
757   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
758}
759
760
761/* Generate all-zeroes into a new vector register.
762*/
763static HReg generate_zeroes_V128 ( ISelEnv* env )
764{
765   HReg dst = newVRegV(env);
766   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
767   return dst;
768}
769
770/* Generate all-ones into a new vector register.
771*/
772static HReg generate_ones_V128 ( ISelEnv* env )
773{
774   HReg dst = newVRegV(env);
775   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
776   return dst;
777}
778
779
780/* Generate !src into a new vector register.  Amazing that there isn't
781   a less crappy way to do this.
782*/
783static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
784{
785   HReg dst = generate_ones_V128(env);
786   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
787   return dst;
788}
789
790
791/* Expand the given byte into a 64-bit word, by cloning each bit
792   8 times. */
793static ULong bitmask8_to_bytemask64 ( UShort w8 )
794{
795   vassert(w8 == (w8 & 0xFF));
796   ULong w64 = 0;
797   Int i;
798   for (i = 0; i < 8; i++) {
799      if (w8 & (1<<i))
800         w64 |= (0xFFULL << (8 * i));
801   }
802   return w64;
803}
804
805
806//.. /* Round an x87 FPU value to 53-bit-mantissa precision, to be used
807//..    after most non-simple FPU operations (simple = +, -, *, / and
808//..    sqrt).
809//..
810//..    This could be done a lot more efficiently if needed, by loading
811//..    zero and adding it to the value to be rounded (fldz ; faddp?).
812//.. */
813//.. static void roundToF64 ( ISelEnv* env, HReg reg )
814//.. {
815//..    X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
816//..    sub_from_esp(env, 8);
817//..    addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
818//..    addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
819//..    add_to_esp(env, 8);
820//.. }
821
822
823/*---------------------------------------------------------*/
824/*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
825/*---------------------------------------------------------*/
826
827/* Select insns for an integer-typed expression, and add them to the
828   code list.  Return a reg holding the result.  This reg will be a
829   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
830   want to modify it, ask for a new vreg, copy it in there, and modify
831   the copy.  The register allocator will do its best to map both
832   vregs to the same real register, so the copies will often disappear
833   later in the game.
834
835   This should handle expressions of 64, 32, 16 and 8-bit type.  All
836   results are returned in a 64-bit register.  For 32-, 16- and 8-bit
837   expressions, the upper 32/16/24 bits are arbitrary, so you should
838   mask or sign extend partial values if necessary.
839*/
840
841static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
842{
843   HReg r = iselIntExpr_R_wrk(env, e);
844   /* sanity checks ... */
845#  if 0
846   vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
847#  endif
848   vassert(hregClass(r) == HRcInt64);
849   vassert(hregIsVirtual(r));
850   return r;
851}
852
853/* DO NOT CALL THIS DIRECTLY ! */
854static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
855{
856   /* Used for unary/binary SIMD64 ops. */
857   HWord fn = 0;
858   Bool second_is_UInt;
859
860   MatchInfo mi;
861   DECLARE_PATTERN(p_1Uto8_64to1);
862   DECLARE_PATTERN(p_LDle8_then_8Uto64);
863   DECLARE_PATTERN(p_LDle16_then_16Uto64);
864
865   IRType ty = typeOfIRExpr(env->type_env,e);
866   switch (ty) {
867      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
868      default: vassert(0);
869   }
870
871   switch (e->tag) {
872
873   /* --------- TEMP --------- */
874   case Iex_RdTmp: {
875      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
876   }
877
878   /* --------- LOAD --------- */
879   case Iex_Load: {
880      HReg dst = newVRegI(env);
881      AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
882
883      /* We can't handle big-endian loads, nor load-linked. */
884      if (e->Iex.Load.end != Iend_LE)
885         goto irreducible;
886
887      if (ty == Ity_I64) {
888         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
889                                         AMD64RMI_Mem(amode), dst) );
890         return dst;
891      }
892      if (ty == Ity_I32) {
893         addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
894         return dst;
895      }
896      if (ty == Ity_I16) {
897         addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
898         return dst;
899      }
900      if (ty == Ity_I8) {
901         addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
902         return dst;
903      }
904      break;
905   }
906
907   /* --------- BINARY OP --------- */
908   case Iex_Binop: {
909      AMD64AluOp   aluOp;
910      AMD64ShiftOp shOp;
911
912      /* Pattern: Sub64(0,x) */
913      /*     and: Sub32(0,x) */
914      if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
915          || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
916         HReg dst = newVRegI(env);
917         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
918         addInstr(env, mk_iMOVsd_RR(reg,dst));
919         addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
920         return dst;
921      }
922
923      /* Is it an addition or logical style op? */
924      switch (e->Iex.Binop.op) {
925         case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
926            aluOp = Aalu_ADD; break;
927         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
928            aluOp = Aalu_SUB; break;
929         case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
930            aluOp = Aalu_AND; break;
931         case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
932            aluOp = Aalu_OR; break;
933         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
934            aluOp = Aalu_XOR; break;
935         case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
936            aluOp = Aalu_MUL; break;
937         default:
938            aluOp = Aalu_INVALID; break;
939      }
940      /* For commutative ops we assume any literal
941         values are on the second operand. */
942      if (aluOp != Aalu_INVALID) {
943         HReg dst      = newVRegI(env);
944         HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
945         AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
946         addInstr(env, mk_iMOVsd_RR(reg,dst));
947         addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
948         return dst;
949      }
950
951      /* Perhaps a shift op? */
952      switch (e->Iex.Binop.op) {
953         case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
954            shOp = Ash_SHL; break;
955         case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
956            shOp = Ash_SHR; break;
957         case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
958            shOp = Ash_SAR; break;
959         default:
960            shOp = Ash_INVALID; break;
961      }
962      if (shOp != Ash_INVALID) {
963         HReg dst = newVRegI(env);
964
965         /* regL = the value to be shifted */
966         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
967         addInstr(env, mk_iMOVsd_RR(regL,dst));
968
969         /* Do any necessary widening for 32/16/8 bit operands */
970         switch (e->Iex.Binop.op) {
971            case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
972               break;
973            case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
974               break;
975            case Iop_Shr8:
976               addInstr(env, AMD64Instr_Alu64R(
977                                Aalu_AND, AMD64RMI_Imm(0xFF), dst));
978               break;
979            case Iop_Shr16:
980               addInstr(env, AMD64Instr_Alu64R(
981                                Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
982               break;
983            case Iop_Shr32:
984               addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
985               break;
986            case Iop_Sar8:
987               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
988               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
989               break;
990            case Iop_Sar16:
991               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
992               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
993               break;
994            case Iop_Sar32:
995               addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
996               break;
997            default:
998               ppIROp(e->Iex.Binop.op);
999               vassert(0);
1000         }
1001
1002         /* Now consider the shift amount.  If it's a literal, we
1003            can do a much better job than the general case. */
1004         if (e->Iex.Binop.arg2->tag == Iex_Const) {
1005            /* assert that the IR is well-typed */
1006            Int nshift;
1007            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1008            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1009            vassert(nshift >= 0);
1010            if (nshift > 0)
1011               /* Can't allow nshift==0 since that means %cl */
1012               addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1013         } else {
1014            /* General case; we have to force the amount into %cl. */
1015            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1016            addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1017            addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1018         }
1019         return dst;
1020      }
1021
1022      /* Deal with 64-bit SIMD binary ops */
1023      second_is_UInt = False;
1024      switch (e->Iex.Binop.op) {
1025         case Iop_Add8x8:
1026            fn = (HWord)h_generic_calc_Add8x8; break;
1027         case Iop_Add16x4:
1028            fn = (HWord)h_generic_calc_Add16x4; break;
1029         case Iop_Add32x2:
1030            fn = (HWord)h_generic_calc_Add32x2; break;
1031
1032         case Iop_Avg8Ux8:
1033            fn = (HWord)h_generic_calc_Avg8Ux8; break;
1034         case Iop_Avg16Ux4:
1035            fn = (HWord)h_generic_calc_Avg16Ux4; break;
1036
1037         case Iop_CmpEQ8x8:
1038            fn = (HWord)h_generic_calc_CmpEQ8x8; break;
1039         case Iop_CmpEQ16x4:
1040            fn = (HWord)h_generic_calc_CmpEQ16x4; break;
1041         case Iop_CmpEQ32x2:
1042            fn = (HWord)h_generic_calc_CmpEQ32x2; break;
1043
1044         case Iop_CmpGT8Sx8:
1045            fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
1046         case Iop_CmpGT16Sx4:
1047            fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
1048         case Iop_CmpGT32Sx2:
1049            fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
1050
1051         case Iop_InterleaveHI8x8:
1052            fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
1053         case Iop_InterleaveLO8x8:
1054            fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
1055         case Iop_InterleaveHI16x4:
1056            fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
1057         case Iop_InterleaveLO16x4:
1058            fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
1059         case Iop_InterleaveHI32x2:
1060            fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
1061         case Iop_InterleaveLO32x2:
1062            fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
1063         case Iop_CatOddLanes16x4:
1064            fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1065         case Iop_CatEvenLanes16x4:
1066            fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1067         case Iop_Perm8x8:
1068            fn = (HWord)h_generic_calc_Perm8x8; break;
1069
1070         case Iop_Max8Ux8:
1071            fn = (HWord)h_generic_calc_Max8Ux8; break;
1072         case Iop_Max16Sx4:
1073            fn = (HWord)h_generic_calc_Max16Sx4; break;
1074         case Iop_Min8Ux8:
1075            fn = (HWord)h_generic_calc_Min8Ux8; break;
1076         case Iop_Min16Sx4:
1077            fn = (HWord)h_generic_calc_Min16Sx4; break;
1078
1079         case Iop_Mul16x4:
1080            fn = (HWord)h_generic_calc_Mul16x4; break;
1081         case Iop_Mul32x2:
1082            fn = (HWord)h_generic_calc_Mul32x2; break;
1083         case Iop_MulHi16Sx4:
1084            fn = (HWord)h_generic_calc_MulHi16Sx4; break;
1085         case Iop_MulHi16Ux4:
1086            fn = (HWord)h_generic_calc_MulHi16Ux4; break;
1087
1088         case Iop_QAdd8Sx8:
1089            fn = (HWord)h_generic_calc_QAdd8Sx8; break;
1090         case Iop_QAdd16Sx4:
1091            fn = (HWord)h_generic_calc_QAdd16Sx4; break;
1092         case Iop_QAdd8Ux8:
1093            fn = (HWord)h_generic_calc_QAdd8Ux8; break;
1094         case Iop_QAdd16Ux4:
1095            fn = (HWord)h_generic_calc_QAdd16Ux4; break;
1096
1097         case Iop_QNarrowBin32Sto16Sx4:
1098            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1099         case Iop_QNarrowBin16Sto8Sx8:
1100            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1101         case Iop_QNarrowBin16Sto8Ux8:
1102            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1103         case Iop_NarrowBin16to8x8:
1104            fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1105         case Iop_NarrowBin32to16x4:
1106            fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1107
1108         case Iop_QSub8Sx8:
1109            fn = (HWord)h_generic_calc_QSub8Sx8; break;
1110         case Iop_QSub16Sx4:
1111            fn = (HWord)h_generic_calc_QSub16Sx4; break;
1112         case Iop_QSub8Ux8:
1113            fn = (HWord)h_generic_calc_QSub8Ux8; break;
1114         case Iop_QSub16Ux4:
1115            fn = (HWord)h_generic_calc_QSub16Ux4; break;
1116
1117         case Iop_Sub8x8:
1118            fn = (HWord)h_generic_calc_Sub8x8; break;
1119         case Iop_Sub16x4:
1120            fn = (HWord)h_generic_calc_Sub16x4; break;
1121         case Iop_Sub32x2:
1122            fn = (HWord)h_generic_calc_Sub32x2; break;
1123
1124         case Iop_ShlN32x2:
1125            fn = (HWord)h_generic_calc_ShlN32x2;
1126            second_is_UInt = True;
1127            break;
1128         case Iop_ShlN16x4:
1129            fn = (HWord)h_generic_calc_ShlN16x4;
1130            second_is_UInt = True;
1131            break;
1132         case Iop_ShlN8x8:
1133            fn = (HWord)h_generic_calc_ShlN8x8;
1134            second_is_UInt = True;
1135            break;
1136         case Iop_ShrN32x2:
1137            fn = (HWord)h_generic_calc_ShrN32x2;
1138            second_is_UInt = True;
1139            break;
1140         case Iop_ShrN16x4:
1141            fn = (HWord)h_generic_calc_ShrN16x4;
1142            second_is_UInt = True;
1143            break;
1144         case Iop_SarN32x2:
1145            fn = (HWord)h_generic_calc_SarN32x2;
1146            second_is_UInt = True;
1147            break;
1148         case Iop_SarN16x4:
1149            fn = (HWord)h_generic_calc_SarN16x4;
1150            second_is_UInt = True;
1151            break;
1152         case Iop_SarN8x8:
1153            fn = (HWord)h_generic_calc_SarN8x8;
1154            second_is_UInt = True;
1155            break;
1156
1157         default:
1158            fn = (HWord)0; break;
1159      }
1160      if (fn != (HWord)0) {
1161         /* Note: the following assumes all helpers are of signature
1162               ULong fn ( ULong, ULong ), and they are
1163            not marked as regparm functions.
1164         */
1165         HReg dst  = newVRegI(env);
1166         HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1167         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1168         if (second_is_UInt)
1169            addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1170         addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1171         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1172         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
1173         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1174         return dst;
1175      }
1176
1177      /* Handle misc other ops. */
1178
1179      if (e->Iex.Binop.op == Iop_Max32U) {
1180         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1181         HReg dst  = newVRegI(env);
1182         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1183         addInstr(env, mk_iMOVsd_RR(src1, dst));
1184         addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1185         addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
1186         return dst;
1187      }
1188
1189      if (e->Iex.Binop.op == Iop_DivModS64to32
1190          || e->Iex.Binop.op == Iop_DivModU64to32) {
1191         /* 64 x 32 -> (32(rem),32(div)) division */
1192         /* Get the 64-bit operand into edx:eax, and the other into
1193            any old R/M. */
1194         HReg      rax     = hregAMD64_RAX();
1195         HReg      rdx     = hregAMD64_RDX();
1196         HReg      dst     = newVRegI(env);
1197         Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1198         AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1199         /* Compute the left operand into a reg, and then
1200            put the top half in edx and the bottom in eax. */
1201         HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1202         addInstr(env, mk_iMOVsd_RR(left64, rdx));
1203         addInstr(env, mk_iMOVsd_RR(left64, rax));
1204         addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1205         addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1206	 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1207	 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1208         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1209         addInstr(env, mk_iMOVsd_RR(rax, dst));
1210         addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1211         return dst;
1212      }
1213
1214      if (e->Iex.Binop.op == Iop_32HLto64) {
1215         HReg hi32  = newVRegI(env);
1216         HReg lo32  = newVRegI(env);
1217         HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1218         HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1219         addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1220         addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1221         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1222	 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1223         addInstr(env, AMD64Instr_Alu64R(
1224                          Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1225         return hi32;
1226      }
1227
1228      if (e->Iex.Binop.op == Iop_16HLto32) {
1229         HReg hi16  = newVRegI(env);
1230         HReg lo16  = newVRegI(env);
1231         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1232         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1233         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1234         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1235         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1236         addInstr(env, AMD64Instr_Alu64R(
1237                          Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1238         addInstr(env, AMD64Instr_Alu64R(
1239                          Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1240         return hi16;
1241      }
1242
1243      if (e->Iex.Binop.op == Iop_8HLto16) {
1244         HReg hi8  = newVRegI(env);
1245         HReg lo8  = newVRegI(env);
1246         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1247         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1248         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1249         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1250         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1251         addInstr(env, AMD64Instr_Alu64R(
1252                          Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1253         addInstr(env, AMD64Instr_Alu64R(
1254                          Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1255         return hi8;
1256      }
1257
1258      if (e->Iex.Binop.op == Iop_MullS32
1259          || e->Iex.Binop.op == Iop_MullS16
1260          || e->Iex.Binop.op == Iop_MullS8
1261          || e->Iex.Binop.op == Iop_MullU32
1262          || e->Iex.Binop.op == Iop_MullU16
1263          || e->Iex.Binop.op == Iop_MullU8) {
1264         HReg a32   = newVRegI(env);
1265         HReg b32   = newVRegI(env);
1266         HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1267         HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1268         Int          shift  = 0;
1269         AMD64ShiftOp shr_op = Ash_SHR;
1270         switch (e->Iex.Binop.op) {
1271            case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1272            case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1273            case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
1274            case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1275            case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1276            case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
1277            default: vassert(0);
1278         }
1279
1280         addInstr(env, mk_iMOVsd_RR(a32s, a32));
1281         addInstr(env, mk_iMOVsd_RR(b32s, b32));
1282         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1283         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1284         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
1285         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
1286         addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1287         return b32;
1288      }
1289
1290      if (e->Iex.Binop.op == Iop_CmpF64) {
1291         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1292         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1293         HReg dst = newVRegI(env);
1294         addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1295         /* Mask out irrelevant parts of the result so as to conform
1296            to the CmpF64 definition. */
1297         addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1298         return dst;
1299      }
1300
1301      if (e->Iex.Binop.op == Iop_F64toI32S
1302          || e->Iex.Binop.op == Iop_F64toI64S) {
1303         Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1304         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1305         HReg dst = newVRegI(env);
1306         set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1307         addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1308         set_SSE_rounding_default(env);
1309         return dst;
1310      }
1311
1312//..       if (e->Iex.Binop.op == Iop_F64toI32 || e->Iex.Binop.op == Iop_F64toI16) {
1313//..          Int  sz  = e->Iex.Binop.op == Iop_F64toI16 ? 2 : 4;
1314//..          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1315//..          HReg dst = newVRegI(env);
1316//..
1317//..          /* Used several times ... */
1318//..          X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1319//..
1320//..          /* rf now holds the value to be converted, and rrm holds the
1321//.. 	    rounding mode value, encoded as per the IRRoundingMode
1322//.. 	    enum.  The first thing to do is set the FPU's rounding
1323//.. 	    mode accordingly. */
1324//..
1325//..          /* Create a space for the format conversion. */
1326//..          /* subl $4, %esp */
1327//..          sub_from_esp(env, 4);
1328//..
1329//.. 	 /* Set host rounding mode */
1330//.. 	 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
1331//..
1332//..          /* gistw/l %rf, 0(%esp) */
1333//..          addInstr(env, X86Instr_FpLdStI(False/*store*/, sz, rf, zero_esp));
1334//..
1335//..          if (sz == 2) {
1336//..             /* movzwl 0(%esp), %dst */
1337//..             addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
1338//..          } else {
1339//..             /* movl 0(%esp), %dst */
1340//..             vassert(sz == 4);
1341//..             addInstr(env, X86Instr_Alu32R(
1342//..                              Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1343//..          }
1344//..
1345//.. 	 /* Restore default FPU rounding. */
1346//..          set_FPU_rounding_default( env );
1347//..
1348//..          /* addl $4, %esp */
1349//.. 	 add_to_esp(env, 4);
1350//..          return dst;
1351//..       }
1352//..
1353//..       /* C3210 flags following FPU partial remainder (fprem), both
1354//..          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1355//..       if (e->Iex.Binop.op == Iop_PRemC3210F64
1356//..           || e->Iex.Binop.op == Iop_PRem1C3210F64) {
1357//..          HReg junk = newVRegF(env);
1358//..          HReg dst  = newVRegI(env);
1359//..          HReg srcL = iselDblExpr(env, e->Iex.Binop.arg1);
1360//..          HReg srcR = iselDblExpr(env, e->Iex.Binop.arg2);
1361//..          addInstr(env, X86Instr_FpBinary(
1362//..                            e->Iex.Binop.op==Iop_PRemC3210F64
1363//..                               ? Xfp_PREM : Xfp_PREM1,
1364//..                            srcL,srcR,junk
1365//..                  ));
1366//..          /* The previous pseudo-insn will have left the FPU's C3210
1367//..             flags set correctly.  So bag them. */
1368//..          addInstr(env, X86Instr_FpStSW_AX());
1369//..          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1370//.. 	 addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
1371//..          return dst;
1372//..       }
1373
1374      break;
1375   }
1376
1377   /* --------- UNARY OP --------- */
1378   case Iex_Unop: {
1379
1380      /* 1Uto8(64to1(expr64)) */
1381      {
1382         DEFINE_PATTERN( p_1Uto8_64to1,
1383                         unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1384         if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1385            IRExpr* expr64 = mi.bindee[0];
1386            HReg    dst    = newVRegI(env);
1387            HReg    src    = iselIntExpr_R(env, expr64);
1388            addInstr(env, mk_iMOVsd_RR(src,dst) );
1389            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1390                                            AMD64RMI_Imm(1), dst));
1391            return dst;
1392         }
1393      }
1394
1395      /* 8Uto64(LDle(expr64)) */
1396      {
1397         DEFINE_PATTERN(p_LDle8_then_8Uto64,
1398                        unop(Iop_8Uto64,
1399                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1400         if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1401            HReg dst = newVRegI(env);
1402            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1403            addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1404            return dst;
1405         }
1406      }
1407
1408      /* 16Uto64(LDle(expr64)) */
1409      {
1410         DEFINE_PATTERN(p_LDle16_then_16Uto64,
1411                        unop(Iop_16Uto64,
1412                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1413         if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1414            HReg dst = newVRegI(env);
1415            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1416            addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1417            return dst;
1418         }
1419      }
1420
1421      /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1422         Use 32 bit arithmetic and let the default zero-extend rule
1423         do the 32Uto64 for free. */
1424      if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1425         IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1426         IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1427         IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1428         AMD64AluOp aluOp = Aalu_INVALID;
1429         switch (opi) {
1430            case Iop_Add32: aluOp = Aalu_ADD; break;
1431            case Iop_Sub32: aluOp = Aalu_SUB; break;
1432            case Iop_And32: aluOp = Aalu_AND; break;
1433            case Iop_Or32:  aluOp = Aalu_OR;  break;
1434            case Iop_Xor32: aluOp = Aalu_XOR; break;
1435            default: break;
1436         }
1437         if (aluOp != Aalu_INVALID) {
1438            /* For commutative ops we assume any literal values are on
1439               the second operand. */
1440            HReg dst      = newVRegI(env);
1441            HReg reg      = iselIntExpr_R(env, argL);
1442            AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1443            addInstr(env, mk_iMOVsd_RR(reg,dst));
1444            addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1445            return dst;
1446         }
1447         /* just fall through to normal handling for Iop_32Uto64 */
1448      }
1449
1450      /* Fallback cases */
1451      switch (e->Iex.Unop.op) {
1452         case Iop_32Uto64:
1453         case Iop_32Sto64: {
1454            HReg dst = newVRegI(env);
1455            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1456            addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1457                                            src, dst) );
1458            return dst;
1459         }
1460         case Iop_128HIto64: {
1461            HReg rHi, rLo;
1462            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1463            return rHi; /* and abandon rLo */
1464         }
1465         case Iop_128to64: {
1466            HReg rHi, rLo;
1467            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1468            return rLo; /* and abandon rHi */
1469         }
1470         case Iop_8Uto16:
1471         case Iop_8Uto32:
1472         case Iop_8Uto64:
1473         case Iop_16Uto64:
1474         case Iop_16Uto32: {
1475            HReg dst     = newVRegI(env);
1476            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1477            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1478                                   || e->Iex.Unop.op==Iop_16Uto64 );
1479            UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
1480            addInstr(env, mk_iMOVsd_RR(src,dst) );
1481            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1482                                            AMD64RMI_Imm(mask), dst));
1483            return dst;
1484         }
1485         case Iop_8Sto16:
1486         case Iop_8Sto64:
1487         case Iop_8Sto32:
1488         case Iop_16Sto32:
1489         case Iop_16Sto64: {
1490            HReg dst     = newVRegI(env);
1491            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1492            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1493                                   || e->Iex.Unop.op==Iop_16Sto64 );
1494            UInt amt     = srcIs16 ? 48 : 56;
1495            addInstr(env, mk_iMOVsd_RR(src,dst) );
1496            addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1497            addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1498            return dst;
1499         }
1500 	 case Iop_Not8:
1501 	 case Iop_Not16:
1502         case Iop_Not32:
1503         case Iop_Not64: {
1504            HReg dst = newVRegI(env);
1505            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1506            addInstr(env, mk_iMOVsd_RR(src,dst) );
1507            addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1508            return dst;
1509         }
1510//..          case Iop_64HIto32: {
1511//..             HReg rHi, rLo;
1512//..             iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1513//..             return rHi; /* and abandon rLo .. poor wee thing :-) */
1514//..          }
1515//..          case Iop_64to32: {
1516//..             HReg rHi, rLo;
1517//..             iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1518//..             return rLo; /* similar stupid comment to the above ... */
1519//..          }
1520         case Iop_16HIto8:
1521         case Iop_32HIto16:
1522         case Iop_64HIto32: {
1523            HReg dst  = newVRegI(env);
1524            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1525            Int shift = 0;
1526            switch (e->Iex.Unop.op) {
1527               case Iop_16HIto8:  shift = 8;  break;
1528               case Iop_32HIto16: shift = 16; break;
1529               case Iop_64HIto32: shift = 32; break;
1530               default: vassert(0);
1531            }
1532            addInstr(env, mk_iMOVsd_RR(src,dst) );
1533            addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1534            return dst;
1535         }
1536         case Iop_1Uto64:
1537         case Iop_1Uto32:
1538         case Iop_1Uto8: {
1539            HReg dst           = newVRegI(env);
1540            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1541            addInstr(env, AMD64Instr_Set64(cond,dst));
1542            return dst;
1543         }
1544         case Iop_1Sto8:
1545         case Iop_1Sto16:
1546         case Iop_1Sto32:
1547         case Iop_1Sto64: {
1548            /* could do better than this, but for now ... */
1549            HReg dst           = newVRegI(env);
1550            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1551            addInstr(env, AMD64Instr_Set64(cond,dst));
1552            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1553            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1554            return dst;
1555         }
1556         case Iop_Ctz64: {
1557            /* Count trailing zeroes, implemented by amd64 'bsfq' */
1558            HReg dst = newVRegI(env);
1559            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1560            addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1561            return dst;
1562         }
1563         case Iop_Clz64: {
1564            /* Count leading zeroes.  Do 'bsrq' to establish the index
1565               of the highest set bit, and subtract that value from
1566               63. */
1567            HReg tmp = newVRegI(env);
1568            HReg dst = newVRegI(env);
1569            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1570            addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1571            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1572                                            AMD64RMI_Imm(63), dst));
1573            addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1574                                            AMD64RMI_Reg(tmp), dst));
1575            return dst;
1576         }
1577
1578         case Iop_CmpwNEZ64: {
1579            HReg dst = newVRegI(env);
1580            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1581            addInstr(env, mk_iMOVsd_RR(src,dst));
1582            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1583            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1584                                            AMD64RMI_Reg(src), dst));
1585            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1586            return dst;
1587         }
1588
1589         case Iop_CmpwNEZ32: {
1590            HReg src = newVRegI(env);
1591            HReg dst = newVRegI(env);
1592            HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1593            addInstr(env, mk_iMOVsd_RR(pre,src));
1594            addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1595            addInstr(env, mk_iMOVsd_RR(src,dst));
1596            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1597            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1598                                            AMD64RMI_Reg(src), dst));
1599            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1600            return dst;
1601         }
1602
1603         case Iop_Left8:
1604         case Iop_Left16:
1605         case Iop_Left32:
1606         case Iop_Left64: {
1607            HReg dst = newVRegI(env);
1608            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1609            addInstr(env, mk_iMOVsd_RR(src, dst));
1610            addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1611            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1612            return dst;
1613         }
1614
1615         case Iop_V128to32: {
1616            HReg        dst     = newVRegI(env);
1617            HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
1618            AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1619            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1620            addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1621            return dst;
1622         }
1623
1624         /* V128{HI}to64 */
1625         case Iop_V128HIto64:
1626         case Iop_V128to64: {
1627            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
1628            HReg dst = newVRegI(env);
1629            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1630            AMD64AMode* rsp0 = AMD64AMode_IR(0,   hregAMD64_RSP());
1631            AMD64AMode* rspN = AMD64AMode_IR(off, hregAMD64_RSP());
1632            sub_from_rsp(env, 16);
1633            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp0));
1634            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1635                                             AMD64RMI_Mem(rspN), dst ));
1636            add_to_rsp(env, 16);
1637            return dst;
1638         }
1639
1640         /* ReinterpF64asI64(e) */
1641         /* Given an IEEE754 double, produce an I64 with the same bit
1642            pattern. */
1643         case Iop_ReinterpF64asI64: {
1644            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1645            HReg        dst    = newVRegI(env);
1646            HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
1647            /* paranoia */
1648            set_SSE_rounding_default(env);
1649            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1650            addInstr(env, AMD64Instr_Alu64R(
1651                             Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1652            return dst;
1653         }
1654
1655         /* ReinterpF32asI32(e) */
1656         /* Given an IEEE754 single, produce an I64 with the same bit
1657            pattern in the lower half. */
1658         case Iop_ReinterpF32asI32: {
1659            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1660            HReg        dst    = newVRegI(env);
1661            HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
1662            /* paranoia */
1663            set_SSE_rounding_default(env);
1664            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1665            addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1666            return dst;
1667         }
1668
1669         case Iop_16to8:
1670         case Iop_32to8:
1671         case Iop_64to8:
1672         case Iop_32to16:
1673         case Iop_64to16:
1674         case Iop_64to32:
1675            /* These are no-ops. */
1676            return iselIntExpr_R(env, e->Iex.Unop.arg);
1677
1678         default:
1679            break;
1680      }
1681
1682      /* Deal with unary 64-bit SIMD ops. */
1683      switch (e->Iex.Unop.op) {
1684         case Iop_CmpNEZ32x2:
1685            fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1686         case Iop_CmpNEZ16x4:
1687            fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1688         case Iop_CmpNEZ8x8:
1689            fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1690         default:
1691            fn = (HWord)0; break;
1692      }
1693      if (fn != (HWord)0) {
1694         /* Note: the following assumes all helpers are of
1695            signature
1696               ULong fn ( ULong ), and they are
1697            not marked as regparm functions.
1698         */
1699         HReg dst = newVRegI(env);
1700         HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1701         addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1702         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1 ));
1703         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1704         return dst;
1705      }
1706
1707      break;
1708   }
1709
1710   /* --------- GET --------- */
1711   case Iex_Get: {
1712      if (ty == Ity_I64) {
1713         HReg dst = newVRegI(env);
1714         addInstr(env, AMD64Instr_Alu64R(
1715                          Aalu_MOV,
1716                          AMD64RMI_Mem(
1717                             AMD64AMode_IR(e->Iex.Get.offset,
1718                                           hregAMD64_RBP())),
1719                          dst));
1720         return dst;
1721      }
1722      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1723         HReg dst = newVRegI(env);
1724         addInstr(env, AMD64Instr_LoadEX(
1725                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1726                          False,
1727                          AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1728                          dst));
1729         return dst;
1730      }
1731      break;
1732   }
1733
1734   case Iex_GetI: {
1735      AMD64AMode* am
1736         = genGuestArrayOffset(
1737              env, e->Iex.GetI.descr,
1738                   e->Iex.GetI.ix, e->Iex.GetI.bias );
1739      HReg dst = newVRegI(env);
1740      if (ty == Ity_I8) {
1741         addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1742         return dst;
1743      }
1744      if (ty == Ity_I64) {
1745         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1746         return dst;
1747      }
1748      break;
1749   }
1750
1751   /* --------- CCALL --------- */
1752   case Iex_CCall: {
1753      HReg    dst = newVRegI(env);
1754      vassert(ty == e->Iex.CCall.retty);
1755
1756      /* be very restrictive for now.  Only 64-bit ints allowed
1757         for args, and 64 or 32 bits for return type. */
1758      if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1759         goto irreducible;
1760
1761      /* Marshal args, do the call. */
1762      doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
1763
1764      /* Move to dst, and zero out the top 32 bits if the result type is
1765         Ity_I32.  Probably overkill, but still .. */
1766      if (e->Iex.CCall.retty == Ity_I64)
1767         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1768      else
1769         addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1770
1771      return dst;
1772   }
1773
1774   /* --------- LITERAL --------- */
1775   /* 64/32/16/8-bit literals */
1776   case Iex_Const:
1777      if (ty == Ity_I64) {
1778         HReg r = newVRegI(env);
1779         addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1780         return r;
1781      } else {
1782         AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1783         HReg      r   = newVRegI(env);
1784         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1785         return r;
1786      }
1787
1788   /* --------- MULTIPLEX --------- */
1789   case Iex_Mux0X: {
1790     if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1791         && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
1792        HReg     r8;
1793        HReg     rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
1794        AMD64RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
1795        HReg dst = newVRegI(env);
1796        addInstr(env, mk_iMOVsd_RR(rX,dst));
1797        r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
1798        addInstr(env, AMD64Instr_Test64(0xFF, r8));
1799        addInstr(env, AMD64Instr_CMov64(Acc_Z,r0,dst));
1800        return dst;
1801      }
1802      break;
1803   }
1804
1805   /* --------- TERNARY OP --------- */
1806   case Iex_Triop: {
1807      /* C3210 flags following FPU partial remainder (fprem), both
1808         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1809      if (e->Iex.Triop.op == Iop_PRemC3210F64
1810          || e->Iex.Triop.op == Iop_PRem1C3210F64) {
1811         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1812         HReg        arg1   = iselDblExpr(env, e->Iex.Triop.arg2);
1813         HReg        arg2   = iselDblExpr(env, e->Iex.Triop.arg3);
1814         HReg        dst    = newVRegI(env);
1815         addInstr(env, AMD64Instr_A87Free(2));
1816
1817         /* one arg -> top of x87 stack */
1818         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1819         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1820
1821         /* other arg -> top of x87 stack */
1822         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1823         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1824
1825         switch (e->Iex.Triop.op) {
1826            case Iop_PRemC3210F64:
1827               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1828               break;
1829            case Iop_PRem1C3210F64:
1830               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1831               break;
1832            default:
1833               vassert(0);
1834         }
1835         /* Ignore the result, and instead make off with the FPU's
1836	    C3210 flags (in the status word). */
1837         addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1838         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1839         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1840         return dst;
1841      }
1842      break;
1843   }
1844
1845   default:
1846   break;
1847   } /* switch (e->tag) */
1848
1849   /* We get here if no pattern matched. */
1850  irreducible:
1851   ppIRExpr(e);
1852   vpanic("iselIntExpr_R(amd64): cannot reduce tree");
1853}
1854
1855
1856/*---------------------------------------------------------*/
1857/*--- ISEL: Integer expression auxiliaries              ---*/
1858/*---------------------------------------------------------*/
1859
1860/* --------------------- AMODEs --------------------- */
1861
1862/* Return an AMode which computes the value of the specified
1863   expression, possibly also adding insns to the code list as a
1864   result.  The expression may only be a 32-bit one.
1865*/
1866
1867static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
1868{
1869   AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
1870   vassert(sane_AMode(am));
1871   return am;
1872}
1873
1874/* DO NOT CALL THIS DIRECTLY ! */
1875static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
1876{
1877   MatchInfo mi;
1878   DECLARE_PATTERN(p_complex);
1879   IRType ty = typeOfIRExpr(env->type_env,e);
1880   vassert(ty == Ity_I64);
1881
1882   /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
1883   /*              bind0        bind1  bind2   bind3   */
1884   DEFINE_PATTERN(p_complex,
1885      binop( Iop_Add64,
1886             binop( Iop_Add64,
1887                    bind(0),
1888                    binop(Iop_Shl64, bind(1), bind(2))
1889                  ),
1890             bind(3)
1891           )
1892   );
1893   if (matchIRExpr(&mi, p_complex, e)) {
1894      IRExpr* expr1  = mi.bindee[0];
1895      IRExpr* expr2  = mi.bindee[1];
1896      IRExpr* imm8   = mi.bindee[2];
1897      IRExpr* simm32 = mi.bindee[3];
1898      if (imm8->tag == Iex_Const
1899          && imm8->Iex.Const.con->tag == Ico_U8
1900          && imm8->Iex.Const.con->Ico.U8 < 4
1901          /* imm8 is OK, now check simm32 */
1902          && simm32->tag == Iex_Const
1903          && simm32->Iex.Const.con->tag == Ico_U64
1904          && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
1905         UInt shift = imm8->Iex.Const.con->Ico.U8;
1906         UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
1907         HReg r1 = iselIntExpr_R(env, expr1);
1908         HReg r2 = iselIntExpr_R(env, expr2);
1909         vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
1910         return AMD64AMode_IRRS(offset, r1, r2, shift);
1911      }
1912   }
1913
1914   /* Add64(expr1, Shl64(expr2, imm)) */
1915   if (e->tag == Iex_Binop
1916       && e->Iex.Binop.op == Iop_Add64
1917       && e->Iex.Binop.arg2->tag == Iex_Binop
1918       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
1919       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1920       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1921      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1922      if (shift == 1 || shift == 2 || shift == 3) {
1923         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1924         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1925         return AMD64AMode_IRRS(0, r1, r2, shift);
1926      }
1927   }
1928
1929   /* Add64(expr,i) */
1930   if (e->tag == Iex_Binop
1931       && e->Iex.Binop.op == Iop_Add64
1932       && e->Iex.Binop.arg2->tag == Iex_Const
1933       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
1934       && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
1935      HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1936      return AMD64AMode_IR(
1937                toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
1938                r1
1939             );
1940   }
1941
1942   /* Doesn't match anything in particular.  Generate it into
1943      a register and use that. */
1944   {
1945      HReg r1 = iselIntExpr_R(env, e);
1946      return AMD64AMode_IR(0, r1);
1947   }
1948}
1949
1950
1951/* --------------------- RMIs --------------------- */
1952
1953/* Similarly, calculate an expression into an X86RMI operand.  As with
1954   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
1955
1956static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
1957{
1958   AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
1959   /* sanity checks ... */
1960   switch (rmi->tag) {
1961      case Armi_Imm:
1962         return rmi;
1963      case Armi_Reg:
1964         vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
1965         vassert(hregIsVirtual(rmi->Armi.Reg.reg));
1966         return rmi;
1967      case Armi_Mem:
1968         vassert(sane_AMode(rmi->Armi.Mem.am));
1969         return rmi;
1970      default:
1971         vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
1972   }
1973}
1974
1975/* DO NOT CALL THIS DIRECTLY ! */
1976static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
1977{
1978   IRType ty = typeOfIRExpr(env->type_env,e);
1979   vassert(ty == Ity_I64 || ty == Ity_I32
1980           || ty == Ity_I16 || ty == Ity_I8);
1981
1982   /* special case: immediate 64/32/16/8 */
1983   if (e->tag == Iex_Const) {
1984      switch (e->Iex.Const.con->tag) {
1985        case Ico_U64:
1986           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
1987              return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
1988           }
1989           break;
1990         case Ico_U32:
1991            return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
1992         case Ico_U16:
1993            return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
1994         case Ico_U8:
1995            return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
1996         default:
1997            vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
1998      }
1999   }
2000
2001   /* special case: 64-bit GET */
2002   if (e->tag == Iex_Get && ty == Ity_I64) {
2003      return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2004                                        hregAMD64_RBP()));
2005   }
2006
2007   /* special case: 64-bit load from memory */
2008   if (e->tag == Iex_Load && ty == Ity_I64
2009       && e->Iex.Load.end == Iend_LE) {
2010      AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2011      return AMD64RMI_Mem(am);
2012   }
2013
2014   /* default case: calculate into a register and return that */
2015   {
2016      HReg r = iselIntExpr_R ( env, e );
2017      return AMD64RMI_Reg(r);
2018   }
2019}
2020
2021
2022/* --------------------- RIs --------------------- */
2023
2024/* Calculate an expression into an AMD64RI operand.  As with
2025   iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2026   bits. */
2027
2028static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
2029{
2030   AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2031   /* sanity checks ... */
2032   switch (ri->tag) {
2033      case Ari_Imm:
2034         return ri;
2035      case Ari_Reg:
2036         vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2037         vassert(hregIsVirtual(ri->Ari.Reg.reg));
2038         return ri;
2039      default:
2040         vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2041   }
2042}
2043
2044/* DO NOT CALL THIS DIRECTLY ! */
2045static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
2046{
2047   IRType ty = typeOfIRExpr(env->type_env,e);
2048   vassert(ty == Ity_I64 || ty == Ity_I32
2049           || ty == Ity_I16 || ty == Ity_I8);
2050
2051   /* special case: immediate */
2052   if (e->tag == Iex_Const) {
2053      switch (e->Iex.Const.con->tag) {
2054        case Ico_U64:
2055           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2056              return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2057           }
2058           break;
2059         case Ico_U32:
2060            return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2061         case Ico_U16:
2062            return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2063         case Ico_U8:
2064            return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2065         default:
2066            vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2067      }
2068   }
2069
2070   /* default case: calculate into a register and return that */
2071   {
2072      HReg r = iselIntExpr_R ( env, e );
2073      return AMD64RI_Reg(r);
2074   }
2075}
2076
2077
2078/* --------------------- RMs --------------------- */
2079
2080/* Similarly, calculate an expression into an AMD64RM operand.  As
2081   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2082   bits.  */
2083
2084static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
2085{
2086   AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2087   /* sanity checks ... */
2088   switch (rm->tag) {
2089      case Arm_Reg:
2090         vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2091         vassert(hregIsVirtual(rm->Arm.Reg.reg));
2092         return rm;
2093      case Arm_Mem:
2094         vassert(sane_AMode(rm->Arm.Mem.am));
2095         return rm;
2096      default:
2097         vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2098   }
2099}
2100
2101/* DO NOT CALL THIS DIRECTLY ! */
2102static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
2103{
2104   IRType ty = typeOfIRExpr(env->type_env,e);
2105   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2106
2107   /* special case: 64-bit GET */
2108   if (e->tag == Iex_Get && ty == Ity_I64) {
2109      return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2110                                       hregAMD64_RBP()));
2111   }
2112
2113   /* special case: load from memory */
2114
2115   /* default case: calculate into a register and return that */
2116   {
2117      HReg r = iselIntExpr_R ( env, e );
2118      return AMD64RM_Reg(r);
2119   }
2120}
2121
2122
2123/* --------------------- CONDCODE --------------------- */
2124
2125/* Generate code to evaluated a bit-typed expression, returning the
2126   condition code which would correspond when the expression would
2127   notionally have returned 1. */
2128
2129static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
2130{
2131   /* Uh, there's nothing we can sanity check here, unfortunately. */
2132   return iselCondCode_wrk(env,e);
2133}
2134
2135/* DO NOT CALL THIS DIRECTLY ! */
2136static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
2137{
2138   MatchInfo mi;
2139
2140   vassert(e);
2141   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2142
2143   /* var */
2144   if (e->tag == Iex_RdTmp) {
2145      HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2146      HReg dst = newVRegI(env);
2147      addInstr(env, mk_iMOVsd_RR(r64,dst));
2148      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2149      return Acc_NZ;
2150   }
2151
2152   /* Constant 1:Bit */
2153   if (e->tag == Iex_Const) {
2154      HReg r;
2155      vassert(e->Iex.Const.con->tag == Ico_U1);
2156      vassert(e->Iex.Const.con->Ico.U1 == True
2157              || e->Iex.Const.con->Ico.U1 == False);
2158      r = newVRegI(env);
2159      addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2160      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2161      return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2162   }
2163
2164   /* Not1(...) */
2165   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2166      /* Generate code for the arg, and negate the test condition */
2167      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2168   }
2169
2170   /* --- patterns rooted at: 64to1 --- */
2171
2172   /* 64to1 */
2173   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2174      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2175      addInstr(env, AMD64Instr_Test64(1,reg));
2176      return Acc_NZ;
2177   }
2178
2179   /* --- patterns rooted at: CmpNEZ8 --- */
2180
2181   /* CmpNEZ8(x) */
2182   if (e->tag == Iex_Unop
2183       && e->Iex.Unop.op == Iop_CmpNEZ8) {
2184      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2185      addInstr(env, AMD64Instr_Test64(0xFF,r));
2186      return Acc_NZ;
2187   }
2188
2189   /* --- patterns rooted at: CmpNEZ16 --- */
2190
2191   /* CmpNEZ16(x) */
2192   if (e->tag == Iex_Unop
2193       && e->Iex.Unop.op == Iop_CmpNEZ16) {
2194      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2195      addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2196      return Acc_NZ;
2197   }
2198
2199   /* --- patterns rooted at: CmpNEZ32 --- */
2200
2201   /* CmpNEZ32(x) */
2202   if (e->tag == Iex_Unop
2203       && e->Iex.Unop.op == Iop_CmpNEZ32) {
2204      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2205      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2206      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2207      return Acc_NZ;
2208   }
2209
2210   /* --- patterns rooted at: CmpNEZ64 --- */
2211
2212   /* CmpNEZ64(Or64(x,y)) */
2213   {
2214      DECLARE_PATTERN(p_CmpNEZ64_Or64);
2215      DEFINE_PATTERN(p_CmpNEZ64_Or64,
2216                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
2217      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
2218         HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
2219         AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
2220         HReg      tmp  = newVRegI(env);
2221         addInstr(env, mk_iMOVsd_RR(r0, tmp));
2222         addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
2223         return Acc_NZ;
2224      }
2225   }
2226
2227   /* CmpNEZ64(x) */
2228   if (e->tag == Iex_Unop
2229       && e->Iex.Unop.op == Iop_CmpNEZ64) {
2230      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2231      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2232      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2233      return Acc_NZ;
2234   }
2235
2236   /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2237
2238   /* CmpEQ8 / CmpNE8 */
2239   if (e->tag == Iex_Binop
2240       && (e->Iex.Binop.op == Iop_CmpEQ8
2241           || e->Iex.Binop.op == Iop_CmpNE8
2242           || e->Iex.Binop.op == Iop_CasCmpEQ8
2243           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2244      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2245      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2246      HReg      r    = newVRegI(env);
2247      addInstr(env, mk_iMOVsd_RR(r1,r));
2248      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2249      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2250      switch (e->Iex.Binop.op) {
2251         case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2252         case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2253         default: vpanic("iselCondCode(amd64): CmpXX8");
2254      }
2255   }
2256
2257   /* CmpEQ16 / CmpNE16 */
2258   if (e->tag == Iex_Binop
2259       && (e->Iex.Binop.op == Iop_CmpEQ16
2260           || e->Iex.Binop.op == Iop_CmpNE16
2261           || e->Iex.Binop.op == Iop_CasCmpEQ16
2262           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2263      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2264      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2265      HReg      r    = newVRegI(env);
2266      addInstr(env, mk_iMOVsd_RR(r1,r));
2267      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2268      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2269      switch (e->Iex.Binop.op) {
2270         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2271         case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2272         default: vpanic("iselCondCode(amd64): CmpXX16");
2273      }
2274   }
2275
2276   /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2277      Saves a "movq %rax, %tmp" compared to the default route. */
2278   if (e->tag == Iex_Binop
2279       && e->Iex.Binop.op == Iop_CmpNE64
2280       && e->Iex.Binop.arg1->tag == Iex_CCall
2281       && e->Iex.Binop.arg2->tag == Iex_Const) {
2282      IRExpr* cal = e->Iex.Binop.arg1;
2283      IRExpr* con = e->Iex.Binop.arg2;
2284      HReg    tmp = newVRegI(env);
2285      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2286      vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2287      vassert(con->Iex.Const.con->tag == Ico_U64);
2288      /* Marshal args, do the call. */
2289      doHelperCall( env, False, NULL, cal->Iex.CCall.cee, cal->Iex.CCall.args );
2290      addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2291      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2292                                      AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2293      return Acc_NZ;
2294   }
2295
2296   /* Cmp*64*(x,y) */
2297   if (e->tag == Iex_Binop
2298       && (e->Iex.Binop.op == Iop_CmpEQ64
2299           || e->Iex.Binop.op == Iop_CmpNE64
2300           || e->Iex.Binop.op == Iop_CmpLT64S
2301           || e->Iex.Binop.op == Iop_CmpLT64U
2302           || e->Iex.Binop.op == Iop_CmpLE64S
2303           || e->Iex.Binop.op == Iop_CmpLE64U
2304           || e->Iex.Binop.op == Iop_CasCmpEQ64
2305           || e->Iex.Binop.op == Iop_CasCmpNE64)) {
2306      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2307      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2308      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2309      switch (e->Iex.Binop.op) {
2310         case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2311         case Iop_CmpNE64: case Iop_CasCmpNE64: return Acc_NZ;
2312	 case Iop_CmpLT64S: return Acc_L;
2313	 case Iop_CmpLT64U: return Acc_B;
2314	 case Iop_CmpLE64S: return Acc_LE;
2315         case Iop_CmpLE64U: return Acc_BE;
2316         default: vpanic("iselCondCode(amd64): CmpXX64");
2317      }
2318   }
2319
2320   /* Cmp*32*(x,y) */
2321   if (e->tag == Iex_Binop
2322       && (e->Iex.Binop.op == Iop_CmpEQ32
2323           || e->Iex.Binop.op == Iop_CmpNE32
2324           || e->Iex.Binop.op == Iop_CmpLT32S
2325           || e->Iex.Binop.op == Iop_CmpLT32U
2326           || e->Iex.Binop.op == Iop_CmpLE32S
2327           || e->Iex.Binop.op == Iop_CmpLE32U
2328           || e->Iex.Binop.op == Iop_CasCmpEQ32
2329           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
2330      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2331      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2332      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2333      switch (e->Iex.Binop.op) {
2334         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2335         case Iop_CmpNE32: case Iop_CasCmpNE32: return Acc_NZ;
2336	 case Iop_CmpLT32S: return Acc_L;
2337	 case Iop_CmpLT32U: return Acc_B;
2338	 case Iop_CmpLE32S: return Acc_LE;
2339         case Iop_CmpLE32U: return Acc_BE;
2340         default: vpanic("iselCondCode(amd64): CmpXX32");
2341      }
2342   }
2343
2344   ppIRExpr(e);
2345   vpanic("iselCondCode(amd64)");
2346}
2347
2348
2349/*---------------------------------------------------------*/
2350/*--- ISEL: Integer expressions (128 bit)               ---*/
2351/*---------------------------------------------------------*/
2352
2353/* Compute a 128-bit value into a register pair, which is returned as
2354   the first two parameters.  As with iselIntExpr_R, these may be
2355   either real or virtual regs; in any case they must not be changed
2356   by subsequent code emitted by the caller.  */
2357
2358static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2359                             ISelEnv* env, IRExpr* e )
2360{
2361   iselInt128Expr_wrk(rHi, rLo, env, e);
2362#  if 0
2363   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2364#  endif
2365   vassert(hregClass(*rHi) == HRcInt64);
2366   vassert(hregIsVirtual(*rHi));
2367   vassert(hregClass(*rLo) == HRcInt64);
2368   vassert(hregIsVirtual(*rLo));
2369}
2370
2371/* DO NOT CALL THIS DIRECTLY ! */
2372static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2373                                 ISelEnv* env, IRExpr* e )
2374{
2375//..    HWord fn = 0; /* helper fn for most SIMD64 stuff */
2376   vassert(e);
2377   vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2378
2379//..    /* 64-bit literal */
2380//..    if (e->tag == Iex_Const) {
2381//..       ULong w64 = e->Iex.Const.con->Ico.U64;
2382//..       UInt  wHi = ((UInt)(w64 >> 32)) & 0xFFFFFFFF;
2383//..       UInt  wLo = ((UInt)w64) & 0xFFFFFFFF;
2384//..       HReg  tLo = newVRegI(env);
2385//..       HReg  tHi = newVRegI(env);
2386//..       vassert(e->Iex.Const.con->tag == Ico_U64);
2387//..       addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
2388//..       addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
2389//..       *rHi = tHi;
2390//..       *rLo = tLo;
2391//..       return;
2392//..    }
2393
2394   /* read 128-bit IRTemp */
2395   if (e->tag == Iex_RdTmp) {
2396      lookupIRTemp128( rHi, rLo, env, e->Iex.RdTmp.tmp);
2397      return;
2398   }
2399
2400//..    /* 64-bit load */
2401//..    if (e->tag == Iex_LDle) {
2402//..       HReg     tLo, tHi;
2403//..       X86AMode *am0, *am4;
2404//..       vassert(e->Iex.LDle.ty == Ity_I64);
2405//..       tLo = newVRegI(env);
2406//..       tHi = newVRegI(env);
2407//..       am0 = iselIntExpr_AMode(env, e->Iex.LDle.addr);
2408//..       am4 = advance4(am0);
2409//..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
2410//..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2411//..       *rHi = tHi;
2412//..       *rLo = tLo;
2413//..       return;
2414//..    }
2415//..
2416//..    /* 64-bit GET */
2417//..    if (e->tag == Iex_Get) {
2418//..       X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
2419//..       X86AMode* am4 = advance4(am);
2420//..       HReg tLo = newVRegI(env);
2421//..       HReg tHi = newVRegI(env);
2422//..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2423//..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2424//..       *rHi = tHi;
2425//..       *rLo = tLo;
2426//..       return;
2427//..    }
2428//..
2429//..    /* 64-bit GETI */
2430//..    if (e->tag == Iex_GetI) {
2431//..       X86AMode* am
2432//..          = genGuestArrayOffset( env, e->Iex.GetI.descr,
2433//..                                      e->Iex.GetI.ix, e->Iex.GetI.bias );
2434//..       X86AMode* am4 = advance4(am);
2435//..       HReg tLo = newVRegI(env);
2436//..       HReg tHi = newVRegI(env);
2437//..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2438//..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2439//..       *rHi = tHi;
2440//..       *rLo = tLo;
2441//..       return;
2442//..    }
2443//..
2444//..    /* 64-bit Mux0X */
2445//..    if (e->tag == Iex_Mux0X) {
2446//..       HReg e0Lo, e0Hi, eXLo, eXHi, r8;
2447//..       HReg tLo = newVRegI(env);
2448//..       HReg tHi = newVRegI(env);
2449//..       iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
2450//..       iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
2451//..       addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
2452//..       addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
2453//..       r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
2454//..       addInstr(env, X86Instr_Test32(X86RI_Imm(0xFF), X86RM_Reg(r8)));
2455//..       /* This assumes the first cmov32 doesn't trash the condition
2456//..          codes, so they are still available for the second cmov32 */
2457//..       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
2458//..       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
2459//..       *rHi = tHi;
2460//..       *rLo = tLo;
2461//..       return;
2462//..    }
2463
2464   /* --------- BINARY ops --------- */
2465   if (e->tag == Iex_Binop) {
2466      switch (e->Iex.Binop.op) {
2467         /* 64 x 64 -> 128 multiply */
2468         case Iop_MullU64:
2469         case Iop_MullS64: {
2470            /* get one operand into %rax, and the other into a R/M.
2471               Need to make an educated guess about which is better in
2472               which. */
2473            HReg     tLo    = newVRegI(env);
2474            HReg     tHi    = newVRegI(env);
2475            Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
2476            AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2477            HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2478            addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2479            addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2480            /* Result is now in RDX:RAX.  Tell the caller. */
2481            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2482            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2483            *rHi = tHi;
2484            *rLo = tLo;
2485            return;
2486         }
2487
2488         /* 128 x 64 -> (64(rem),64(div)) division */
2489         case Iop_DivModU128to64:
2490         case Iop_DivModS128to64: {
2491            /* Get the 128-bit operand into rdx:rax, and the other into
2492               any old R/M. */
2493            HReg sHi, sLo;
2494            HReg     tLo     = newVRegI(env);
2495            HReg     tHi     = newVRegI(env);
2496            Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2497            AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2498            iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2499            addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2500            addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2501            addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2502            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2503            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2504            *rHi = tHi;
2505            *rLo = tLo;
2506            return;
2507         }
2508
2509         /* 64HLto128(e1,e2) */
2510         case Iop_64HLto128:
2511            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2512            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2513            return;
2514
2515//..          /* Or64/And64/Xor64 */
2516//..          case Iop_Or64:
2517//..          case Iop_And64:
2518//..          case Iop_Xor64: {
2519//..             HReg xLo, xHi, yLo, yHi;
2520//..             HReg tLo = newVRegI(env);
2521//..             HReg tHi = newVRegI(env);
2522//..             X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
2523//..                           : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
2524//..                           : Xalu_XOR;
2525//..             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2526//..             addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2527//..             addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2528//..             iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2529//..             addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
2530//..             addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
2531//..             *rHi = tHi;
2532//..             *rLo = tLo;
2533//..             return;
2534//..          }
2535//..
2536//..          /* Add64/Sub64 */
2537//..          case Iop_Add64:
2538//..          case Iop_Sub64: {
2539//..             HReg xLo, xHi, yLo, yHi;
2540//..             HReg tLo = newVRegI(env);
2541//..             HReg tHi = newVRegI(env);
2542//..             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2543//..             addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2544//..             addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2545//..             iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2546//..             if (e->Iex.Binop.op==Iop_Add64) {
2547//..                addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
2548//..                addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
2549//..             } else {
2550//..                addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2551//..                addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2552//..             }
2553//..             *rHi = tHi;
2554//..             *rLo = tLo;
2555//..             return;
2556//..          }
2557//..
2558//..          /* 32HLto64(e1,e2) */
2559//..          case Iop_32HLto64:
2560//..             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2561//..             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2562//..             return;
2563//..
2564//..          /* 64-bit shifts */
2565//..          case Iop_Shl64: {
2566//..             /* We use the same ingenious scheme as gcc.  Put the value
2567//..                to be shifted into %hi:%lo, and the shift amount into
2568//..                %cl.  Then (dsts on right, a la ATT syntax):
2569//..
2570//..                shldl %cl, %lo, %hi   -- make %hi be right for the
2571//..                                      -- shift amt %cl % 32
2572//..                shll  %cl, %lo        -- make %lo be right for the
2573//..                                      -- shift amt %cl % 32
2574//..
2575//..                Now, if (shift amount % 64) is in the range 32 .. 63,
2576//..                we have to do a fixup, which puts the result low half
2577//..                into the result high half, and zeroes the low half:
2578//..
2579//..                testl $32, %ecx
2580//..
2581//..                cmovnz %lo, %hi
2582//..                movl $0, %tmp         -- sigh; need yet another reg
2583//..                cmovnz %tmp, %lo
2584//..             */
2585//..             HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2586//..             tLo = newVRegI(env);
2587//..             tHi = newVRegI(env);
2588//..             tTemp = newVRegI(env);
2589//..             rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2590//..             iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2591//..             addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2592//..             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2593//..             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2594//..             /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2595//..                and those regs are legitimately modifiable. */
2596//..             addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
2597//..             addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, X86RM_Reg(tLo)));
2598//..             addInstr(env, X86Instr_Test32(X86RI_Imm(32),
2599//..                           X86RM_Reg(hregX86_ECX())));
2600//..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
2601//..             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2602//..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
2603//..             *rHi = tHi;
2604//..             *rLo = tLo;
2605//..             return;
2606//..          }
2607//..
2608//..          case Iop_Shr64: {
2609//..             /* We use the same ingenious scheme as gcc.  Put the value
2610//..                to be shifted into %hi:%lo, and the shift amount into
2611//..                %cl.  Then:
2612//..
2613//..                shrdl %cl, %hi, %lo   -- make %lo be right for the
2614//..                                      -- shift amt %cl % 32
2615//..                shrl  %cl, %hi        -- make %hi be right for the
2616//..                                      -- shift amt %cl % 32
2617//..
2618//..                Now, if (shift amount % 64) is in the range 32 .. 63,
2619//..                we have to do a fixup, which puts the result high half
2620//..                into the result low half, and zeroes the high half:
2621//..
2622//..                testl $32, %ecx
2623//..
2624//..                cmovnz %hi, %lo
2625//..                movl $0, %tmp         -- sigh; need yet another reg
2626//..                cmovnz %tmp, %hi
2627//..             */
2628//..             HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2629//..             tLo = newVRegI(env);
2630//..             tHi = newVRegI(env);
2631//..             tTemp = newVRegI(env);
2632//..             rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2633//..             iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2634//..             addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2635//..             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2636//..             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2637//..             /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
2638//..                and those regs are legitimately modifiable. */
2639//..             addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
2640//..             addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, X86RM_Reg(tHi)));
2641//..             addInstr(env, X86Instr_Test32(X86RI_Imm(32),
2642//..                           X86RM_Reg(hregX86_ECX())));
2643//..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
2644//..             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2645//..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
2646//..             *rHi = tHi;
2647//..             *rLo = tLo;
2648//..             return;
2649//..          }
2650//..
2651//..          /* F64 -> I64 */
2652//..          /* Sigh, this is an almost exact copy of the F64 -> I32/I16
2653//..             case.  Unfortunately I see no easy way to avoid the
2654//..             duplication. */
2655//..          case Iop_F64toI64: {
2656//..             HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
2657//..             HReg tLo = newVRegI(env);
2658//..             HReg tHi = newVRegI(env);
2659//..
2660//..             /* Used several times ... */
2661//..             /* Careful ... this sharing is only safe because
2662//.. 	       zero_esp/four_esp do not hold any registers which the
2663//.. 	       register allocator could attempt to swizzle later. */
2664//..             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2665//..             X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2666//..
2667//..             /* rf now holds the value to be converted, and rrm holds
2668//..                the rounding mode value, encoded as per the
2669//..                IRRoundingMode enum.  The first thing to do is set the
2670//..                FPU's rounding mode accordingly. */
2671//..
2672//..             /* Create a space for the format conversion. */
2673//..             /* subl $8, %esp */
2674//..             sub_from_esp(env, 8);
2675//..
2676//..             /* Set host rounding mode */
2677//..             set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2678//..
2679//..             /* gistll %rf, 0(%esp) */
2680//..             addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
2681//..
2682//..             /* movl 0(%esp), %dstLo */
2683//..             /* movl 4(%esp), %dstHi */
2684//..             addInstr(env, X86Instr_Alu32R(
2685//..                              Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2686//..             addInstr(env, X86Instr_Alu32R(
2687//..                              Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2688//..
2689//..             /* Restore default FPU rounding. */
2690//..             set_FPU_rounding_default( env );
2691//..
2692//..             /* addl $8, %esp */
2693//..             add_to_esp(env, 8);
2694//..
2695//..             *rHi = tHi;
2696//..             *rLo = tLo;
2697//..             return;
2698//..          }
2699//..
2700         default:
2701            break;
2702      }
2703   } /* if (e->tag == Iex_Binop) */
2704
2705
2706//..    /* --------- UNARY ops --------- */
2707//..    if (e->tag == Iex_Unop) {
2708//..       switch (e->Iex.Unop.op) {
2709//..
2710//..          /* 32Sto64(e) */
2711//..          case Iop_32Sto64: {
2712//..             HReg tLo = newVRegI(env);
2713//..             HReg tHi = newVRegI(env);
2714//..             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2715//..             addInstr(env, mk_iMOVsd_RR(src,tHi));
2716//..             addInstr(env, mk_iMOVsd_RR(src,tLo));
2717//..             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tHi)));
2718//..             *rHi = tHi;
2719//..             *rLo = tLo;
2720//..             return;
2721//..          }
2722//..
2723//..          /* 32Uto64(e) */
2724//..          case Iop_32Uto64: {
2725//..             HReg tLo = newVRegI(env);
2726//..             HReg tHi = newVRegI(env);
2727//..             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2728//..             addInstr(env, mk_iMOVsd_RR(src,tLo));
2729//..             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2730//..             *rHi = tHi;
2731//..             *rLo = tLo;
2732//..             return;
2733//..          }
2734
2735//..          /* could do better than this, but for now ... */
2736//..          case Iop_1Sto64: {
2737//..             HReg tLo = newVRegI(env);
2738//..             HReg tHi = newVRegI(env);
2739//..             X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
2740//..             addInstr(env, X86Instr_Set32(cond,tLo));
2741//..             addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, X86RM_Reg(tLo)));
2742//..             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tLo)));
2743//..             addInstr(env, mk_iMOVsd_RR(tLo, tHi));
2744//..             *rHi = tHi;
2745//..             *rLo = tLo;
2746//..             return;
2747//..          }
2748//..
2749//..          /* Not64(e) */
2750//..          case Iop_Not64: {
2751//..             HReg tLo = newVRegI(env);
2752//..             HReg tHi = newVRegI(env);
2753//..             HReg sHi, sLo;
2754//..             iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
2755//..             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2756//..             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2757//..             addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tHi)));
2758//..             addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tLo)));
2759//..             *rHi = tHi;
2760//..             *rLo = tLo;
2761//..             return;
2762//..          }
2763//..
2764//..          default:
2765//..             break;
2766//..       }
2767//..    } /* if (e->tag == Iex_Unop) */
2768//..
2769//..
2770//..    /* --------- CCALL --------- */
2771//..    if (e->tag == Iex_CCall) {
2772//..       HReg tLo = newVRegI(env);
2773//..       HReg tHi = newVRegI(env);
2774//..
2775//..       /* Marshal args, do the call, clear stack. */
2776//..       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
2777//..
2778//..       addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2779//..       addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2780//..       *rHi = tHi;
2781//..       *rLo = tLo;
2782//..       return;
2783//..    }
2784
2785   ppIRExpr(e);
2786   vpanic("iselInt128Expr");
2787}
2788
2789
2790/*---------------------------------------------------------*/
2791/*--- ISEL: Floating point expressions (32 bit)         ---*/
2792/*---------------------------------------------------------*/
2793
2794/* Nothing interesting here; really just wrappers for
2795   64-bit stuff. */
2796
2797static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
2798{
2799   HReg r = iselFltExpr_wrk( env, e );
2800#  if 0
2801   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2802#  endif
2803   vassert(hregClass(r) == HRcVec128);
2804   vassert(hregIsVirtual(r));
2805   return r;
2806}
2807
2808/* DO NOT CALL THIS DIRECTLY */
2809static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
2810{
2811   IRType ty = typeOfIRExpr(env->type_env,e);
2812   vassert(ty == Ity_F32);
2813
2814   if (e->tag == Iex_RdTmp) {
2815      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2816   }
2817
2818   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2819      AMD64AMode* am;
2820      HReg res = newVRegV(env);
2821      vassert(e->Iex.Load.ty == Ity_F32);
2822      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2823      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2824      return res;
2825   }
2826
2827   if (e->tag == Iex_Binop
2828       && e->Iex.Binop.op == Iop_F64toF32) {
2829      /* Although the result is still held in a standard SSE register,
2830         we need to round it to reflect the loss of accuracy/range
2831         entailed in casting it to a 32-bit float. */
2832      HReg dst = newVRegV(env);
2833      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2834      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2835      addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2836      set_SSE_rounding_default( env );
2837      return dst;
2838   }
2839
2840   if (e->tag == Iex_Get) {
2841      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2842                                       hregAMD64_RBP() );
2843      HReg res = newVRegV(env);
2844      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2845      return res;
2846   }
2847
2848   if (e->tag == Iex_Unop
2849       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2850       /* Given an I32, produce an IEEE754 float with the same bit
2851          pattern. */
2852       HReg        dst    = newVRegV(env);
2853       HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
2854       AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2855       addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2856       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2857       return dst;
2858   }
2859
2860   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2861      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2862      HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
2863      HReg        dst    = newVRegV(env);
2864
2865      /* rf now holds the value to be rounded.  The first thing to do
2866         is set the FPU's rounding mode accordingly. */
2867
2868      /* Set host x87 rounding mode */
2869      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2870
2871      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2872      addInstr(env, AMD64Instr_A87Free(1));
2873      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2874      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2875      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2876      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2877
2878      /* Restore default x87 rounding. */
2879      set_FPU_rounding_default( env );
2880
2881      return dst;
2882   }
2883
2884   ppIRExpr(e);
2885   vpanic("iselFltExpr_wrk");
2886}
2887
2888
2889/*---------------------------------------------------------*/
2890/*--- ISEL: Floating point expressions (64 bit)         ---*/
2891/*---------------------------------------------------------*/
2892
2893/* Compute a 64-bit floating point value into the lower half of an xmm
2894   register, the identity of which is returned.  As with
2895   iselIntExpr_R, the returned reg will be virtual, and it must not be
2896   changed by subsequent code emitted by the caller.
2897*/
2898
2899/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2900
2901    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2902    ----                  ---------   -----------   -----------
2903    signalling NaN        u           2047 (max)    .0uuuuu---u
2904                                                    (with at least
2905                                                     one 1 bit)
2906    quiet NaN             u           2047 (max)    .1uuuuu---u
2907
2908    negative infinity     1           2047 (max)    .000000---0
2909
2910    positive infinity     0           2047 (max)    .000000---0
2911
2912    negative zero         1           0             .000000---0
2913
2914    positive zero         0           0             .000000---0
2915*/
2916
2917static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
2918{
2919   HReg r = iselDblExpr_wrk( env, e );
2920#  if 0
2921   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2922#  endif
2923   vassert(hregClass(r) == HRcVec128);
2924   vassert(hregIsVirtual(r));
2925   return r;
2926}
2927
2928/* DO NOT CALL THIS DIRECTLY */
2929static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
2930{
2931   IRType ty = typeOfIRExpr(env->type_env,e);
2932   vassert(e);
2933   vassert(ty == Ity_F64);
2934
2935   if (e->tag == Iex_RdTmp) {
2936      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2937   }
2938
2939   if (e->tag == Iex_Const) {
2940      union { ULong u64; Double f64; } u;
2941      HReg res = newVRegV(env);
2942      HReg tmp = newVRegI(env);
2943      vassert(sizeof(u) == 8);
2944      vassert(sizeof(u.u64) == 8);
2945      vassert(sizeof(u.f64) == 8);
2946
2947      if (e->Iex.Const.con->tag == Ico_F64) {
2948         u.f64 = e->Iex.Const.con->Ico.F64;
2949      }
2950      else if (e->Iex.Const.con->tag == Ico_F64i) {
2951         u.u64 = e->Iex.Const.con->Ico.F64i;
2952      }
2953      else
2954         vpanic("iselDblExpr(amd64): const");
2955
2956      addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2957      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2958      addInstr(env, AMD64Instr_SseLdSt(
2959                       True/*load*/, 8, res,
2960                       AMD64AMode_IR(0, hregAMD64_RSP())
2961              ));
2962      add_to_rsp(env, 8);
2963      return res;
2964   }
2965
2966   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2967      AMD64AMode* am;
2968      HReg res = newVRegV(env);
2969      vassert(e->Iex.Load.ty == Ity_F64);
2970      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2971      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2972      return res;
2973   }
2974
2975   if (e->tag == Iex_Get) {
2976      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2977                                      hregAMD64_RBP() );
2978      HReg res = newVRegV(env);
2979      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2980      return res;
2981   }
2982
2983   if (e->tag == Iex_GetI) {
2984      AMD64AMode* am
2985         = genGuestArrayOffset(
2986              env, e->Iex.GetI.descr,
2987                   e->Iex.GetI.ix, e->Iex.GetI.bias );
2988      HReg res = newVRegV(env);
2989      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2990      return res;
2991   }
2992
2993   if (e->tag == Iex_Triop) {
2994      AMD64SseOp op = Asse_INVALID;
2995      switch (e->Iex.Triop.op) {
2996         case Iop_AddF64: op = Asse_ADDF; break;
2997         case Iop_SubF64: op = Asse_SUBF; break;
2998         case Iop_MulF64: op = Asse_MULF; break;
2999         case Iop_DivF64: op = Asse_DIVF; break;
3000         default: break;
3001      }
3002      if (op != Asse_INVALID) {
3003         HReg dst  = newVRegV(env);
3004         HReg argL = iselDblExpr(env, e->Iex.Triop.arg2);
3005         HReg argR = iselDblExpr(env, e->Iex.Triop.arg3);
3006         addInstr(env, mk_vMOVsd_RR(argL, dst));
3007         /* XXXROUNDINGFIXME */
3008         /* set roundingmode here */
3009         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3010         return dst;
3011      }
3012   }
3013
3014   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
3015      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3016      HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
3017      HReg        dst    = newVRegV(env);
3018
3019      /* rf now holds the value to be rounded.  The first thing to do
3020         is set the FPU's rounding mode accordingly. */
3021
3022      /* Set host x87 rounding mode */
3023      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3024
3025      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3026      addInstr(env, AMD64Instr_A87Free(1));
3027      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3028      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
3029      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3030      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3031
3032      /* Restore default x87 rounding. */
3033      set_FPU_rounding_default( env );
3034
3035      return dst;
3036   }
3037
3038   if (e->tag == Iex_Triop
3039       && (e->Iex.Triop.op == Iop_ScaleF64
3040           || e->Iex.Triop.op == Iop_AtanF64
3041           || e->Iex.Triop.op == Iop_Yl2xF64
3042           || e->Iex.Triop.op == Iop_Yl2xp1F64
3043           || e->Iex.Triop.op == Iop_PRemF64
3044           || e->Iex.Triop.op == Iop_PRem1F64)
3045      ) {
3046      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3047      HReg        arg1   = iselDblExpr(env, e->Iex.Triop.arg2);
3048      HReg        arg2   = iselDblExpr(env, e->Iex.Triop.arg3);
3049      HReg        dst    = newVRegV(env);
3050      Bool     arg2first = toBool(e->Iex.Triop.op == Iop_ScaleF64
3051                                  || e->Iex.Triop.op == Iop_PRemF64
3052                                  || e->Iex.Triop.op == Iop_PRem1F64);
3053      addInstr(env, AMD64Instr_A87Free(2));
3054
3055      /* one arg -> top of x87 stack */
3056      addInstr(env, AMD64Instr_SseLdSt(
3057                       False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3058      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3059
3060      /* other arg -> top of x87 stack */
3061      addInstr(env, AMD64Instr_SseLdSt(
3062                       False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3063      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3064
3065      /* do it */
3066      /* XXXROUNDINGFIXME */
3067      /* set roundingmode here */
3068      switch (e->Iex.Triop.op) {
3069         case Iop_ScaleF64:
3070            addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3071            break;
3072         case Iop_AtanF64:
3073            addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3074            break;
3075         case Iop_Yl2xF64:
3076            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3077            break;
3078         case Iop_Yl2xp1F64:
3079            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3080            break;
3081         case Iop_PRemF64:
3082            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3083            break;
3084         case Iop_PRem1F64:
3085            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3086            break;
3087         default:
3088            vassert(0);
3089      }
3090
3091      /* save result */
3092      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3093      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3094      return dst;
3095   }
3096
3097   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3098      HReg dst = newVRegV(env);
3099      HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3100      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3101      addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3102      set_SSE_rounding_default( env );
3103      return dst;
3104   }
3105
3106   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3107      HReg dst = newVRegV(env);
3108      HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3109      set_SSE_rounding_default( env );
3110      addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3111      return dst;
3112   }
3113
3114   if (e->tag == Iex_Unop
3115       && (e->Iex.Unop.op == Iop_NegF64
3116           || e->Iex.Unop.op == Iop_AbsF64)) {
3117      /* Sigh ... very rough code.  Could do much better. */
3118      /* Get the 128-bit literal 00---0 10---0 into a register
3119         and xor/nand it with the value to be negated. */
3120      HReg r1  = newVRegI(env);
3121      HReg dst = newVRegV(env);
3122      HReg tmp = newVRegV(env);
3123      HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3124      AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3125      addInstr(env, mk_vMOVsd_RR(src,tmp));
3126      addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3127      addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3128      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3129      addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3130
3131      if (e->Iex.Unop.op == Iop_NegF64)
3132         addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3133      else
3134         addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3135
3136      add_to_rsp(env, 16);
3137      return dst;
3138   }
3139
3140   if (e->tag == Iex_Binop) {
3141      A87FpOp fpop = Afp_INVALID;
3142      switch (e->Iex.Binop.op) {
3143         case Iop_SqrtF64: fpop = Afp_SQRT; break;
3144         case Iop_SinF64:  fpop = Afp_SIN;  break;
3145         case Iop_CosF64:  fpop = Afp_COS;  break;
3146         case Iop_TanF64:  fpop = Afp_TAN;  break;
3147         case Iop_2xm1F64: fpop = Afp_2XM1; break;
3148         default: break;
3149      }
3150      if (fpop != Afp_INVALID) {
3151         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3152         HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
3153         HReg        dst    = newVRegV(env);
3154         Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3155         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3156         addInstr(env, AMD64Instr_A87Free(nNeeded));
3157         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3158         /* XXXROUNDINGFIXME */
3159         /* set roundingmode here */
3160         addInstr(env, AMD64Instr_A87FpOp(fpop));
3161         if (e->Iex.Binop.op==Iop_TanF64) {
3162            /* get rid of the extra 1.0 that fptan pushes */
3163            addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3164         }
3165         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3166         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3167         return dst;
3168      }
3169   }
3170
3171   if (e->tag == Iex_Unop) {
3172      switch (e->Iex.Unop.op) {
3173//..          case Iop_I32toF64: {
3174//..             HReg dst = newVRegF(env);
3175//..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3176//..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3177//..             set_FPU_rounding_default(env);
3178//..             addInstr(env, X86Instr_FpLdStI(
3179//..                              True/*load*/, 4, dst,
3180//..                              X86AMode_IR(0, hregX86_ESP())));
3181//..             add_to_esp(env, 4);
3182//..             return dst;
3183//..          }
3184         case Iop_ReinterpI64asF64: {
3185            /* Given an I64, produce an IEEE754 double with the same
3186               bit pattern. */
3187            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3188            HReg        dst    = newVRegV(env);
3189            AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
3190            /* paranoia */
3191            set_SSE_rounding_default(env);
3192            addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3193            addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3194            return dst;
3195         }
3196         case Iop_F32toF64: {
3197            HReg f32;
3198            HReg f64 = newVRegV(env);
3199            /* this shouldn't be necessary, but be paranoid ... */
3200            set_SSE_rounding_default(env);
3201            f32 = iselFltExpr(env, e->Iex.Unop.arg);
3202            addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3203            return f64;
3204         }
3205         default:
3206            break;
3207      }
3208   }
3209
3210   /* --------- MULTIPLEX --------- */
3211   if (e->tag == Iex_Mux0X) {
3212      HReg r8, rX, r0, dst;
3213      vassert(ty == Ity_F64);
3214      vassert(typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8);
3215      r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
3216      rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
3217      r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
3218      dst = newVRegV(env);
3219      addInstr(env, mk_vMOVsd_RR(rX,dst));
3220      addInstr(env, AMD64Instr_Test64(0xFF, r8));
3221      addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
3222      return dst;
3223   }
3224
3225   ppIRExpr(e);
3226   vpanic("iselDblExpr_wrk");
3227}
3228
3229
3230/*---------------------------------------------------------*/
3231/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3232/*---------------------------------------------------------*/
3233
3234static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
3235{
3236   HReg r = iselVecExpr_wrk( env, e );
3237#  if 0
3238   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3239#  endif
3240   vassert(hregClass(r) == HRcVec128);
3241   vassert(hregIsVirtual(r));
3242   return r;
3243}
3244
3245
3246/* DO NOT CALL THIS DIRECTLY */
3247static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
3248{
3249   HWord      fn = 0; /* address of helper fn, if required */
3250   Bool       arg1isEReg = False;
3251   AMD64SseOp op = Asse_INVALID;
3252   IRType     ty = typeOfIRExpr(env->type_env,e);
3253   vassert(e);
3254   vassert(ty == Ity_V128);
3255
3256   if (e->tag == Iex_RdTmp) {
3257      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3258   }
3259
3260   if (e->tag == Iex_Get) {
3261      HReg dst = newVRegV(env);
3262      addInstr(env, AMD64Instr_SseLdSt(
3263                       True/*load*/,
3264                       16,
3265                       dst,
3266                       AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3267                    )
3268              );
3269      return dst;
3270   }
3271
3272   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3273      HReg        dst = newVRegV(env);
3274      AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3275      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3276      return dst;
3277   }
3278
3279   if (e->tag == Iex_Const) {
3280      HReg dst = newVRegV(env);
3281      vassert(e->Iex.Const.con->tag == Ico_V128);
3282      switch (e->Iex.Const.con->Ico.V128) {
3283         case 0x0000:
3284            dst = generate_zeroes_V128(env);
3285            break;
3286         case 0xFFFF:
3287            dst = generate_ones_V128(env);
3288            break;
3289         default: {
3290            AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3291            /* do push_uimm64 twice, first time for the high-order half. */
3292            push_uimm64(env, bitmask8_to_bytemask64(
3293                                (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3294                       ));
3295            push_uimm64(env, bitmask8_to_bytemask64(
3296                                (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3297                       ));
3298            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3299            add_to_rsp(env, 16);
3300            break;
3301         }
3302      }
3303      return dst;
3304   }
3305
3306   if (e->tag == Iex_Unop) {
3307   switch (e->Iex.Unop.op) {
3308
3309      case Iop_NotV128: {
3310         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3311         return do_sse_NotV128(env, arg);
3312      }
3313
3314      case Iop_CmpNEZ64x2: {
3315         /* We can use SSE2 instructions for this. */
3316         /* Ideally, we want to do a 64Ix2 comparison against zero of
3317            the operand.  Problem is no such insn exists.  Solution
3318            therefore is to do a 32Ix4 comparison instead, and bitwise-
3319            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3320            let the not'd result of this initial comparison be a:b:c:d.
3321            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3322            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3323            giving the required result.
3324
3325            The required selection sequence is 2,3,0,1, which
3326            according to Intel's documentation means the pshufd
3327            literal value is 0xB1, that is,
3328            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3329         */
3330         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3331         HReg tmp  = generate_zeroes_V128(env);
3332         HReg dst  = newVRegV(env);
3333         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3334         tmp = do_sse_NotV128(env, tmp);
3335         addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3336         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3337         return dst;
3338      }
3339
3340      case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3341      case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3342      case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3343      do_CmpNEZ_vector:
3344      {
3345         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3346         HReg tmp  = newVRegV(env);
3347         HReg zero = generate_zeroes_V128(env);
3348         HReg dst;
3349         addInstr(env, mk_vMOVsd_RR(arg, tmp));
3350         addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3351         dst = do_sse_NotV128(env, tmp);
3352         return dst;
3353      }
3354
3355      case Iop_Recip32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
3356      case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3357      case Iop_Sqrt32Fx4:  op = Asse_SQRTF;  goto do_32Fx4_unary;
3358      do_32Fx4_unary:
3359      {
3360         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3361         HReg dst = newVRegV(env);
3362         addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3363         return dst;
3364      }
3365
3366//..       case Iop_Recip64Fx2: op = Xsse_RCPF;   goto do_64Fx2_unary;
3367//..       case Iop_RSqrt64Fx2: op = Asse_RSQRTF; goto do_64Fx2_unary;
3368      case Iop_Sqrt64Fx2:  op = Asse_SQRTF;  goto do_64Fx2_unary;
3369      do_64Fx2_unary:
3370      {
3371         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3372         HReg dst = newVRegV(env);
3373         addInstr(env, AMD64Instr_Sse64Fx2(op, arg, dst));
3374         return dst;
3375      }
3376
3377      case Iop_Recip32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
3378      case Iop_RSqrt32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3379      case Iop_Sqrt32F0x4:  op = Asse_SQRTF;  goto do_32F0x4_unary;
3380      do_32F0x4_unary:
3381      {
3382         /* A bit subtle.  We have to copy the arg to the result
3383            register first, because actually doing the SSE scalar insn
3384            leaves the upper 3/4 of the destination register
3385            unchanged.  Whereas the required semantics of these
3386            primops is that the upper 3/4 is simply copied in from the
3387            argument. */
3388         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3389         HReg dst = newVRegV(env);
3390         addInstr(env, mk_vMOVsd_RR(arg, dst));
3391         addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3392         return dst;
3393      }
3394
3395//..       case Iop_Recip64F0x2: op = Xsse_RCPF;   goto do_64F0x2_unary;
3396//..       case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
3397      case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
3398      do_64F0x2_unary:
3399      {
3400         /* A bit subtle.  We have to copy the arg to the result
3401            register first, because actually doing the SSE scalar insn
3402            leaves the upper half of the destination register
3403            unchanged.  Whereas the required semantics of these
3404            primops is that the upper half is simply copied in from the
3405            argument. */
3406         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3407         HReg dst = newVRegV(env);
3408         addInstr(env, mk_vMOVsd_RR(arg, dst));
3409         addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3410         return dst;
3411      }
3412
3413      case Iop_32UtoV128: {
3414         HReg        dst     = newVRegV(env);
3415         AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3416         AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
3417         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3418         addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3419         return dst;
3420      }
3421
3422      case Iop_64UtoV128: {
3423         HReg        dst  = newVRegV(env);
3424         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3425         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3426         addInstr(env, AMD64Instr_Push(rmi));
3427         addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3428         add_to_rsp(env, 8);
3429         return dst;
3430      }
3431
3432      default:
3433         break;
3434   } /* switch (e->Iex.Unop.op) */
3435   } /* if (e->tag == Iex_Unop) */
3436
3437   if (e->tag == Iex_Binop) {
3438   switch (e->Iex.Binop.op) {
3439
3440      case Iop_SetV128lo64: {
3441         HReg dst  = newVRegV(env);
3442         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3443         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3444         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3445         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3446         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3447         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3448         return dst;
3449      }
3450
3451      case Iop_SetV128lo32: {
3452         HReg dst  = newVRegV(env);
3453         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3454         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3455         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3456         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3457         addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3458         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3459         return dst;
3460      }
3461
3462      case Iop_64HLtoV128: {
3463         AMD64AMode* rsp = AMD64AMode_IR(0, hregAMD64_RSP());
3464         HReg        dst = newVRegV(env);
3465         /* do this via the stack (easy, convenient, etc) */
3466         addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg1)));
3467         addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg2)));
3468         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp));
3469         add_to_rsp(env, 16);
3470         return dst;
3471      }
3472
3473      case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3474      case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3475      case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3476      case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3477      case Iop_Add32Fx4:   op = Asse_ADDF;   goto do_32Fx4;
3478      case Iop_Div32Fx4:   op = Asse_DIVF;   goto do_32Fx4;
3479      case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
3480      case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
3481      case Iop_Mul32Fx4:   op = Asse_MULF;   goto do_32Fx4;
3482      case Iop_Sub32Fx4:   op = Asse_SUBF;   goto do_32Fx4;
3483      do_32Fx4:
3484      {
3485         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3486         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3487         HReg dst = newVRegV(env);
3488         addInstr(env, mk_vMOVsd_RR(argL, dst));
3489         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3490         return dst;
3491      }
3492
3493      case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3494      case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3495      case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3496      case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3497      case Iop_Add64Fx2:   op = Asse_ADDF;   goto do_64Fx2;
3498      case Iop_Div64Fx2:   op = Asse_DIVF;   goto do_64Fx2;
3499      case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
3500      case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
3501      case Iop_Mul64Fx2:   op = Asse_MULF;   goto do_64Fx2;
3502      case Iop_Sub64Fx2:   op = Asse_SUBF;   goto do_64Fx2;
3503      do_64Fx2:
3504      {
3505         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3506         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3507         HReg dst = newVRegV(env);
3508         addInstr(env, mk_vMOVsd_RR(argL, dst));
3509         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3510         return dst;
3511      }
3512
3513      case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3514      case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3515      case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3516      case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3517      case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
3518      case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
3519      case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
3520      case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
3521      case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
3522      case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
3523      do_32F0x4: {
3524         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3525         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3526         HReg dst = newVRegV(env);
3527         addInstr(env, mk_vMOVsd_RR(argL, dst));
3528         addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3529         return dst;
3530      }
3531
3532      case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3533      case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3534      case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3535      case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3536      case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
3537      case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
3538      case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
3539      case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
3540      case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
3541      case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
3542      do_64F0x2: {
3543         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3544         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3545         HReg dst = newVRegV(env);
3546         addInstr(env, mk_vMOVsd_RR(argL, dst));
3547         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3548         return dst;
3549      }
3550
3551      case Iop_QNarrowBin32Sto16Sx8:
3552         op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3553      case Iop_QNarrowBin16Sto8Sx16:
3554         op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3555      case Iop_QNarrowBin16Sto8Ux16:
3556         op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3557
3558      case Iop_InterleaveHI8x16:
3559         op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3560      case Iop_InterleaveHI16x8:
3561         op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3562      case Iop_InterleaveHI32x4:
3563         op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3564      case Iop_InterleaveHI64x2:
3565         op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3566
3567      case Iop_InterleaveLO8x16:
3568         op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3569      case Iop_InterleaveLO16x8:
3570         op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3571      case Iop_InterleaveLO32x4:
3572         op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3573      case Iop_InterleaveLO64x2:
3574         op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3575
3576      case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
3577      case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
3578      case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
3579      case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
3580      case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
3581      case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
3582      case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
3583      case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
3584      case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
3585      case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
3586      case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
3587      case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
3588      case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
3589      case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
3590      case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
3591      case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
3592      case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
3593      case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3594      case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3595      case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
3596      case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
3597      case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
3598      case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
3599      case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3600      case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3601      case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
3602      case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
3603      case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
3604      case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
3605      case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
3606      case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
3607      case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
3608      case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
3609      case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
3610      do_SseReRg: {
3611         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3612         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3613         HReg dst = newVRegV(env);
3614         if (arg1isEReg) {
3615            addInstr(env, mk_vMOVsd_RR(arg2, dst));
3616            addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3617         } else {
3618            addInstr(env, mk_vMOVsd_RR(arg1, dst));
3619            addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3620         }
3621         return dst;
3622      }
3623
3624      case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
3625      case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
3626      case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
3627      case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
3628      case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
3629      case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
3630      case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
3631      case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
3632      do_SseShift: {
3633         HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
3634         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3635         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3636         HReg        ereg = newVRegV(env);
3637         HReg        dst  = newVRegV(env);
3638         addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3639         addInstr(env, AMD64Instr_Push(rmi));
3640         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3641         addInstr(env, mk_vMOVsd_RR(greg, dst));
3642         addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3643         add_to_rsp(env, 16);
3644         return dst;
3645      }
3646
3647      case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
3648                           goto do_SseAssistedBinary;
3649      case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
3650                           goto do_SseAssistedBinary;
3651      case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
3652                           goto do_SseAssistedBinary;
3653      case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
3654                           goto do_SseAssistedBinary;
3655      case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
3656                           goto do_SseAssistedBinary;
3657      case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
3658                           goto do_SseAssistedBinary;
3659      case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
3660                           goto do_SseAssistedBinary;
3661      case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
3662                           goto do_SseAssistedBinary;
3663      case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
3664                           goto do_SseAssistedBinary;
3665      case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
3666                           goto do_SseAssistedBinary;
3667      case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3668                           goto do_SseAssistedBinary;
3669      case Iop_QNarrowBin32Sto16Ux8:
3670                           fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3671                           goto do_SseAssistedBinary;
3672      case Iop_NarrowBin16to8x16:
3673                           fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3674                           goto do_SseAssistedBinary;
3675      case Iop_NarrowBin32to16x8:
3676                           fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3677                           goto do_SseAssistedBinary;
3678      do_SseAssistedBinary: {
3679         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3680            well. */
3681         vassert(fn != 0);
3682         HReg dst = newVRegV(env);
3683         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3684         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3685         HReg argp = newVRegI(env);
3686         /* subq $112, %rsp         -- make a space*/
3687         sub_from_rsp(env, 112);
3688         /* leaq 48(%rsp), %r_argp  -- point into it */
3689         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3690                                        argp));
3691         /* andq $-16, %r_argp      -- 16-align the pointer */
3692         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3693                                         AMD64RMI_Imm( ~(UInt)15 ),
3694                                         argp));
3695         /* Prepare 3 arg regs:
3696            leaq 0(%r_argp), %rdi
3697            leaq 16(%r_argp), %rsi
3698            leaq 32(%r_argp), %rdx
3699         */
3700         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3701                                        hregAMD64_RDI()));
3702         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3703                                        hregAMD64_RSI()));
3704         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3705                                        hregAMD64_RDX()));
3706         /* Store the two args, at (%rsi) and (%rdx):
3707            movupd  %argL, 0(%rsi)
3708            movupd  %argR, 0(%rdx)
3709         */
3710         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3711                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3712         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3713                                          AMD64AMode_IR(0, hregAMD64_RDX())));
3714         /* call the helper */
3715         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
3716         /* fetch the result from memory, using %r_argp, which the
3717            register allocator will keep alive across the call. */
3718         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3719                                          AMD64AMode_IR(0, argp)));
3720         /* and finally, clear the space */
3721         add_to_rsp(env, 112);
3722         return dst;
3723      }
3724
3725      case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3726                         goto do_SseAssistedVectorAndScalar;
3727      case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3728                         goto do_SseAssistedVectorAndScalar;
3729      do_SseAssistedVectorAndScalar: {
3730         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3731            well. */
3732         vassert(fn != 0);
3733         HReg dst = newVRegV(env);
3734         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3735         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3736         HReg argp = newVRegI(env);
3737         /* subq $112, %rsp         -- make a space*/
3738         sub_from_rsp(env, 112);
3739         /* leaq 48(%rsp), %r_argp  -- point into it */
3740         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3741                                        argp));
3742         /* andq $-16, %r_argp      -- 16-align the pointer */
3743         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3744                                         AMD64RMI_Imm( ~(UInt)15 ),
3745                                         argp));
3746         /* Prepare 2 vector arg regs:
3747            leaq 0(%r_argp), %rdi
3748            leaq 16(%r_argp), %rsi
3749         */
3750         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3751                                        hregAMD64_RDI()));
3752         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3753                                        hregAMD64_RSI()));
3754         /* Store the vector arg, at (%rsi):
3755            movupd  %argL, 0(%rsi)
3756         */
3757         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3758                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3759         /* And get the scalar value into rdx */
3760         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3761
3762         /* call the helper */
3763         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
3764         /* fetch the result from memory, using %r_argp, which the
3765            register allocator will keep alive across the call. */
3766         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3767                                          AMD64AMode_IR(0, argp)));
3768         /* and finally, clear the space */
3769         add_to_rsp(env, 112);
3770         return dst;
3771      }
3772
3773      default:
3774         break;
3775   } /* switch (e->Iex.Binop.op) */
3776   } /* if (e->tag == Iex_Binop) */
3777
3778   if (e->tag == Iex_Mux0X) {
3779      HReg r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
3780      HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
3781      HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
3782      HReg dst = newVRegV(env);
3783      addInstr(env, mk_vMOVsd_RR(rX,dst));
3784      addInstr(env, AMD64Instr_Test64(0xFF, r8));
3785      addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
3786      return dst;
3787   }
3788
3789   //vec_fail:
3790   vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3791              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3792   ppIRExpr(e);
3793   vpanic("iselVecExpr_wrk");
3794}
3795
3796
3797/*---------------------------------------------------------*/
3798/*--- ISEL: Statements                                  ---*/
3799/*---------------------------------------------------------*/
3800
3801static void iselStmt ( ISelEnv* env, IRStmt* stmt )
3802{
3803   if (vex_traceflags & VEX_TRACE_VCODE) {
3804      vex_printf("\n-- ");
3805      ppIRStmt(stmt);
3806      vex_printf("\n");
3807   }
3808
3809   switch (stmt->tag) {
3810
3811   /* --------- STORE --------- */
3812   case Ist_Store: {
3813      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
3814      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
3815      IREndness end   = stmt->Ist.Store.end;
3816
3817      if (tya != Ity_I64 || end != Iend_LE)
3818         goto stmt_fail;
3819
3820      if (tyd == Ity_I64) {
3821         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3822         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
3823         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
3824         return;
3825      }
3826      if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
3827         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3828         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
3829         addInstr(env, AMD64Instr_Store(
3830                          toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
3831                          r,am));
3832         return;
3833      }
3834      if (tyd == Ity_F64) {
3835         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3836         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
3837         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
3838         return;
3839      }
3840      if (tyd == Ity_F32) {
3841         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3842         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
3843         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
3844         return;
3845      }
3846      if (tyd == Ity_V128) {
3847         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3848         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
3849         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
3850         return;
3851      }
3852      break;
3853   }
3854
3855   /* --------- PUT --------- */
3856   case Ist_Put: {
3857      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
3858      if (ty == Ity_I64) {
3859         /* We're going to write to memory, so compute the RHS into an
3860            AMD64RI. */
3861         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
3862         addInstr(env,
3863                  AMD64Instr_Alu64M(
3864                     Aalu_MOV,
3865                     ri,
3866                     AMD64AMode_IR(stmt->Ist.Put.offset,
3867                                   hregAMD64_RBP())
3868                 ));
3869         return;
3870      }
3871      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
3872         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
3873         addInstr(env, AMD64Instr_Store(
3874                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
3875                          r,
3876                          AMD64AMode_IR(stmt->Ist.Put.offset,
3877                                        hregAMD64_RBP())));
3878         return;
3879      }
3880      if (ty == Ity_V128) {
3881         HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
3882         AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
3883                                         hregAMD64_RBP());
3884         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
3885         return;
3886      }
3887      if (ty == Ity_F32) {
3888         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
3889         AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
3890         set_SSE_rounding_default(env); /* paranoia */
3891         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
3892         return;
3893      }
3894      if (ty == Ity_F64) {
3895         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
3896         AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
3897                                         hregAMD64_RBP() );
3898         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
3899         return;
3900      }
3901      break;
3902   }
3903
3904   /* --------- Indexed PUT --------- */
3905   case Ist_PutI: {
3906      AMD64AMode* am
3907         = genGuestArrayOffset(
3908              env, stmt->Ist.PutI.descr,
3909                   stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
3910
3911      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
3912      if (ty == Ity_F64) {
3913         HReg val = iselDblExpr(env, stmt->Ist.PutI.data);
3914         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
3915         return;
3916      }
3917      if (ty == Ity_I8) {
3918         HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
3919         addInstr(env, AMD64Instr_Store( 1, r, am ));
3920         return;
3921      }
3922      if (ty == Ity_I64) {
3923         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.PutI.data);
3924         addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
3925         return;
3926      }
3927      break;
3928   }
3929
3930   /* --------- TMP --------- */
3931   case Ist_WrTmp: {
3932      IRTemp tmp = stmt->Ist.WrTmp.tmp;
3933      IRType ty = typeOfIRTemp(env->type_env, tmp);
3934
3935      /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
3936         compute it into an AMode and then use LEA.  This usually
3937         produces fewer instructions, often because (for memcheck
3938         created IR) we get t = address-expression, (t is later used
3939         twice) and so doing this naturally turns address-expression
3940         back into an AMD64 amode. */
3941      if (ty == Ity_I64
3942          && stmt->Ist.WrTmp.data->tag == Iex_Binop
3943          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
3944         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
3945         HReg dst = lookupIRTemp(env, tmp);
3946         if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
3947            /* Hmm, iselIntExpr_AMode wimped out and just computed the
3948               value into a register.  Just emit a normal reg-reg move
3949               so reg-alloc can coalesce it away in the usual way. */
3950            HReg src = am->Aam.IR.reg;
3951            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
3952         } else {
3953            addInstr(env, AMD64Instr_Lea64(am,dst));
3954         }
3955         return;
3956      }
3957
3958      if (ty == Ity_I64 || ty == Ity_I32
3959          || ty == Ity_I16 || ty == Ity_I8) {
3960         AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
3961         HReg dst = lookupIRTemp(env, tmp);
3962         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
3963         return;
3964      }
3965      if (ty == Ity_I128) {
3966         HReg rHi, rLo, dstHi, dstLo;
3967         iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
3968         lookupIRTemp128( &dstHi, &dstLo, env, tmp);
3969         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
3970         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
3971         return;
3972      }
3973      if (ty == Ity_I1) {
3974         AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
3975         HReg dst = lookupIRTemp(env, tmp);
3976         addInstr(env, AMD64Instr_Set64(cond, dst));
3977         return;
3978      }
3979      if (ty == Ity_F64) {
3980         HReg dst = lookupIRTemp(env, tmp);
3981         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
3982         addInstr(env, mk_vMOVsd_RR(src, dst));
3983         return;
3984      }
3985      if (ty == Ity_F32) {
3986         HReg dst = lookupIRTemp(env, tmp);
3987         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
3988         addInstr(env, mk_vMOVsd_RR(src, dst));
3989         return;
3990      }
3991      if (ty == Ity_V128) {
3992         HReg dst = lookupIRTemp(env, tmp);
3993         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
3994         addInstr(env, mk_vMOVsd_RR(src, dst));
3995         return;
3996      }
3997      break;
3998   }
3999
4000   /* --------- Call to DIRTY helper --------- */
4001   case Ist_Dirty: {
4002      IRType   retty;
4003      IRDirty* d = stmt->Ist.Dirty.details;
4004      Bool     passBBP = False;
4005
4006      if (d->nFxState == 0)
4007         vassert(!d->needsBBP);
4008
4009      passBBP = toBool(d->nFxState > 0 && d->needsBBP);
4010
4011      /* Marshal args, do the call, clear stack. */
4012      doHelperCall( env, passBBP, d->guard, d->cee, d->args );
4013
4014      /* Now figure out what to do with the returned value, if any. */
4015      if (d->tmp == IRTemp_INVALID)
4016         /* No return value.  Nothing to do. */
4017         return;
4018
4019      retty = typeOfIRTemp(env->type_env, d->tmp);
4020      if (retty == Ity_I64 || retty == Ity_I32
4021          || retty == Ity_I16 || retty == Ity_I8) {
4022         /* The returned value is in %rax.  Park it in the register
4023            associated with tmp. */
4024         HReg dst = lookupIRTemp(env, d->tmp);
4025         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4026         return;
4027      }
4028      break;
4029   }
4030
4031   /* --------- MEM FENCE --------- */
4032   case Ist_MBE:
4033      switch (stmt->Ist.MBE.event) {
4034         case Imbe_Fence:
4035            addInstr(env, AMD64Instr_MFence());
4036            return;
4037         default:
4038            break;
4039      }
4040      break;
4041
4042   /* --------- ACAS --------- */
4043   case Ist_CAS:
4044      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4045         /* "normal" singleton CAS */
4046         UChar  sz;
4047         IRCAS* cas = stmt->Ist.CAS.details;
4048         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4049         /* get: cas->expd into %rax, and cas->data into %rbx */
4050         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4051         HReg rData = iselIntExpr_R(env, cas->dataLo);
4052         HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4053         HReg rOld  = lookupIRTemp(env, cas->oldLo);
4054         vassert(cas->expdHi == NULL);
4055         vassert(cas->dataHi == NULL);
4056         addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4057         addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4058         addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4059         switch (ty) {
4060            case Ity_I64: sz = 8; break;
4061            case Ity_I32: sz = 4; break;
4062            case Ity_I16: sz = 2; break;
4063            case Ity_I8:  sz = 1; break;
4064            default: goto unhandled_cas;
4065         }
4066         addInstr(env, AMD64Instr_ACAS(am, sz));
4067         addInstr(env, AMD64Instr_CMov64(
4068                          Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOld));
4069         return;
4070      } else {
4071         /* double CAS */
4072         UChar  sz;
4073         IRCAS* cas = stmt->Ist.CAS.details;
4074         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4075         /* only 32-bit and 64-bit allowed in this case */
4076         /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4077         /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4078         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4079         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4080         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4081         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4082         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4083         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4084         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4085         switch (ty) {
4086            case Ity_I64:
4087               if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4088                  goto unhandled_cas; /* we'd have to generate
4089                                         cmpxchg16b, but the host
4090                                         doesn't support that */
4091               sz = 8;
4092               break;
4093            case Ity_I32:
4094               sz = 4;
4095               break;
4096            default:
4097               goto unhandled_cas;
4098         }
4099         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4100         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4101         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4102         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4103         addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4104         addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4105         addInstr(env, AMD64Instr_DACAS(am, sz));
4106         addInstr(env,
4107                  AMD64Instr_CMov64(
4108                     Acc_NZ, AMD64RM_Reg(hregAMD64_RDX()), rOldHi));
4109         addInstr(env,
4110                  AMD64Instr_CMov64(
4111                     Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOldLo));
4112         return;
4113      }
4114      unhandled_cas:
4115      break;
4116
4117   /* --------- INSTR MARK --------- */
4118   /* Doesn't generate any executable code ... */
4119   case Ist_IMark:
4120       return;
4121
4122   /* --------- ABI HINT --------- */
4123   /* These have no meaning (denotation in the IR) and so we ignore
4124      them ... if any actually made it this far. */
4125   case Ist_AbiHint:
4126       return;
4127
4128   /* --------- NO-OP --------- */
4129   case Ist_NoOp:
4130       return;
4131
4132   /* --------- EXIT --------- */
4133   case Ist_Exit: {
4134      AMD64RI*      dst;
4135      AMD64CondCode cc;
4136      if (stmt->Ist.Exit.dst->tag != Ico_U64)
4137         vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
4138      dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
4139      cc  = iselCondCode(env,stmt->Ist.Exit.guard);
4140      addInstr(env, AMD64Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
4141      return;
4142   }
4143
4144   default: break;
4145   }
4146  stmt_fail:
4147   ppIRStmt(stmt);
4148   vpanic("iselStmt(amd64)");
4149}
4150
4151
4152/*---------------------------------------------------------*/
4153/*--- ISEL: Basic block terminators (Nexts)             ---*/
4154/*---------------------------------------------------------*/
4155
4156static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
4157{
4158   AMD64RI* ri;
4159   if (vex_traceflags & VEX_TRACE_VCODE) {
4160      vex_printf("\n-- goto {");
4161      ppIRJumpKind(jk);
4162      vex_printf("} ");
4163      ppIRExpr(next);
4164      vex_printf("\n");
4165   }
4166   ri = iselIntExpr_RI(env, next);
4167   addInstr(env, AMD64Instr_Goto(jk, Acc_ALWAYS,ri));
4168}
4169
4170
4171/*---------------------------------------------------------*/
4172/*--- Insn selector top-level                           ---*/
4173/*---------------------------------------------------------*/
4174
4175/* Translate an entire SB to amd64 code. */
4176
4177HInstrArray* iselSB_AMD64 ( IRSB* bb, VexArch      arch_host,
4178                                      VexArchInfo* archinfo_host,
4179                                      VexAbiInfo*  vbi/*UNUSED*/ )
4180{
4181   Int      i, j;
4182   HReg     hreg, hregHI;
4183   ISelEnv* env;
4184   UInt     hwcaps_host = archinfo_host->hwcaps;
4185
4186   /* sanity ... */
4187   vassert(arch_host == VexArchAMD64);
4188   vassert(0 == (hwcaps_host
4189                 & ~(VEX_HWCAPS_AMD64_SSE3
4190                     | VEX_HWCAPS_AMD64_CX16
4191                     | VEX_HWCAPS_AMD64_LZCNT)));
4192
4193   /* Make up an initial environment to use. */
4194   env = LibVEX_Alloc(sizeof(ISelEnv));
4195   env->vreg_ctr = 0;
4196
4197   /* Set up output code array. */
4198   env->code = newHInstrArray();
4199
4200   /* Copy BB's type env. */
4201   env->type_env = bb->tyenv;
4202
4203   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
4204      change as we go along. */
4205   env->n_vregmap = bb->tyenv->types_used;
4206   env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4207   env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4208
4209   /* and finally ... */
4210   env->hwcaps = hwcaps_host;
4211
4212   /* For each IR temporary, allocate a suitably-kinded virtual
4213      register. */
4214   j = 0;
4215   for (i = 0; i < env->n_vregmap; i++) {
4216      hregHI = hreg = INVALID_HREG;
4217      switch (bb->tyenv->types[i]) {
4218         case Ity_I1:
4219         case Ity_I8:
4220         case Ity_I16:
4221         case Ity_I32:
4222         case Ity_I64:  hreg   = mkHReg(j++, HRcInt64, True); break;
4223         case Ity_I128: hreg   = mkHReg(j++, HRcInt64, True);
4224                        hregHI = mkHReg(j++, HRcInt64, True); break;
4225         case Ity_F32:
4226         case Ity_F64:
4227         case Ity_V128: hreg   = mkHReg(j++, HRcVec128, True); break;
4228         default: ppIRType(bb->tyenv->types[i]);
4229                  vpanic("iselBB(amd64): IRTemp type");
4230      }
4231      env->vregmap[i]   = hreg;
4232      env->vregmapHI[i] = hregHI;
4233   }
4234   env->vreg_ctr = j;
4235
4236   /* Ok, finally we can iterate over the statements. */
4237   for (i = 0; i < bb->stmts_used; i++)
4238      if (bb->stmts[i])
4239         iselStmt(env,bb->stmts[i]);
4240
4241   iselNext(env,bb->next,bb->jumpkind);
4242
4243   /* record the number of vregs we used. */
4244   env->code->n_vregs = env->vreg_ctr;
4245   return env->code;
4246}
4247
4248
4249/*---------------------------------------------------------------*/
4250/*--- end                                   host_amd64_isel.c ---*/
4251/*---------------------------------------------------------------*/
4252