1
2/*---------------------------------------------------------------*/
3/*--- begin                                 host_amd64_isel.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2012 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_ir.h"
38#include "libvex.h"
39
40#include "ir_match.h"
41#include "main_util.h"
42#include "main_globals.h"
43#include "host_generic_regs.h"
44#include "host_generic_simd64.h"
45#include "host_generic_simd128.h"
46#include "host_amd64_defs.h"
47
48
49/*---------------------------------------------------------*/
50/*--- x87/SSE control word stuff                        ---*/
51/*---------------------------------------------------------*/
52
53/* Vex-generated code expects to run with the FPU set as follows: all
54   exceptions masked, round-to-nearest, precision = 53 bits.  This
55   corresponds to a FPU control word value of 0x027F.
56
57   Similarly the SSE control word (%mxcsr) should be 0x1F80.
58
59   %fpucw and %mxcsr should have these values on entry to
60   Vex-generated code, and should those values should be
61   unchanged at exit.
62*/
63
64#define DEFAULT_FPUCW 0x027F
65
66#define DEFAULT_MXCSR 0x1F80
67
68/* debugging only, do not use */
69/* define DEFAULT_FPUCW 0x037F */
70
71
72/*---------------------------------------------------------*/
73/*--- misc helpers                                      ---*/
74/*---------------------------------------------------------*/
75
76/* These are duplicated in guest-amd64/toIR.c */
77static IRExpr* unop ( IROp op, IRExpr* a )
78{
79   return IRExpr_Unop(op, a);
80}
81
82static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
83{
84   return IRExpr_Binop(op, a1, a2);
85}
86
87static IRExpr* bind ( Int binder )
88{
89   return IRExpr_Binder(binder);
90}
91
92
93/*---------------------------------------------------------*/
94/*--- ISelEnv                                           ---*/
95/*---------------------------------------------------------*/
96
97/* This carries around:
98
99   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
100     might encounter.  This is computed before insn selection starts,
101     and does not change.
102
103   - A mapping from IRTemp to HReg.  This tells the insn selector
104     which virtual register is associated with each IRTemp
105     temporary.  This is computed before insn selection starts, and
106     does not change.  We expect this mapping to map precisely the
107     same set of IRTemps as the type mapping does.
108
109        - vregmap   holds the primary register for the IRTemp.
110        - vregmapHI is only used for 128-bit integer-typed
111             IRTemps.  It holds the identity of a second
112             64-bit virtual HReg, which holds the high half
113             of the value.
114
115   - The host subarchitecture we are selecting insns for.
116     This is set at the start and does not change.
117
118   - The code array, that is, the insns selected so far.
119
120   - A counter, for generating new virtual registers.
121
122   - A Bool for indicating whether we may generate chain-me
123     instructions for control flow transfers, or whether we must use
124     XAssisted.
125
126   - The maximum guest address of any guest insn in this block.
127     Actually, the address of the highest-addressed byte from any insn
128     in this block.  Is set at the start and does not change.  This is
129     used for detecting jumps which are definitely forward-edges from
130     this block, and therefore can be made (chained) to the fast entry
131     point of the destination, thereby avoiding the destination's
132     event check.
133
134   Note, this is all host-independent.  (JRS 20050201: well, kinda
135   ... not completely.  Compare with ISelEnv for X86.)
136*/
137
138typedef
139   struct {
140      /* Constant -- are set at the start and do not change. */
141      IRTypeEnv*   type_env;
142
143      HReg*        vregmap;
144      HReg*        vregmapHI;
145      Int          n_vregmap;
146
147      UInt         hwcaps;
148
149      Bool         chainingAllowed;
150      Addr64       max_ga;
151
152      /* These are modified as we go along. */
153      HInstrArray* code;
154      Int          vreg_ctr;
155   }
156   ISelEnv;
157
158
159static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
160{
161   vassert(tmp >= 0);
162   vassert(tmp < env->n_vregmap);
163   return env->vregmap[tmp];
164}
165
166static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
167                               ISelEnv* env, IRTemp tmp )
168{
169   vassert(tmp >= 0);
170   vassert(tmp < env->n_vregmap);
171   vassert(env->vregmapHI[tmp] != INVALID_HREG);
172   *vrLO = env->vregmap[tmp];
173   *vrHI = env->vregmapHI[tmp];
174}
175
176static void addInstr ( ISelEnv* env, AMD64Instr* instr )
177{
178   addHInstr(env->code, instr);
179   if (vex_traceflags & VEX_TRACE_VCODE) {
180      ppAMD64Instr(instr, True);
181      vex_printf("\n");
182   }
183}
184
185static HReg newVRegI ( ISelEnv* env )
186{
187   HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/);
188   env->vreg_ctr++;
189   return reg;
190}
191
192static HReg newVRegV ( ISelEnv* env )
193{
194   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
195   env->vreg_ctr++;
196   return reg;
197}
198
199
200/*---------------------------------------------------------*/
201/*--- ISEL: Forward declarations                        ---*/
202/*---------------------------------------------------------*/
203
204/* These are organised as iselXXX and iselXXX_wrk pairs.  The
205   iselXXX_wrk do the real work, but are not to be called directly.
206   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
207   checks that all returned registers are virtual.  You should not
208   call the _wrk version directly.
209*/
210static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
211static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
212
213static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, IRExpr* e );
214static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, IRExpr* e );
215
216static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, IRExpr* e );
217static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, IRExpr* e );
218
219static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, IRExpr* e );
220static HReg          iselIntExpr_R       ( ISelEnv* env, IRExpr* e );
221
222static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
223static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
224
225static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
226                                          ISelEnv* env, IRExpr* e );
227static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
228                                          ISelEnv* env, IRExpr* e );
229
230static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, IRExpr* e );
231static AMD64CondCode iselCondCode        ( ISelEnv* env, IRExpr* e );
232
233static HReg          iselDblExpr_wrk     ( ISelEnv* env, IRExpr* e );
234static HReg          iselDblExpr         ( ISelEnv* env, IRExpr* e );
235
236static HReg          iselFltExpr_wrk     ( ISelEnv* env, IRExpr* e );
237static HReg          iselFltExpr         ( ISelEnv* env, IRExpr* e );
238
239static HReg          iselVecExpr_wrk     ( ISelEnv* env, IRExpr* e );
240static HReg          iselVecExpr         ( ISelEnv* env, IRExpr* e );
241
242static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
243                                        ISelEnv* env, IRExpr* e );
244static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
245                                        ISelEnv* env, IRExpr* e );
246
247
248/*---------------------------------------------------------*/
249/*--- ISEL: Misc helpers                                ---*/
250/*---------------------------------------------------------*/
251
252static Bool sane_AMode ( AMD64AMode* am )
253{
254   switch (am->tag) {
255      case Aam_IR:
256         return
257            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
258                    && (hregIsVirtual(am->Aam.IR.reg)
259                        || am->Aam.IR.reg == hregAMD64_RBP()) );
260      case Aam_IRRS:
261         return
262            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
263                    && hregIsVirtual(am->Aam.IRRS.base)
264                    && hregClass(am->Aam.IRRS.index) == HRcInt64
265                    && hregIsVirtual(am->Aam.IRRS.index) );
266      default:
267        vpanic("sane_AMode: unknown amd64 amode tag");
268   }
269}
270
271
272/* Can the lower 32 bits be signedly widened to produce the whole
273   64-bit value?  In other words, are the top 33 bits either all 0 or
274   all 1 ? */
275static Bool fitsIn32Bits ( ULong x )
276{
277   Long y0 = (Long)x;
278   Long y1 = y0;
279   y1 <<= 32;
280   y1 >>=/*s*/ 32;
281   return toBool(x == y1);
282}
283
284/* Is this a 64-bit zero expression? */
285
286static Bool isZeroU64 ( IRExpr* e )
287{
288   return e->tag == Iex_Const
289          && e->Iex.Const.con->tag == Ico_U64
290          && e->Iex.Const.con->Ico.U64 == 0ULL;
291}
292
293static Bool isZeroU32 ( IRExpr* e )
294{
295   return e->tag == Iex_Const
296          && e->Iex.Const.con->tag == Ico_U32
297          && e->Iex.Const.con->Ico.U32 == 0;
298}
299
300/* Make a int reg-reg move. */
301
302static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
303{
304   vassert(hregClass(src) == HRcInt64);
305   vassert(hregClass(dst) == HRcInt64);
306   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
307}
308
309/* Make a vector (128 bit) reg-reg move. */
310
311static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
312{
313   vassert(hregClass(src) == HRcVec128);
314   vassert(hregClass(dst) == HRcVec128);
315   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
316}
317
318/* Advance/retreat %rsp by n. */
319
320static void add_to_rsp ( ISelEnv* env, Int n )
321{
322   vassert(n > 0 && n < 256 && (n%8) == 0);
323   addInstr(env,
324            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
325                                        hregAMD64_RSP()));
326}
327
328static void sub_from_rsp ( ISelEnv* env, Int n )
329{
330   vassert(n > 0 && n < 256 && (n%8) == 0);
331   addInstr(env,
332            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
333                                        hregAMD64_RSP()));
334}
335
336/* Push 64-bit constants on the stack. */
337static void push_uimm64( ISelEnv* env, ULong uimm64 )
338{
339   /* If uimm64 can be expressed as the sign extension of its
340      lower 32 bits, we can do it the easy way. */
341   Long simm64 = (Long)uimm64;
342   if ( simm64 == ((simm64 << 32) >> 32) ) {
343      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
344   } else {
345      HReg tmp = newVRegI(env);
346      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
347      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
348   }
349}
350
351
352/* Used only in doHelperCall.  If possible, produce a single
353   instruction which computes 'e' into 'dst'.  If not possible, return
354   NULL. */
355
356static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
357                                                    HReg     dst,
358                                                    IRExpr*  e )
359{
360   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
361
362   if (e->tag == Iex_Const) {
363      vassert(e->Iex.Const.con->tag == Ico_U64);
364      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
365         return AMD64Instr_Alu64R(
366                   Aalu_MOV,
367                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
368                   dst
369                );
370      } else {
371         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
372      }
373   }
374
375   if (e->tag == Iex_RdTmp) {
376      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
377      return mk_iMOVsd_RR(src, dst);
378   }
379
380   if (e->tag == Iex_Get) {
381      vassert(e->Iex.Get.ty == Ity_I64);
382      return AMD64Instr_Alu64R(
383                Aalu_MOV,
384                AMD64RMI_Mem(
385                   AMD64AMode_IR(e->Iex.Get.offset,
386                                 hregAMD64_RBP())),
387                dst);
388   }
389
390   if (e->tag == Iex_Unop
391       && e->Iex.Unop.op == Iop_32Uto64
392       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
393      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
394      return AMD64Instr_MovxLQ(False, src, dst);
395   }
396
397   if (0) { ppIRExpr(e); vex_printf("\n"); }
398
399   return NULL;
400}
401
402
403/* Do a complete function call.  guard is a Ity_Bit expression
404   indicating whether or not the call happens.  If guard==NULL, the
405   call is unconditional. */
406
407static
408void doHelperCall ( ISelEnv* env,
409                    Bool passBBP,
410                    IRExpr* guard, IRCallee* cee, IRExpr** args )
411{
412   AMD64CondCode cc;
413   HReg          argregs[6];
414   HReg          tmpregs[6];
415   AMD64Instr*   fastinstrs[6];
416   Int           n_args, i, argreg;
417
418   /* Marshal args for a call and do the call.
419
420      If passBBP is True, %rbp (the baseblock pointer) is to be passed
421      as the first arg.
422
423      This function only deals with a tiny set of possibilities, which
424      cover all helpers in practice.  The restrictions are that only
425      arguments in registers are supported, hence only 6x64 integer
426      bits in total can be passed.  In fact the only supported arg
427      type is I64.
428
429      Generating code which is both efficient and correct when
430      parameters are to be passed in registers is difficult, for the
431      reasons elaborated in detail in comments attached to
432      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
433      of the method described in those comments.
434
435      The problem is split into two cases: the fast scheme and the
436      slow scheme.  In the fast scheme, arguments are computed
437      directly into the target (real) registers.  This is only safe
438      when we can be sure that computation of each argument will not
439      trash any real registers set by computation of any other
440      argument.
441
442      In the slow scheme, all args are first computed into vregs, and
443      once they are all done, they are moved to the relevant real
444      regs.  This always gives correct code, but it also gives a bunch
445      of vreg-to-rreg moves which are usually redundant but are hard
446      for the register allocator to get rid of.
447
448      To decide which scheme to use, all argument expressions are
449      first examined.  If they are all so simple that it is clear they
450      will be evaluated without use of any fixed registers, use the
451      fast scheme, else use the slow scheme.  Note also that only
452      unconditional calls may use the fast scheme, since having to
453      compute a condition expression could itself trash real
454      registers.
455
456      Note this requires being able to examine an expression and
457      determine whether or not evaluation of it might use a fixed
458      register.  That requires knowledge of how the rest of this insn
459      selector works.  Currently just the following 3 are regarded as
460      safe -- hopefully they cover the majority of arguments in
461      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
462   */
463
464   /* Note that the cee->regparms field is meaningless on AMD64 host
465      (since there is only one calling convention) and so we always
466      ignore it. */
467
468   n_args = 0;
469   for (i = 0; args[i]; i++)
470      n_args++;
471
472   if (6 < n_args + (passBBP ? 1 : 0))
473      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
474
475   argregs[0] = hregAMD64_RDI();
476   argregs[1] = hregAMD64_RSI();
477   argregs[2] = hregAMD64_RDX();
478   argregs[3] = hregAMD64_RCX();
479   argregs[4] = hregAMD64_R8();
480   argregs[5] = hregAMD64_R9();
481
482   tmpregs[0] = tmpregs[1] = tmpregs[2] =
483   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
484
485   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
486   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
487
488   /* First decide which scheme (slow or fast) is to be used.  First
489      assume the fast scheme, and select slow if any contraindications
490      (wow) appear. */
491
492   if (guard) {
493      if (guard->tag == Iex_Const
494          && guard->Iex.Const.con->tag == Ico_U1
495          && guard->Iex.Const.con->Ico.U1 == True) {
496         /* unconditional */
497      } else {
498         /* Not manifestly unconditional -- be conservative. */
499         goto slowscheme;
500      }
501   }
502
503   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
504      use the slow scheme.  Because this is tentative, we can't call
505      addInstr (that is, commit to) any instructions until we're
506      handled all the arguments.  So park the resulting instructions
507      in a buffer and emit that if we're successful. */
508
509   /* FAST SCHEME */
510   argreg = 0;
511   if (passBBP) {
512      fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]);
513      argreg++;
514   }
515
516   for (i = 0; i < n_args; i++) {
517      vassert(argreg < 6);
518      vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
519      fastinstrs[argreg]
520         = iselIntExpr_single_instruction( env, argregs[argreg], args[i] );
521      if (fastinstrs[argreg] == NULL)
522         goto slowscheme;
523      argreg++;
524   }
525
526   /* Looks like we're in luck.  Emit the accumulated instructions and
527      move on to doing the call itself. */
528   vassert(argreg <= 6);
529   for (i = 0; i < argreg; i++)
530      addInstr(env, fastinstrs[i]);
531
532   /* Fast scheme only applies for unconditional calls.  Hence: */
533   cc = Acc_ALWAYS;
534
535   goto handle_call;
536
537
538   /* SLOW SCHEME; move via temporaries */
539  slowscheme:
540#  if 0 /* debug only */
541   if (n_args > 0) {for (i = 0; args[i]; i++) {
542   ppIRExpr(args[i]); vex_printf(" "); }
543   vex_printf("\n");}
544#  endif
545   argreg = 0;
546
547   if (passBBP) {
548      /* This is pretty stupid; better to move directly to rdi
549         after the rest of the args are done. */
550      tmpregs[argreg] = newVRegI(env);
551      addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg]));
552      argreg++;
553   }
554
555   for (i = 0; i < n_args; i++) {
556      vassert(argreg < 6);
557      vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
558      tmpregs[argreg] = iselIntExpr_R(env, args[i]);
559      argreg++;
560   }
561
562   /* Now we can compute the condition.  We can't do it earlier
563      because the argument computations could trash the condition
564      codes.  Be a bit clever to handle the common case where the
565      guard is 1:Bit. */
566   cc = Acc_ALWAYS;
567   if (guard) {
568      if (guard->tag == Iex_Const
569          && guard->Iex.Const.con->tag == Ico_U1
570          && guard->Iex.Const.con->Ico.U1 == True) {
571         /* unconditional -- do nothing */
572      } else {
573         cc = iselCondCode( env, guard );
574      }
575   }
576
577   /* Move the args to their final destinations. */
578   for (i = 0; i < argreg; i++) {
579      /* None of these insns, including any spill code that might
580         be generated, may alter the condition codes. */
581      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
582   }
583
584
585   /* Finally, the call itself. */
586  handle_call:
587   addInstr(env, AMD64Instr_Call(
588                    cc,
589                    Ptr_to_ULong(cee->addr),
590                    n_args + (passBBP ? 1 : 0)
591                 )
592   );
593}
594
595
596/* Given a guest-state array descriptor, an index expression and a
597   bias, generate an AMD64AMode holding the relevant guest state
598   offset. */
599
600static
601AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
602                                  IRExpr* off, Int bias )
603{
604   HReg tmp, roff;
605   Int  elemSz = sizeofIRType(descr->elemTy);
606   Int  nElems = descr->nElems;
607
608   /* Throw out any cases not generated by an amd64 front end.  In
609      theory there might be a day where we need to handle them -- if
610      we ever run non-amd64-guest on amd64 host. */
611
612   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
613      vpanic("genGuestArrayOffset(amd64 host)");
614
615   /* Compute off into a reg, %off.  Then return:
616
617         movq %off, %tmp
618         addq $bias, %tmp  (if bias != 0)
619         andq %tmp, 7
620         ... base(%rbp, %tmp, shift) ...
621   */
622   tmp  = newVRegI(env);
623   roff = iselIntExpr_R(env, off);
624   addInstr(env, mk_iMOVsd_RR(roff, tmp));
625   if (bias != 0) {
626      /* Make sure the bias is sane, in the sense that there are
627         no significant bits above bit 30 in it. */
628      vassert(-10000 < bias && bias < 10000);
629      addInstr(env,
630               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
631   }
632   addInstr(env,
633            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
634   vassert(elemSz == 1 || elemSz == 8);
635   return
636      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
637                                    elemSz==8 ? 3 : 0);
638}
639
640
641/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
642static
643void set_SSE_rounding_default ( ISelEnv* env )
644{
645   /* pushq $DEFAULT_MXCSR
646      ldmxcsr 0(%rsp)
647      addq $8, %rsp
648   */
649   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
650   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
651   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
652   add_to_rsp(env, 8);
653}
654
655/* Mess with the FPU's rounding mode: set to the default rounding mode
656   (DEFAULT_FPUCW). */
657static
658void set_FPU_rounding_default ( ISelEnv* env )
659{
660   /* movq $DEFAULT_FPUCW, -8(%rsp)
661      fldcw -8(%esp)
662   */
663   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
664   addInstr(env, AMD64Instr_Alu64M(
665                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
666   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
667}
668
669
670/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
671   expression denoting a value in the range 0 .. 3, indicating a round
672   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
673   have the same rounding.
674*/
675static
676void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
677{
678   /* Note: this sequence only makes sense because DEFAULT_MXCSR has
679      both rounding bits == 0.  If that wasn't the case, we couldn't
680      create a new rounding field simply by ORing the new value into
681      place. */
682
683   /* movq $3, %reg
684      andq [[mode]], %reg  -- shouldn't be needed; paranoia
685      shlq $13, %reg
686      orq $DEFAULT_MXCSR, %reg
687      pushq %reg
688      ldmxcsr 0(%esp)
689      addq $8, %rsp
690   */
691   HReg        reg      = newVRegI(env);
692   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
693   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
694   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
695                                   iselIntExpr_RMI(env, mode), reg));
696   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
697   addInstr(env, AMD64Instr_Alu64R(
698                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
699   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
700   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
701   add_to_rsp(env, 8);
702}
703
704
705/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
706   expression denoting a value in the range 0 .. 3, indicating a round
707   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
708   the same rounding.
709*/
710static
711void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
712{
713   HReg rrm  = iselIntExpr_R(env, mode);
714   HReg rrm2 = newVRegI(env);
715   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
716
717   /* movq  %rrm, %rrm2
718      andq  $3, %rrm2   -- shouldn't be needed; paranoia
719      shlq  $10, %rrm2
720      orq   $DEFAULT_FPUCW, %rrm2
721      movq  %rrm2, -8(%rsp)
722      fldcw -8(%esp)
723   */
724   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
725   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
726   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
727   addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
728                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
729   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
730                                   AMD64RI_Reg(rrm2), m8_rsp));
731   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
732}
733
734
735/* Generate all-zeroes into a new vector register.
736*/
737static HReg generate_zeroes_V128 ( ISelEnv* env )
738{
739   HReg dst = newVRegV(env);
740   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
741   return dst;
742}
743
744/* Generate all-ones into a new vector register.
745*/
746static HReg generate_ones_V128 ( ISelEnv* env )
747{
748   HReg dst = newVRegV(env);
749   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
750   return dst;
751}
752
753
754/* Generate !src into a new vector register.  Amazing that there isn't
755   a less crappy way to do this.
756*/
757static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
758{
759   HReg dst = generate_ones_V128(env);
760   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
761   return dst;
762}
763
764
765/* Expand the given byte into a 64-bit word, by cloning each bit
766   8 times. */
767static ULong bitmask8_to_bytemask64 ( UShort w8 )
768{
769   vassert(w8 == (w8 & 0xFF));
770   ULong w64 = 0;
771   Int i;
772   for (i = 0; i < 8; i++) {
773      if (w8 & (1<<i))
774         w64 |= (0xFFULL << (8 * i));
775   }
776   return w64;
777}
778
779
780/*---------------------------------------------------------*/
781/*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
782/*---------------------------------------------------------*/
783
784/* Select insns for an integer-typed expression, and add them to the
785   code list.  Return a reg holding the result.  This reg will be a
786   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
787   want to modify it, ask for a new vreg, copy it in there, and modify
788   the copy.  The register allocator will do its best to map both
789   vregs to the same real register, so the copies will often disappear
790   later in the game.
791
792   This should handle expressions of 64, 32, 16 and 8-bit type.  All
793   results are returned in a 64-bit register.  For 32-, 16- and 8-bit
794   expressions, the upper 32/16/24 bits are arbitrary, so you should
795   mask or sign extend partial values if necessary.
796*/
797
798static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
799{
800   HReg r = iselIntExpr_R_wrk(env, e);
801   /* sanity checks ... */
802#  if 0
803   vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
804#  endif
805   vassert(hregClass(r) == HRcInt64);
806   vassert(hregIsVirtual(r));
807   return r;
808}
809
810/* DO NOT CALL THIS DIRECTLY ! */
811static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
812{
813   /* Used for unary/binary SIMD64 ops. */
814   HWord fn = 0;
815   Bool second_is_UInt;
816
817   MatchInfo mi;
818   DECLARE_PATTERN(p_1Uto8_64to1);
819   DECLARE_PATTERN(p_LDle8_then_8Uto64);
820   DECLARE_PATTERN(p_LDle16_then_16Uto64);
821
822   IRType ty = typeOfIRExpr(env->type_env,e);
823   switch (ty) {
824      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
825      default: vassert(0);
826   }
827
828   switch (e->tag) {
829
830   /* --------- TEMP --------- */
831   case Iex_RdTmp: {
832      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
833   }
834
835   /* --------- LOAD --------- */
836   case Iex_Load: {
837      HReg dst = newVRegI(env);
838      AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
839
840      /* We can't handle big-endian loads, nor load-linked. */
841      if (e->Iex.Load.end != Iend_LE)
842         goto irreducible;
843
844      if (ty == Ity_I64) {
845         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
846                                         AMD64RMI_Mem(amode), dst) );
847         return dst;
848      }
849      if (ty == Ity_I32) {
850         addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
851         return dst;
852      }
853      if (ty == Ity_I16) {
854         addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
855         return dst;
856      }
857      if (ty == Ity_I8) {
858         addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
859         return dst;
860      }
861      break;
862   }
863
864   /* --------- BINARY OP --------- */
865   case Iex_Binop: {
866      AMD64AluOp   aluOp;
867      AMD64ShiftOp shOp;
868
869      /* Pattern: Sub64(0,x) */
870      /*     and: Sub32(0,x) */
871      if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
872          || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
873         HReg dst = newVRegI(env);
874         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
875         addInstr(env, mk_iMOVsd_RR(reg,dst));
876         addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
877         return dst;
878      }
879
880      /* Is it an addition or logical style op? */
881      switch (e->Iex.Binop.op) {
882         case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
883            aluOp = Aalu_ADD; break;
884         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
885            aluOp = Aalu_SUB; break;
886         case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
887            aluOp = Aalu_AND; break;
888         case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
889            aluOp = Aalu_OR; break;
890         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
891            aluOp = Aalu_XOR; break;
892         case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
893            aluOp = Aalu_MUL; break;
894         default:
895            aluOp = Aalu_INVALID; break;
896      }
897      /* For commutative ops we assume any literal
898         values are on the second operand. */
899      if (aluOp != Aalu_INVALID) {
900         HReg dst      = newVRegI(env);
901         HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
902         AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
903         addInstr(env, mk_iMOVsd_RR(reg,dst));
904         addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
905         return dst;
906      }
907
908      /* Perhaps a shift op? */
909      switch (e->Iex.Binop.op) {
910         case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
911            shOp = Ash_SHL; break;
912         case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
913            shOp = Ash_SHR; break;
914         case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
915            shOp = Ash_SAR; break;
916         default:
917            shOp = Ash_INVALID; break;
918      }
919      if (shOp != Ash_INVALID) {
920         HReg dst = newVRegI(env);
921
922         /* regL = the value to be shifted */
923         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
924         addInstr(env, mk_iMOVsd_RR(regL,dst));
925
926         /* Do any necessary widening for 32/16/8 bit operands */
927         switch (e->Iex.Binop.op) {
928            case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
929               break;
930            case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
931               break;
932            case Iop_Shr8:
933               addInstr(env, AMD64Instr_Alu64R(
934                                Aalu_AND, AMD64RMI_Imm(0xFF), dst));
935               break;
936            case Iop_Shr16:
937               addInstr(env, AMD64Instr_Alu64R(
938                                Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
939               break;
940            case Iop_Shr32:
941               addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
942               break;
943            case Iop_Sar8:
944               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
945               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
946               break;
947            case Iop_Sar16:
948               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
949               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
950               break;
951            case Iop_Sar32:
952               addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
953               break;
954            default:
955               ppIROp(e->Iex.Binop.op);
956               vassert(0);
957         }
958
959         /* Now consider the shift amount.  If it's a literal, we
960            can do a much better job than the general case. */
961         if (e->Iex.Binop.arg2->tag == Iex_Const) {
962            /* assert that the IR is well-typed */
963            Int nshift;
964            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
965            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
966            vassert(nshift >= 0);
967            if (nshift > 0)
968               /* Can't allow nshift==0 since that means %cl */
969               addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
970         } else {
971            /* General case; we have to force the amount into %cl. */
972            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
973            addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
974            addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
975         }
976         return dst;
977      }
978
979      /* Deal with 64-bit SIMD binary ops */
980      second_is_UInt = False;
981      switch (e->Iex.Binop.op) {
982         case Iop_Add8x8:
983            fn = (HWord)h_generic_calc_Add8x8; break;
984         case Iop_Add16x4:
985            fn = (HWord)h_generic_calc_Add16x4; break;
986         case Iop_Add32x2:
987            fn = (HWord)h_generic_calc_Add32x2; break;
988
989         case Iop_Avg8Ux8:
990            fn = (HWord)h_generic_calc_Avg8Ux8; break;
991         case Iop_Avg16Ux4:
992            fn = (HWord)h_generic_calc_Avg16Ux4; break;
993
994         case Iop_CmpEQ8x8:
995            fn = (HWord)h_generic_calc_CmpEQ8x8; break;
996         case Iop_CmpEQ16x4:
997            fn = (HWord)h_generic_calc_CmpEQ16x4; break;
998         case Iop_CmpEQ32x2:
999            fn = (HWord)h_generic_calc_CmpEQ32x2; break;
1000
1001         case Iop_CmpGT8Sx8:
1002            fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
1003         case Iop_CmpGT16Sx4:
1004            fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
1005         case Iop_CmpGT32Sx2:
1006            fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
1007
1008         case Iop_InterleaveHI8x8:
1009            fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
1010         case Iop_InterleaveLO8x8:
1011            fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
1012         case Iop_InterleaveHI16x4:
1013            fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
1014         case Iop_InterleaveLO16x4:
1015            fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
1016         case Iop_InterleaveHI32x2:
1017            fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
1018         case Iop_InterleaveLO32x2:
1019            fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
1020         case Iop_CatOddLanes16x4:
1021            fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1022         case Iop_CatEvenLanes16x4:
1023            fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1024         case Iop_Perm8x8:
1025            fn = (HWord)h_generic_calc_Perm8x8; break;
1026
1027         case Iop_Max8Ux8:
1028            fn = (HWord)h_generic_calc_Max8Ux8; break;
1029         case Iop_Max16Sx4:
1030            fn = (HWord)h_generic_calc_Max16Sx4; break;
1031         case Iop_Min8Ux8:
1032            fn = (HWord)h_generic_calc_Min8Ux8; break;
1033         case Iop_Min16Sx4:
1034            fn = (HWord)h_generic_calc_Min16Sx4; break;
1035
1036         case Iop_Mul16x4:
1037            fn = (HWord)h_generic_calc_Mul16x4; break;
1038         case Iop_Mul32x2:
1039            fn = (HWord)h_generic_calc_Mul32x2; break;
1040         case Iop_MulHi16Sx4:
1041            fn = (HWord)h_generic_calc_MulHi16Sx4; break;
1042         case Iop_MulHi16Ux4:
1043            fn = (HWord)h_generic_calc_MulHi16Ux4; break;
1044
1045         case Iop_QAdd8Sx8:
1046            fn = (HWord)h_generic_calc_QAdd8Sx8; break;
1047         case Iop_QAdd16Sx4:
1048            fn = (HWord)h_generic_calc_QAdd16Sx4; break;
1049         case Iop_QAdd8Ux8:
1050            fn = (HWord)h_generic_calc_QAdd8Ux8; break;
1051         case Iop_QAdd16Ux4:
1052            fn = (HWord)h_generic_calc_QAdd16Ux4; break;
1053
1054         case Iop_QNarrowBin32Sto16Sx4:
1055            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1056         case Iop_QNarrowBin16Sto8Sx8:
1057            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1058         case Iop_QNarrowBin16Sto8Ux8:
1059            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1060         case Iop_NarrowBin16to8x8:
1061            fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1062         case Iop_NarrowBin32to16x4:
1063            fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1064
1065         case Iop_QSub8Sx8:
1066            fn = (HWord)h_generic_calc_QSub8Sx8; break;
1067         case Iop_QSub16Sx4:
1068            fn = (HWord)h_generic_calc_QSub16Sx4; break;
1069         case Iop_QSub8Ux8:
1070            fn = (HWord)h_generic_calc_QSub8Ux8; break;
1071         case Iop_QSub16Ux4:
1072            fn = (HWord)h_generic_calc_QSub16Ux4; break;
1073
1074         case Iop_Sub8x8:
1075            fn = (HWord)h_generic_calc_Sub8x8; break;
1076         case Iop_Sub16x4:
1077            fn = (HWord)h_generic_calc_Sub16x4; break;
1078         case Iop_Sub32x2:
1079            fn = (HWord)h_generic_calc_Sub32x2; break;
1080
1081         case Iop_ShlN32x2:
1082            fn = (HWord)h_generic_calc_ShlN32x2;
1083            second_is_UInt = True;
1084            break;
1085         case Iop_ShlN16x4:
1086            fn = (HWord)h_generic_calc_ShlN16x4;
1087            second_is_UInt = True;
1088            break;
1089         case Iop_ShlN8x8:
1090            fn = (HWord)h_generic_calc_ShlN8x8;
1091            second_is_UInt = True;
1092            break;
1093         case Iop_ShrN32x2:
1094            fn = (HWord)h_generic_calc_ShrN32x2;
1095            second_is_UInt = True;
1096            break;
1097         case Iop_ShrN16x4:
1098            fn = (HWord)h_generic_calc_ShrN16x4;
1099            second_is_UInt = True;
1100            break;
1101         case Iop_SarN32x2:
1102            fn = (HWord)h_generic_calc_SarN32x2;
1103            second_is_UInt = True;
1104            break;
1105         case Iop_SarN16x4:
1106            fn = (HWord)h_generic_calc_SarN16x4;
1107            second_is_UInt = True;
1108            break;
1109         case Iop_SarN8x8:
1110            fn = (HWord)h_generic_calc_SarN8x8;
1111            second_is_UInt = True;
1112            break;
1113
1114         default:
1115            fn = (HWord)0; break;
1116      }
1117      if (fn != (HWord)0) {
1118         /* Note: the following assumes all helpers are of signature
1119               ULong fn ( ULong, ULong ), and they are
1120            not marked as regparm functions.
1121         */
1122         HReg dst  = newVRegI(env);
1123         HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1124         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1125         if (second_is_UInt)
1126            addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1127         addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1128         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1129         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
1130         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1131         return dst;
1132      }
1133
1134      /* Handle misc other ops. */
1135
1136      if (e->Iex.Binop.op == Iop_Max32U) {
1137         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1138         HReg dst  = newVRegI(env);
1139         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1140         addInstr(env, mk_iMOVsd_RR(src1, dst));
1141         addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1142         addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
1143         return dst;
1144      }
1145
1146      if (e->Iex.Binop.op == Iop_DivModS64to32
1147          || e->Iex.Binop.op == Iop_DivModU64to32) {
1148         /* 64 x 32 -> (32(rem),32(div)) division */
1149         /* Get the 64-bit operand into edx:eax, and the other into
1150            any old R/M. */
1151         HReg      rax     = hregAMD64_RAX();
1152         HReg      rdx     = hregAMD64_RDX();
1153         HReg      dst     = newVRegI(env);
1154         Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1155         AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1156         /* Compute the left operand into a reg, and then
1157            put the top half in edx and the bottom in eax. */
1158         HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1159         addInstr(env, mk_iMOVsd_RR(left64, rdx));
1160         addInstr(env, mk_iMOVsd_RR(left64, rax));
1161         addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1162         addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1163	 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1164	 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1165         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1166         addInstr(env, mk_iMOVsd_RR(rax, dst));
1167         addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1168         return dst;
1169      }
1170
1171      if (e->Iex.Binop.op == Iop_32HLto64) {
1172         HReg hi32  = newVRegI(env);
1173         HReg lo32  = newVRegI(env);
1174         HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1175         HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1176         addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1177         addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1178         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1179	 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1180         addInstr(env, AMD64Instr_Alu64R(
1181                          Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1182         return hi32;
1183      }
1184
1185      if (e->Iex.Binop.op == Iop_16HLto32) {
1186         HReg hi16  = newVRegI(env);
1187         HReg lo16  = newVRegI(env);
1188         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1189         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1190         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1191         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1192         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1193         addInstr(env, AMD64Instr_Alu64R(
1194                          Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1195         addInstr(env, AMD64Instr_Alu64R(
1196                          Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1197         return hi16;
1198      }
1199
1200      if (e->Iex.Binop.op == Iop_8HLto16) {
1201         HReg hi8  = newVRegI(env);
1202         HReg lo8  = newVRegI(env);
1203         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1204         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1205         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1206         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1207         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1208         addInstr(env, AMD64Instr_Alu64R(
1209                          Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1210         addInstr(env, AMD64Instr_Alu64R(
1211                          Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1212         return hi8;
1213      }
1214
1215      if (e->Iex.Binop.op == Iop_MullS32
1216          || e->Iex.Binop.op == Iop_MullS16
1217          || e->Iex.Binop.op == Iop_MullS8
1218          || e->Iex.Binop.op == Iop_MullU32
1219          || e->Iex.Binop.op == Iop_MullU16
1220          || e->Iex.Binop.op == Iop_MullU8) {
1221         HReg a32   = newVRegI(env);
1222         HReg b32   = newVRegI(env);
1223         HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1224         HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1225         Int          shift  = 0;
1226         AMD64ShiftOp shr_op = Ash_SHR;
1227         switch (e->Iex.Binop.op) {
1228            case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1229            case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1230            case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
1231            case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1232            case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1233            case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
1234            default: vassert(0);
1235         }
1236
1237         addInstr(env, mk_iMOVsd_RR(a32s, a32));
1238         addInstr(env, mk_iMOVsd_RR(b32s, b32));
1239         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1240         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1241         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
1242         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
1243         addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1244         return b32;
1245      }
1246
1247      if (e->Iex.Binop.op == Iop_CmpF64) {
1248         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1249         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1250         HReg dst = newVRegI(env);
1251         addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1252         /* Mask out irrelevant parts of the result so as to conform
1253            to the CmpF64 definition. */
1254         addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1255         return dst;
1256      }
1257
1258      if (e->Iex.Binop.op == Iop_F64toI32S
1259          || e->Iex.Binop.op == Iop_F64toI64S) {
1260         Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1261         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1262         HReg dst = newVRegI(env);
1263         set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1264         addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1265         set_SSE_rounding_default(env);
1266         return dst;
1267      }
1268
1269      break;
1270   }
1271
1272   /* --------- UNARY OP --------- */
1273   case Iex_Unop: {
1274
1275      /* 1Uto8(64to1(expr64)) */
1276      {
1277         DEFINE_PATTERN( p_1Uto8_64to1,
1278                         unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1279         if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1280            IRExpr* expr64 = mi.bindee[0];
1281            HReg    dst    = newVRegI(env);
1282            HReg    src    = iselIntExpr_R(env, expr64);
1283            addInstr(env, mk_iMOVsd_RR(src,dst) );
1284            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1285                                            AMD64RMI_Imm(1), dst));
1286            return dst;
1287         }
1288      }
1289
1290      /* 8Uto64(LDle(expr64)) */
1291      {
1292         DEFINE_PATTERN(p_LDle8_then_8Uto64,
1293                        unop(Iop_8Uto64,
1294                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1295         if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1296            HReg dst = newVRegI(env);
1297            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1298            addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1299            return dst;
1300         }
1301      }
1302
1303      /* 16Uto64(LDle(expr64)) */
1304      {
1305         DEFINE_PATTERN(p_LDle16_then_16Uto64,
1306                        unop(Iop_16Uto64,
1307                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1308         if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1309            HReg dst = newVRegI(env);
1310            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1311            addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1312            return dst;
1313         }
1314      }
1315
1316      /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1317         Use 32 bit arithmetic and let the default zero-extend rule
1318         do the 32Uto64 for free. */
1319      if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1320         IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1321         IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1322         IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1323         AMD64AluOp aluOp = Aalu_INVALID;
1324         switch (opi) {
1325            case Iop_Add32: aluOp = Aalu_ADD; break;
1326            case Iop_Sub32: aluOp = Aalu_SUB; break;
1327            case Iop_And32: aluOp = Aalu_AND; break;
1328            case Iop_Or32:  aluOp = Aalu_OR;  break;
1329            case Iop_Xor32: aluOp = Aalu_XOR; break;
1330            default: break;
1331         }
1332         if (aluOp != Aalu_INVALID) {
1333            /* For commutative ops we assume any literal values are on
1334               the second operand. */
1335            HReg dst      = newVRegI(env);
1336            HReg reg      = iselIntExpr_R(env, argL);
1337            AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1338            addInstr(env, mk_iMOVsd_RR(reg,dst));
1339            addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1340            return dst;
1341         }
1342         /* just fall through to normal handling for Iop_32Uto64 */
1343      }
1344
1345      /* Fallback cases */
1346      switch (e->Iex.Unop.op) {
1347         case Iop_32Uto64:
1348         case Iop_32Sto64: {
1349            HReg dst = newVRegI(env);
1350            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1351            addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1352                                            src, dst) );
1353            return dst;
1354         }
1355         case Iop_128HIto64: {
1356            HReg rHi, rLo;
1357            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1358            return rHi; /* and abandon rLo */
1359         }
1360         case Iop_128to64: {
1361            HReg rHi, rLo;
1362            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1363            return rLo; /* and abandon rHi */
1364         }
1365         case Iop_8Uto16:
1366         case Iop_8Uto32:
1367         case Iop_8Uto64:
1368         case Iop_16Uto64:
1369         case Iop_16Uto32: {
1370            HReg dst     = newVRegI(env);
1371            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1372            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1373                                   || e->Iex.Unop.op==Iop_16Uto64 );
1374            UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
1375            addInstr(env, mk_iMOVsd_RR(src,dst) );
1376            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1377                                            AMD64RMI_Imm(mask), dst));
1378            return dst;
1379         }
1380         case Iop_8Sto16:
1381         case Iop_8Sto64:
1382         case Iop_8Sto32:
1383         case Iop_16Sto32:
1384         case Iop_16Sto64: {
1385            HReg dst     = newVRegI(env);
1386            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1387            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1388                                   || e->Iex.Unop.op==Iop_16Sto64 );
1389            UInt amt     = srcIs16 ? 48 : 56;
1390            addInstr(env, mk_iMOVsd_RR(src,dst) );
1391            addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1392            addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1393            return dst;
1394         }
1395 	 case Iop_Not8:
1396 	 case Iop_Not16:
1397         case Iop_Not32:
1398         case Iop_Not64: {
1399            HReg dst = newVRegI(env);
1400            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1401            addInstr(env, mk_iMOVsd_RR(src,dst) );
1402            addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1403            return dst;
1404         }
1405         case Iop_16HIto8:
1406         case Iop_32HIto16:
1407         case Iop_64HIto32: {
1408            HReg dst  = newVRegI(env);
1409            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1410            Int shift = 0;
1411            switch (e->Iex.Unop.op) {
1412               case Iop_16HIto8:  shift = 8;  break;
1413               case Iop_32HIto16: shift = 16; break;
1414               case Iop_64HIto32: shift = 32; break;
1415               default: vassert(0);
1416            }
1417            addInstr(env, mk_iMOVsd_RR(src,dst) );
1418            addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1419            return dst;
1420         }
1421         case Iop_1Uto64:
1422         case Iop_1Uto32:
1423         case Iop_1Uto8: {
1424            HReg dst           = newVRegI(env);
1425            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1426            addInstr(env, AMD64Instr_Set64(cond,dst));
1427            return dst;
1428         }
1429         case Iop_1Sto8:
1430         case Iop_1Sto16:
1431         case Iop_1Sto32:
1432         case Iop_1Sto64: {
1433            /* could do better than this, but for now ... */
1434            HReg dst           = newVRegI(env);
1435            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1436            addInstr(env, AMD64Instr_Set64(cond,dst));
1437            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1438            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1439            return dst;
1440         }
1441         case Iop_Ctz64: {
1442            /* Count trailing zeroes, implemented by amd64 'bsfq' */
1443            HReg dst = newVRegI(env);
1444            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1445            addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1446            return dst;
1447         }
1448         case Iop_Clz64: {
1449            /* Count leading zeroes.  Do 'bsrq' to establish the index
1450               of the highest set bit, and subtract that value from
1451               63. */
1452            HReg tmp = newVRegI(env);
1453            HReg dst = newVRegI(env);
1454            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1455            addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1456            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1457                                            AMD64RMI_Imm(63), dst));
1458            addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1459                                            AMD64RMI_Reg(tmp), dst));
1460            return dst;
1461         }
1462
1463         case Iop_CmpwNEZ64: {
1464            HReg dst = newVRegI(env);
1465            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1466            addInstr(env, mk_iMOVsd_RR(src,dst));
1467            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1468            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1469                                            AMD64RMI_Reg(src), dst));
1470            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1471            return dst;
1472         }
1473
1474         case Iop_CmpwNEZ32: {
1475            HReg src = newVRegI(env);
1476            HReg dst = newVRegI(env);
1477            HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1478            addInstr(env, mk_iMOVsd_RR(pre,src));
1479            addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1480            addInstr(env, mk_iMOVsd_RR(src,dst));
1481            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1482            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1483                                            AMD64RMI_Reg(src), dst));
1484            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1485            return dst;
1486         }
1487
1488         case Iop_Left8:
1489         case Iop_Left16:
1490         case Iop_Left32:
1491         case Iop_Left64: {
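            /* Left(x) is x | -x, computed here as dst = -src followed
               by dst |= src. */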
1492            HReg dst = newVRegI(env);
1493            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1494            addInstr(env, mk_iMOVsd_RR(src, dst));
1495            addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1496            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1497            return dst;
1498         }
1499
1500         case Iop_V128to32: {
1501            HReg        dst     = newVRegI(env);
1502            HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
1503            AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1504            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1505            addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1506            return dst;
1507         }
1508
1509         /* V128{HI}to64 */
1510         case Iop_V128HIto64:
1511         case Iop_V128to64: {
1512            HReg dst = newVRegI(env);
1513            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
1514            HReg rsp = hregAMD64_RSP();
1515            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1516            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1517            AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1518            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1519                                             16, vec, m16_rsp));
1520            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1521                                             AMD64RMI_Mem(off_rsp), dst ));
1522            return dst;
1523         }
1524
1525         case Iop_V256to64_0: case Iop_V256to64_1:
1526         case Iop_V256to64_2: case Iop_V256to64_3: {
1527            HReg vHi, vLo, vec;
1528            iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
            /* Do the first part of the selection by deciding which of
               the two 128-bit registers to look at, then do the second
               part using the same scheme as for V128{HI}to64 above. */
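            /* For instance, V256to64_2 denotes bits 191:128 of the
               256-bit value, i.e. the low half of vHi, hence vec = vHi
               with offset -16 once vHi has been stored at -16(%rsp). */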
1532            Int off = 0;
1533            switch (e->Iex.Unop.op) {
1534               case Iop_V256to64_0: vec = vLo; off = -16; break;
1535               case Iop_V256to64_1: vec = vLo; off =  -8; break;
1536               case Iop_V256to64_2: vec = vHi; off = -16; break;
1537               case Iop_V256to64_3: vec = vHi; off =  -8; break;
1538               default: vassert(0);
1539            }
1540            HReg        dst     = newVRegI(env);
1541            HReg        rsp     = hregAMD64_RSP();
1542            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1543            AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1544            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1545                                             16, vec, m16_rsp));
1546            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1547                                             AMD64RMI_Mem(off_rsp), dst ));
1548            return dst;
1549         }
1550
1551         /* ReinterpF64asI64(e) */
1552         /* Given an IEEE754 double, produce an I64 with the same bit
1553            pattern. */
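         /* The bits travel via memory: the low 8 bytes of the xmm
            register are stored at -8(%rsp) and reloaded into an
            integer register with a 64-bit load. */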
1554         case Iop_ReinterpF64asI64: {
1555            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1556            HReg        dst    = newVRegI(env);
1557            HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
1558            /* paranoia */
1559            set_SSE_rounding_default(env);
1560            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1561            addInstr(env, AMD64Instr_Alu64R(
1562                             Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1563            return dst;
1564         }
1565
1566         /* ReinterpF32asI32(e) */
1567         /* Given an IEEE754 single, produce an I64 with the same bit
1568            pattern in the lower half. */
1569         case Iop_ReinterpF32asI32: {
1570            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1571            HReg        dst    = newVRegI(env);
1572            HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
1573            /* paranoia */
1574            set_SSE_rounding_default(env);
1575            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1576            addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1577            return dst;
1578         }
1579
1580         case Iop_16to8:
1581         case Iop_32to8:
1582         case Iop_64to8:
1583         case Iop_32to16:
1584         case Iop_64to16:
1585         case Iop_64to32:
1586            /* These are no-ops. */
1587            return iselIntExpr_R(env, e->Iex.Unop.arg);
1588
1589         default:
1590            break;
1591      }
1592
1593      /* Deal with unary 64-bit SIMD ops. */
1594      switch (e->Iex.Unop.op) {
1595         case Iop_CmpNEZ32x2:
1596            fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1597         case Iop_CmpNEZ16x4:
1598            fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1599         case Iop_CmpNEZ8x8:
1600            fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1601         default:
1602            fn = (HWord)0; break;
1603      }
1604      if (fn != (HWord)0) {
         /* Note: the following assumes all helpers are of signature
               ULong fn ( ULong ),
            and are not marked as regparm functions. */
1610         HReg dst = newVRegI(env);
1611         HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1612         addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1613         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1 ));
1614         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1615         return dst;
1616      }
1617
1618      break;
1619   }
1620
1621   /* --------- GET --------- */
1622   case Iex_Get: {
1623      if (ty == Ity_I64) {
1624         HReg dst = newVRegI(env);
1625         addInstr(env, AMD64Instr_Alu64R(
1626                          Aalu_MOV,
1627                          AMD64RMI_Mem(
1628                             AMD64AMode_IR(e->Iex.Get.offset,
1629                                           hregAMD64_RBP())),
1630                          dst));
1631         return dst;
1632      }
1633      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1634         HReg dst = newVRegI(env);
1635         addInstr(env, AMD64Instr_LoadEX(
1636                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1637                          False,
1638                          AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1639                          dst));
1640         return dst;
1641      }
1642      break;
1643   }
1644
1645   case Iex_GetI: {
1646      AMD64AMode* am
1647         = genGuestArrayOffset(
1648              env, e->Iex.GetI.descr,
1649                   e->Iex.GetI.ix, e->Iex.GetI.bias );
1650      HReg dst = newVRegI(env);
1651      if (ty == Ity_I8) {
1652         addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1653         return dst;
1654      }
1655      if (ty == Ity_I64) {
1656         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1657         return dst;
1658      }
1659      break;
1660   }
1661
1662   /* --------- CCALL --------- */
1663   case Iex_CCall: {
1664      HReg    dst = newVRegI(env);
1665      vassert(ty == e->Iex.CCall.retty);
1666
1667      /* be very restrictive for now.  Only 64-bit ints allowed
1668         for args, and 64 or 32 bits for return type. */
1669      if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1670         goto irreducible;
1671
1672      /* Marshal args, do the call. */
1673      doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
1674
1675      /* Move to dst, and zero out the top 32 bits if the result type is
1676         Ity_I32.  Probably overkill, but still .. */
1677      if (e->Iex.CCall.retty == Ity_I64)
1678         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1679      else
1680         addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1681
1682      return dst;
1683   }
1684
1685   /* --------- LITERAL --------- */
1686   /* 64/32/16/8-bit literals */
1687   case Iex_Const:
1688      if (ty == Ity_I64) {
1689         HReg r = newVRegI(env);
1690         addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1691         return r;
1692      } else {
1693         AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1694         HReg      r   = newVRegI(env);
1695         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1696         return r;
1697      }
1698
1699   /* --------- MULTIPLEX --------- */
1700   case Iex_Mux0X: {
1701     if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1702         && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
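        /* Compute exprX into dst, then test the low byte of the
           condition; if it is zero, conditionally overwrite dst with
           expr0 via cmov. */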
1703        HReg     r8;
1704        HReg     rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
1705        AMD64RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
1706        HReg dst = newVRegI(env);
1707        addInstr(env, mk_iMOVsd_RR(rX,dst));
1708        r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
1709        addInstr(env, AMD64Instr_Test64(0xFF, r8));
1710        addInstr(env, AMD64Instr_CMov64(Acc_Z,r0,dst));
1711        return dst;
1712      }
1713      break;
1714   }
1715
1716   /* --------- TERNARY OP --------- */
1717   case Iex_Triop: {
1718      IRTriop *triop = e->Iex.Triop.details;
1719      /* C3210 flags following FPU partial remainder (fprem), both
1720         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1721      if (triop->op == Iop_PRemC3210F64
1722          || triop->op == Iop_PRem1C3210F64) {
1723         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1724         HReg        arg1   = iselDblExpr(env, triop->arg2);
1725         HReg        arg2   = iselDblExpr(env, triop->arg3);
1726         HReg        dst    = newVRegI(env);
1727         addInstr(env, AMD64Instr_A87Free(2));
1728
1729         /* one arg -> top of x87 stack */
1730         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1731         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1732
1733         /* other arg -> top of x87 stack */
1734         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1735         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1736
1737         switch (triop->op) {
1738            case Iop_PRemC3210F64:
1739               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1740               break;
1741            case Iop_PRem1C3210F64:
1742               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1743               break;
1744            default:
1745               vassert(0);
1746         }
         /* Ignore the result, and instead make off with the FPU's
            C3210 flags (in the status word). */
1749         addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1750         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1751         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1752         return dst;
1753      }
1754      break;
1755   }
1756
   default:
      break;
1759   } /* switch (e->tag) */
1760
1761   /* We get here if no pattern matched. */
1762  irreducible:
1763   ppIRExpr(e);
1764   vpanic("iselIntExpr_R(amd64): cannot reduce tree");
1765}
1766
1767
1768/*---------------------------------------------------------*/
1769/*--- ISEL: Integer expression auxiliaries              ---*/
1770/*---------------------------------------------------------*/
1771
1772/* --------------------- AMODEs --------------------- */
1773
1774/* Return an AMode which computes the value of the specified
1775   expression, possibly also adding insns to the code list as a
   result.  The expression may only be a 64-bit one.
1777*/
1778
1779static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
1780{
1781   AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
1782   vassert(sane_AMode(am));
1783   return am;
1784}
1785
1786/* DO NOT CALL THIS DIRECTLY ! */
1787static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
1788{
1789   MatchInfo mi;
1790   DECLARE_PATTERN(p_complex);
1791   IRType ty = typeOfIRExpr(env->type_env,e);
1792   vassert(ty == Ity_I64);
1793
1794   /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
1795   /*              bind0        bind1  bind2   bind3   */
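   /* For example, Add64(Add64(t1, Shl64(t2, 3:I8)), 0x40:I64) maps to
      the addressing mode 0x40(t1,t2,8). */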
1796   DEFINE_PATTERN(p_complex,
1797      binop( Iop_Add64,
1798             binop( Iop_Add64,
1799                    bind(0),
1800                    binop(Iop_Shl64, bind(1), bind(2))
1801                  ),
1802             bind(3)
1803           )
1804   );
1805   if (matchIRExpr(&mi, p_complex, e)) {
1806      IRExpr* expr1  = mi.bindee[0];
1807      IRExpr* expr2  = mi.bindee[1];
1808      IRExpr* imm8   = mi.bindee[2];
1809      IRExpr* simm32 = mi.bindee[3];
1810      if (imm8->tag == Iex_Const
1811          && imm8->Iex.Const.con->tag == Ico_U8
1812          && imm8->Iex.Const.con->Ico.U8 < 4
1813          /* imm8 is OK, now check simm32 */
1814          && simm32->tag == Iex_Const
1815          && simm32->Iex.Const.con->tag == Ico_U64
1816          && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
1817         UInt shift = imm8->Iex.Const.con->Ico.U8;
1818         UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
1819         HReg r1 = iselIntExpr_R(env, expr1);
1820         HReg r2 = iselIntExpr_R(env, expr2);
1821         vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
1822         return AMD64AMode_IRRS(offset, r1, r2, shift);
1823      }
1824   }
1825
1826   /* Add64(expr1, Shl64(expr2, imm)) */
1827   if (e->tag == Iex_Binop
1828       && e->Iex.Binop.op == Iop_Add64
1829       && e->Iex.Binop.arg2->tag == Iex_Binop
1830       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
1831       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1832       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1833      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1834      if (shift == 1 || shift == 2 || shift == 3) {
1835         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1836         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1837         return AMD64AMode_IRRS(0, r1, r2, shift);
1838      }
1839   }
1840
1841   /* Add64(expr,i) */
1842   if (e->tag == Iex_Binop
1843       && e->Iex.Binop.op == Iop_Add64
1844       && e->Iex.Binop.arg2->tag == Iex_Const
1845       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
1846       && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
1847      HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1848      return AMD64AMode_IR(
1849                toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
1850                r1
1851             );
1852   }
1853
1854   /* Doesn't match anything in particular.  Generate it into
1855      a register and use that. */
1856   {
1857      HReg r1 = iselIntExpr_R(env, e);
1858      return AMD64AMode_IR(0, r1);
1859   }
1860}
1861
1862
1863/* --------------------- RMIs --------------------- */
1864
/* Similarly, calculate an expression into an AMD64RMI operand.  As
   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   bits.  */
1867
1868static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
1869{
1870   AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
1871   /* sanity checks ... */
1872   switch (rmi->tag) {
1873      case Armi_Imm:
1874         return rmi;
1875      case Armi_Reg:
1876         vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
1877         vassert(hregIsVirtual(rmi->Armi.Reg.reg));
1878         return rmi;
1879      case Armi_Mem:
1880         vassert(sane_AMode(rmi->Armi.Mem.am));
1881         return rmi;
1882      default:
1883         vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
1884   }
1885}
1886
1887/* DO NOT CALL THIS DIRECTLY ! */
1888static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
1889{
1890   IRType ty = typeOfIRExpr(env->type_env,e);
1891   vassert(ty == Ity_I64 || ty == Ity_I32
1892           || ty == Ity_I16 || ty == Ity_I8);
1893
1894   /* special case: immediate 64/32/16/8 */
1895   if (e->tag == Iex_Const) {
1896      switch (e->Iex.Const.con->tag) {
1897        case Ico_U64:
1898           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
1899              return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
1900           }
1901           break;
         case Ico_U32:
            return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32);
         case Ico_U16:
            return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
         case Ico_U8:
            return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
1908         default:
1909            vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
1910      }
1911   }
1912
1913   /* special case: 64-bit GET */
1914   if (e->tag == Iex_Get && ty == Ity_I64) {
1915      return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
1916                                        hregAMD64_RBP()));
1917   }
1918
1919   /* special case: 64-bit load from memory */
1920   if (e->tag == Iex_Load && ty == Ity_I64
1921       && e->Iex.Load.end == Iend_LE) {
1922      AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
1923      return AMD64RMI_Mem(am);
1924   }
1925
1926   /* default case: calculate into a register and return that */
1927   {
1928      HReg r = iselIntExpr_R ( env, e );
1929      return AMD64RMI_Reg(r);
1930   }
1931}
1932
1933
1934/* --------------------- RIs --------------------- */
1935
1936/* Calculate an expression into an AMD64RI operand.  As with
1937   iselIntExpr_R, the expression can have type 64, 32, 16 or 8
1938   bits. */
1939
1940static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
1941{
1942   AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
1943   /* sanity checks ... */
1944   switch (ri->tag) {
1945      case Ari_Imm:
1946         return ri;
1947      case Ari_Reg:
1948         vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
1949         vassert(hregIsVirtual(ri->Ari.Reg.reg));
1950         return ri;
1951      default:
1952         vpanic("iselIntExpr_RI: unknown amd64 RI tag");
1953   }
1954}
1955
1956/* DO NOT CALL THIS DIRECTLY ! */
1957static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
1958{
1959   IRType ty = typeOfIRExpr(env->type_env,e);
1960   vassert(ty == Ity_I64 || ty == Ity_I32
1961           || ty == Ity_I16 || ty == Ity_I8);
1962
1963   /* special case: immediate */
1964   if (e->tag == Iex_Const) {
1965      switch (e->Iex.Const.con->tag) {
1966        case Ico_U64:
1967           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
1968              return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
1969           }
1970           break;
1971         case Ico_U32:
1972            return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
1973         case Ico_U16:
1974            return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
1975         case Ico_U8:
1976            return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
1977         default:
            vpanic("iselIntExpr_RI.Iex_Const(amd64)");
1979      }
1980   }
1981
1982   /* default case: calculate into a register and return that */
1983   {
1984      HReg r = iselIntExpr_R ( env, e );
1985      return AMD64RI_Reg(r);
1986   }
1987}
1988
1989
1990/* --------------------- RMs --------------------- */
1991
1992/* Similarly, calculate an expression into an AMD64RM operand.  As
1993   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
1994   bits.  */
1995
1996static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
1997{
1998   AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
1999   /* sanity checks ... */
2000   switch (rm->tag) {
2001      case Arm_Reg:
2002         vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2003         vassert(hregIsVirtual(rm->Arm.Reg.reg));
2004         return rm;
2005      case Arm_Mem:
2006         vassert(sane_AMode(rm->Arm.Mem.am));
2007         return rm;
2008      default:
2009         vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2010   }
2011}
2012
2013/* DO NOT CALL THIS DIRECTLY ! */
2014static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
2015{
2016   IRType ty = typeOfIRExpr(env->type_env,e);
2017   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2018
2019   /* special case: 64-bit GET */
2020   if (e->tag == Iex_Get && ty == Ity_I64) {
2021      return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2022                                       hregAMD64_RBP()));
2023   }
2024
   /* special case: load from memory -- not handled specially here;
      such expressions fall through to the default case below. */
2026
2027   /* default case: calculate into a register and return that */
2028   {
2029      HReg r = iselIntExpr_R ( env, e );
2030      return AMD64RM_Reg(r);
2031   }
2032}
2033
2034
2035/* --------------------- CONDCODE --------------------- */
2036
/* Generate code to evaluate a bit-typed expression, returning the
   condition code which corresponds to the expression notionally
   returning 1. */
2040
2041static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
2042{
2043   /* Uh, there's nothing we can sanity check here, unfortunately. */
2044   return iselCondCode_wrk(env,e);
2045}
2046
2047/* DO NOT CALL THIS DIRECTLY ! */
2048static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
2049{
2050   MatchInfo mi;
2051
2052   vassert(e);
2053   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2054
2055   /* var */
2056   if (e->tag == Iex_RdTmp) {
2057      HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2058      HReg dst = newVRegI(env);
2059      addInstr(env, mk_iMOVsd_RR(r64,dst));
2060      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2061      return Acc_NZ;
2062   }
2063
2064   /* Constant 1:Bit */
2065   if (e->tag == Iex_Const) {
2066      HReg r;
2067      vassert(e->Iex.Const.con->tag == Ico_U1);
2068      vassert(e->Iex.Const.con->Ico.U1 == True
2069              || e->Iex.Const.con->Ico.U1 == False);
2070      r = newVRegI(env);
2071      addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2072      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2073      return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2074   }
2075
2076   /* Not1(...) */
2077   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2078      /* Generate code for the arg, and negate the test condition */
2079      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2080   }
2081
2082   /* --- patterns rooted at: 64to1 --- */
2083
2084   /* 64to1 */
2085   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2086      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2087      addInstr(env, AMD64Instr_Test64(1,reg));
2088      return Acc_NZ;
2089   }
2090
2091   /* --- patterns rooted at: 32to1 --- */
2092
2093   /* 32to1 */
2094   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2095      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2096      addInstr(env, AMD64Instr_Test64(1,reg));
2097      return Acc_NZ;
2098   }
2099
2100   /* --- patterns rooted at: CmpNEZ8 --- */
2101
2102   /* CmpNEZ8(x) */
2103   if (e->tag == Iex_Unop
2104       && e->Iex.Unop.op == Iop_CmpNEZ8) {
2105      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2106      addInstr(env, AMD64Instr_Test64(0xFF,r));
2107      return Acc_NZ;
2108   }
2109
2110   /* --- patterns rooted at: CmpNEZ16 --- */
2111
2112   /* CmpNEZ16(x) */
2113   if (e->tag == Iex_Unop
2114       && e->Iex.Unop.op == Iop_CmpNEZ16) {
2115      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2116      addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2117      return Acc_NZ;
2118   }
2119
2120   /* --- patterns rooted at: CmpNEZ32 --- */
2121
2122   /* CmpNEZ32(x) */
2123   if (e->tag == Iex_Unop
2124       && e->Iex.Unop.op == Iop_CmpNEZ32) {
2125      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2126      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2127      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2128      return Acc_NZ;
2129   }
2130
2131   /* --- patterns rooted at: CmpNEZ64 --- */
2132
2133   /* CmpNEZ64(Or64(x,y)) */
2134   {
2135      DECLARE_PATTERN(p_CmpNEZ64_Or64);
2136      DEFINE_PATTERN(p_CmpNEZ64_Or64,
2137                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
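      /* Fold the Or64 into the flag-setting OR instruction: ZF is set
         iff both operands are zero, so no separate compare against
         zero is needed. */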
2138      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
2139         HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
2140         AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
2141         HReg      tmp  = newVRegI(env);
2142         addInstr(env, mk_iMOVsd_RR(r0, tmp));
2143         addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
2144         return Acc_NZ;
2145      }
2146   }
2147
2148   /* CmpNEZ64(x) */
2149   if (e->tag == Iex_Unop
2150       && e->Iex.Unop.op == Iop_CmpNEZ64) {
2151      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2152      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2153      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2154      return Acc_NZ;
2155   }
2156
2157   /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2158
2159   /* CmpEQ8 / CmpNE8 */
2160   if (e->tag == Iex_Binop
2161       && (e->Iex.Binop.op == Iop_CmpEQ8
2162           || e->Iex.Binop.op == Iop_CmpNE8
2163           || e->Iex.Binop.op == Iop_CasCmpEQ8
2164           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
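      /* XOR leaves the low byte of r zero exactly when the low bytes
         of the two operands are equal; the AND with 0xFF confines the
         test to those 8 bits, so Z reflects (in)equality. */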
2165      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2166      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2167      HReg      r    = newVRegI(env);
2168      addInstr(env, mk_iMOVsd_RR(r1,r));
2169      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2170      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2171      switch (e->Iex.Binop.op) {
2172         case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2173         case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2174         default: vpanic("iselCondCode(amd64): CmpXX8");
2175      }
2176   }
2177
2178   /* CmpEQ16 / CmpNE16 */
2179   if (e->tag == Iex_Binop
2180       && (e->Iex.Binop.op == Iop_CmpEQ16
2181           || e->Iex.Binop.op == Iop_CmpNE16
2182           || e->Iex.Binop.op == Iop_CasCmpEQ16
2183           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2184      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2185      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2186      HReg      r    = newVRegI(env);
2187      addInstr(env, mk_iMOVsd_RR(r1,r));
2188      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2189      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2190      switch (e->Iex.Binop.op) {
2191         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2192         case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2193         default: vpanic("iselCondCode(amd64): CmpXX16");
2194      }
2195   }
2196
2197   /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2198      Saves a "movq %rax, %tmp" compared to the default route. */
2199   if (e->tag == Iex_Binop
2200       && e->Iex.Binop.op == Iop_CmpNE64
2201       && e->Iex.Binop.arg1->tag == Iex_CCall
2202       && e->Iex.Binop.arg2->tag == Iex_Const) {
2203      IRExpr* cal = e->Iex.Binop.arg1;
2204      IRExpr* con = e->Iex.Binop.arg2;
2205      HReg    tmp = newVRegI(env);
2206      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2207      vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2208      vassert(con->Iex.Const.con->tag == Ico_U64);
2209      /* Marshal args, do the call. */
2210      doHelperCall( env, False, NULL, cal->Iex.CCall.cee, cal->Iex.CCall.args );
2211      addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2212      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2213                                      AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2214      return Acc_NZ;
2215   }
2216
2217   /* Cmp*64*(x,y) */
2218   if (e->tag == Iex_Binop
2219       && (e->Iex.Binop.op == Iop_CmpEQ64
2220           || e->Iex.Binop.op == Iop_CmpNE64
2221           || e->Iex.Binop.op == Iop_CmpLT64S
2222           || e->Iex.Binop.op == Iop_CmpLT64U
2223           || e->Iex.Binop.op == Iop_CmpLE64S
2224           || e->Iex.Binop.op == Iop_CmpLE64U
2225           || e->Iex.Binop.op == Iop_CasCmpEQ64
2226           || e->Iex.Binop.op == Iop_CasCmpNE64)) {
2227      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2228      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2229      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2230      switch (e->Iex.Binop.op) {
2231         case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2232         case Iop_CmpNE64: case Iop_CasCmpNE64: return Acc_NZ;
         case Iop_CmpLT64S: return Acc_L;
         case Iop_CmpLT64U: return Acc_B;
         case Iop_CmpLE64S: return Acc_LE;
2236         case Iop_CmpLE64U: return Acc_BE;
2237         default: vpanic("iselCondCode(amd64): CmpXX64");
2238      }
2239   }
2240
2241   /* Cmp*32*(x,y) */
2242   if (e->tag == Iex_Binop
2243       && (e->Iex.Binop.op == Iop_CmpEQ32
2244           || e->Iex.Binop.op == Iop_CmpNE32
2245           || e->Iex.Binop.op == Iop_CmpLT32S
2246           || e->Iex.Binop.op == Iop_CmpLT32U
2247           || e->Iex.Binop.op == Iop_CmpLE32S
2248           || e->Iex.Binop.op == Iop_CmpLE32U
2249           || e->Iex.Binop.op == Iop_CasCmpEQ32
2250           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
2251      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2252      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2253      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2254      switch (e->Iex.Binop.op) {
2255         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2256         case Iop_CmpNE32: case Iop_CasCmpNE32: return Acc_NZ;
         case Iop_CmpLT32S: return Acc_L;
         case Iop_CmpLT32U: return Acc_B;
         case Iop_CmpLE32S: return Acc_LE;
2260         case Iop_CmpLE32U: return Acc_BE;
2261         default: vpanic("iselCondCode(amd64): CmpXX32");
2262      }
2263   }
2264
2265   ppIRExpr(e);
2266   vpanic("iselCondCode(amd64)");
2267}
2268
2269
2270/*---------------------------------------------------------*/
2271/*--- ISEL: Integer expressions (128 bit)               ---*/
2272/*---------------------------------------------------------*/
2273
/* Compute a 128-bit value into a register pair, which is returned as
   the first two parameters.  As with iselIntExpr_R, both registers
   will be virtual, and they must not be changed by subsequent code
   emitted by the caller.  */
2278
2279static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2280                             ISelEnv* env, IRExpr* e )
2281{
2282   iselInt128Expr_wrk(rHi, rLo, env, e);
2283#  if 0
2284   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2285#  endif
2286   vassert(hregClass(*rHi) == HRcInt64);
2287   vassert(hregIsVirtual(*rHi));
2288   vassert(hregClass(*rLo) == HRcInt64);
2289   vassert(hregIsVirtual(*rLo));
2290}
2291
2292/* DO NOT CALL THIS DIRECTLY ! */
2293static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2294                                 ISelEnv* env, IRExpr* e )
2295{
2296   vassert(e);
2297   vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2298
2299   /* read 128-bit IRTemp */
2300   if (e->tag == Iex_RdTmp) {
2301      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2302      return;
2303   }
2304
2305   /* --------- BINARY ops --------- */
2306   if (e->tag == Iex_Binop) {
2307      switch (e->Iex.Binop.op) {
2308         /* 64 x 64 -> 128 multiply */
2309         case Iop_MullU64:
2310         case Iop_MullS64: {
            /* Get one operand into %rax, and the other into an R/M.
               Need to make an educated guess about which operand is
               better placed where. */
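            /* The one-operand widening multiply takes one operand
               implicitly in %rax and leaves the 128-bit product in
               %rdx:%rax, which is then copied out to two vregs. */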
2314            HReg     tLo    = newVRegI(env);
2315            HReg     tHi    = newVRegI(env);
2316            Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
2317            AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2318            HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2319            addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2320            addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2321            /* Result is now in RDX:RAX.  Tell the caller. */
2322            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2323            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2324            *rHi = tHi;
2325            *rLo = tLo;
2326            return;
2327         }
2328
2329         /* 128 x 64 -> (64(rem),64(div)) division */
2330         case Iop_DivModU128to64:
2331         case Iop_DivModS128to64: {
2332            /* Get the 128-bit operand into rdx:rax, and the other into
2333               any old R/M. */
2334            HReg sHi, sLo;
2335            HReg     tLo     = newVRegI(env);
2336            HReg     tHi     = newVRegI(env);
2337            Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2338            AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2339            iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2340            addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2341            addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2342            addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2343            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2344            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2345            *rHi = tHi;
2346            *rLo = tLo;
2347            return;
2348         }
2349
2350         /* 64HLto128(e1,e2) */
2351         case Iop_64HLto128:
2352            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2353            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2354            return;
2355
2356         default:
2357            break;
2358      }
2359   } /* if (e->tag == Iex_Binop) */
2360
2361   ppIRExpr(e);
2362   vpanic("iselInt128Expr");
2363}
2364
2365
2366/*---------------------------------------------------------*/
2367/*--- ISEL: Floating point expressions (32 bit)         ---*/
2368/*---------------------------------------------------------*/
2369
2370/* Nothing interesting here; really just wrappers for
2371   64-bit stuff. */
2372
2373static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
2374{
2375   HReg r = iselFltExpr_wrk( env, e );
2376#  if 0
2377   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2378#  endif
2379   vassert(hregClass(r) == HRcVec128);
2380   vassert(hregIsVirtual(r));
2381   return r;
2382}
2383
2384/* DO NOT CALL THIS DIRECTLY */
2385static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
2386{
2387   IRType ty = typeOfIRExpr(env->type_env,e);
2388   vassert(ty == Ity_F32);
2389
2390   if (e->tag == Iex_RdTmp) {
2391      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2392   }
2393
2394   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2395      AMD64AMode* am;
2396      HReg res = newVRegV(env);
2397      vassert(e->Iex.Load.ty == Ity_F32);
2398      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2399      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2400      return res;
2401   }
2402
2403   if (e->tag == Iex_Binop
2404       && e->Iex.Binop.op == Iop_F64toF32) {
2405      /* Although the result is still held in a standard SSE register,
2406         we need to round it to reflect the loss of accuracy/range
2407         entailed in casting it to a 32-bit float. */
2408      HReg dst = newVRegV(env);
2409      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2410      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2411      addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2412      set_SSE_rounding_default( env );
2413      return dst;
2414   }
2415
2416   if (e->tag == Iex_Get) {
2417      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2418                                       hregAMD64_RBP() );
2419      HReg res = newVRegV(env);
2420      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2421      return res;
2422   }
2423
2424   if (e->tag == Iex_Unop
2425       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
      /* Given an I32, produce an IEEE754 float with the same bit
         pattern. */
      HReg        dst    = newVRegV(env);
      HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
      AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
      addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
      return dst;
2434   }
2435
2436   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2437      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2438      HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
2439      HReg        dst    = newVRegV(env);
2440
      /* 'arg' now holds the value to be rounded.  The first thing to
         do is set the FPU's rounding mode accordingly. */
2443
2444      /* Set host x87 rounding mode */
2445      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
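      /* The value round-trips through the x87: it is stored from the
         XMM register, pushed onto the FP stack, rounded (presumably
         via frndint, i.e. Afp_ROUND), popped back to memory and
         reloaded into an XMM register. */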
2446
2447      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2448      addInstr(env, AMD64Instr_A87Free(1));
2449      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2450      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2451      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2452      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2453
2454      /* Restore default x87 rounding. */
2455      set_FPU_rounding_default( env );
2456
2457      return dst;
2458   }
2459
2460   ppIRExpr(e);
2461   vpanic("iselFltExpr_wrk");
2462}
2463
2464
2465/*---------------------------------------------------------*/
2466/*--- ISEL: Floating point expressions (64 bit)         ---*/
2467/*---------------------------------------------------------*/
2468
2469/* Compute a 64-bit floating point value into the lower half of an xmm
2470   register, the identity of which is returned.  As with
2471   iselIntExpr_R, the returned reg will be virtual, and it must not be
2472   changed by subsequent code emitted by the caller.
2473*/
2474
2475/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2476
2477    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2478    ----                  ---------   -----------   -----------
2479    signalling NaN        u           2047 (max)    .0uuuuu---u
2480                                                    (with at least
2481                                                     one 1 bit)
2482    quiet NaN             u           2047 (max)    .1uuuuu---u
2483
2484    negative infinity     1           2047 (max)    .000000---0
2485
2486    positive infinity     0           2047 (max)    .000000---0
2487
2488    negative zero         1           0             .000000---0
2489
2490    positive zero         0           0             .000000---0
2491*/
2492
2493static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
2494{
2495   HReg r = iselDblExpr_wrk( env, e );
2496#  if 0
2497   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2498#  endif
2499   vassert(hregClass(r) == HRcVec128);
2500   vassert(hregIsVirtual(r));
2501   return r;
2502}
2503
2504/* DO NOT CALL THIS DIRECTLY */
2505static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
2506{
2507   IRType ty = typeOfIRExpr(env->type_env,e);
2508   vassert(e);
2509   vassert(ty == Ity_F64);
2510
2511   if (e->tag == Iex_RdTmp) {
2512      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2513   }
2514
2515   if (e->tag == Iex_Const) {
2516      union { ULong u64; Double f64; } u;
2517      HReg res = newVRegV(env);
2518      HReg tmp = newVRegI(env);
2519      vassert(sizeof(u) == 8);
2520      vassert(sizeof(u.u64) == 8);
2521      vassert(sizeof(u.f64) == 8);
2522
2523      if (e->Iex.Const.con->tag == Ico_F64) {
2524         u.f64 = e->Iex.Const.con->Ico.F64;
2525      }
2526      else if (e->Iex.Const.con->tag == Ico_F64i) {
2527         u.u64 = e->Iex.Const.con->Ico.F64i;
2528      }
2529      else
2530         vpanic("iselDblExpr(amd64): const");
2531
2532      addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2533      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2534      addInstr(env, AMD64Instr_SseLdSt(
2535                       True/*load*/, 8, res,
2536                       AMD64AMode_IR(0, hregAMD64_RSP())
2537              ));
2538      add_to_rsp(env, 8);
2539      return res;
2540   }
2541
2542   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2543      AMD64AMode* am;
2544      HReg res = newVRegV(env);
2545      vassert(e->Iex.Load.ty == Ity_F64);
2546      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2547      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2548      return res;
2549   }
2550
2551   if (e->tag == Iex_Get) {
2552      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2553                                      hregAMD64_RBP() );
2554      HReg res = newVRegV(env);
2555      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2556      return res;
2557   }
2558
2559   if (e->tag == Iex_GetI) {
2560      AMD64AMode* am
2561         = genGuestArrayOffset(
2562              env, e->Iex.GetI.descr,
2563                   e->Iex.GetI.ix, e->Iex.GetI.bias );
2564      HReg res = newVRegV(env);
2565      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2566      return res;
2567   }
2568
2569   if (e->tag == Iex_Triop) {
2570      IRTriop *triop = e->Iex.Triop.details;
2571      AMD64SseOp op = Asse_INVALID;
2572      switch (triop->op) {
2573         case Iop_AddF64: op = Asse_ADDF; break;
2574         case Iop_SubF64: op = Asse_SUBF; break;
2575         case Iop_MulF64: op = Asse_MULF; break;
2576         case Iop_DivF64: op = Asse_DIVF; break;
2577         default: break;
2578      }
2579      if (op != Asse_INVALID) {
2580         HReg dst  = newVRegV(env);
2581         HReg argL = iselDblExpr(env, triop->arg2);
2582         HReg argR = iselDblExpr(env, triop->arg3);
2583         addInstr(env, mk_vMOVsd_RR(argL, dst));
2584         /* XXXROUNDINGFIXME */
2585         /* set roundingmode here */
2586         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2587         return dst;
2588      }
2589   }
2590
2591   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2592      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2593      HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
2594      HReg        dst    = newVRegV(env);
2595
      /* 'arg' now holds the value to be rounded.  The first thing to
         do is set the FPU's rounding mode accordingly. */
2598
2599      /* Set host x87 rounding mode */
2600      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2601
2602      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2603      addInstr(env, AMD64Instr_A87Free(1));
2604      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2605      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2606      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2607      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2608
2609      /* Restore default x87 rounding. */
2610      set_FPU_rounding_default( env );
2611
2612      return dst;
2613   }
2614
   if (e->tag == Iex_Triop
       && (e->Iex.Triop.details->op == Iop_ScaleF64
           || e->Iex.Triop.details->op == Iop_AtanF64
           || e->Iex.Triop.details->op == Iop_Yl2xF64
           || e->Iex.Triop.details->op == Iop_Yl2xp1F64
           || e->Iex.Triop.details->op == Iop_PRemF64
           || e->Iex.Triop.details->op == Iop_PRem1F64)
      ) {
      IRTriop*    triop  = e->Iex.Triop.details;
2624      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2625      HReg        arg1   = iselDblExpr(env, triop->arg2);
2626      HReg        arg2   = iselDblExpr(env, triop->arg3);
2627      HReg        dst    = newVRegV(env);
2628      Bool     arg2first = toBool(triop->op == Iop_ScaleF64
2629                                  || triop->op == Iop_PRemF64
2630                                  || triop->op == Iop_PRem1F64);
2631      addInstr(env, AMD64Instr_A87Free(2));
2632
2633      /* one arg -> top of x87 stack */
2634      addInstr(env, AMD64Instr_SseLdSt(
2635                       False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
2636      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2637
2638      /* other arg -> top of x87 stack */
2639      addInstr(env, AMD64Instr_SseLdSt(
2640                       False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
2641      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2642
2643      /* do it */
2644      /* XXXROUNDINGFIXME */
2645      /* set roundingmode here */
2646      switch (triop->op) {
2647         case Iop_ScaleF64:
2648            addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
2649            break;
2650         case Iop_AtanF64:
2651            addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
2652            break;
2653         case Iop_Yl2xF64:
2654            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
2655            break;
2656         case Iop_Yl2xp1F64:
2657            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
2658            break;
2659         case Iop_PRemF64:
2660            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
2661            break;
2662         case Iop_PRem1F64:
2663            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
2664            break;
2665         default:
2666            vassert(0);
2667      }
2668
2669      /* save result */
2670      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2671      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2672      return dst;
2673   }
2674
2675   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
2676      HReg dst = newVRegV(env);
2677      HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
2678      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2679      addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
2680      set_SSE_rounding_default( env );
2681      return dst;
2682   }
2683
2684   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
2685      HReg dst = newVRegV(env);
2686      HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2687      set_SSE_rounding_default( env );
2688      addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
2689      return dst;
2690   }
2691
2692   if (e->tag == Iex_Unop
2693       && (e->Iex.Unop.op == Iop_NegF64
2694           || e->Iex.Unop.op == Iop_AbsF64)) {
2695      /* Sigh ... very rough code.  Could do much better. */
2696      /* Get the 128-bit literal 00---0 10---0 into a register
2697         and xor/nand it with the value to be negated. */
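      /* The two pushes build the 16-byte constant whose low qword is
         0x8000000000000000 and whose high qword is zero.  XORing the
         value with it flips the sign bit of the low lane (negation),
         while ANDN (~mask & value) clears it (absolute value). */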
2698      HReg r1  = newVRegI(env);
2699      HReg dst = newVRegV(env);
2700      HReg tmp = newVRegV(env);
2701      HReg src = iselDblExpr(env, e->Iex.Unop.arg);
2702      AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2703      addInstr(env, mk_vMOVsd_RR(src,tmp));
2704      addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2705      addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
2706      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2707      addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2708
2709      if (e->Iex.Unop.op == Iop_NegF64)
2710         addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2711      else
2712         addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
2713
2714      add_to_rsp(env, 16);
2715      return dst;
2716   }
2717
2718   if (e->tag == Iex_Binop) {
2719      A87FpOp fpop = Afp_INVALID;
2720      switch (e->Iex.Binop.op) {
2721         case Iop_SqrtF64: fpop = Afp_SQRT; break;
2722         case Iop_SinF64:  fpop = Afp_SIN;  break;
2723         case Iop_CosF64:  fpop = Afp_COS;  break;
2724         case Iop_TanF64:  fpop = Afp_TAN;  break;
2725         case Iop_2xm1F64: fpop = Afp_2XM1; break;
2726         default: break;
2727      }
2728      if (fpop != Afp_INVALID) {
2729         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2730         HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
2731         HReg        dst    = newVRegV(env);
2732         Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
2733         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2734         addInstr(env, AMD64Instr_A87Free(nNeeded));
2735         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2736         /* XXXROUNDINGFIXME */
2737         /* set roundingmode here */
2738         addInstr(env, AMD64Instr_A87FpOp(fpop));
2739         if (e->Iex.Binop.op==Iop_TanF64) {
2740            /* get rid of the extra 1.0 that fptan pushes */
2741            addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2742         }
2743         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2744         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2745         return dst;
2746      }
2747   }
2748
2749   if (e->tag == Iex_Unop) {
2750      switch (e->Iex.Unop.op) {
2751//..          case Iop_I32toF64: {
2752//..             HReg dst = newVRegF(env);
2753//..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
2754//..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
2755//..             set_FPU_rounding_default(env);
2756//..             addInstr(env, X86Instr_FpLdStI(
2757//..                              True/*load*/, 4, dst,
2758//..                              X86AMode_IR(0, hregX86_ESP())));
2759//..             add_to_esp(env, 4);
2760//..             return dst;
2761//..          }
2762         case Iop_ReinterpI64asF64: {
2763            /* Given an I64, produce an IEEE754 double with the same
2764               bit pattern. */
2765            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2766            HReg        dst    = newVRegV(env);
2767            AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
2768            /* paranoia */
2769            set_SSE_rounding_default(env);
2770            addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
2771            addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2772            return dst;
2773         }
2774         case Iop_F32toF64: {
2775            HReg f32;
2776            HReg f64 = newVRegV(env);
2777            /* this shouldn't be necessary, but be paranoid ... */
2778            set_SSE_rounding_default(env);
2779            f32 = iselFltExpr(env, e->Iex.Unop.arg);
2780            addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
2781            return f64;
2782         }
2783         default:
2784            break;
2785      }
2786   }
2787
2788   /* --------- MULTIPLEX --------- */
2789   if (e->tag == Iex_Mux0X) {
2790      HReg r8, rX, r0, dst;
2791      vassert(ty == Ity_F64);
2792      vassert(typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8);
2793      r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
2794      rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
2795      r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
2796      dst = newVRegV(env);
2797      addInstr(env, mk_vMOVsd_RR(rX,dst));
2798      addInstr(env, AMD64Instr_Test64(0xFF, r8));
2799      addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
2800      return dst;
2801   }
2802
2803   ppIRExpr(e);
2804   vpanic("iselDblExpr_wrk");
2805}
2806
2807
2808/*---------------------------------------------------------*/
2809/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
2810/*---------------------------------------------------------*/
2811
2812static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
2813{
2814   HReg r = iselVecExpr_wrk( env, e );
2815#  if 0
2816   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2817#  endif
2818   vassert(hregClass(r) == HRcVec128);
2819   vassert(hregIsVirtual(r));
2820   return r;
2821}
2822
2823
2824/* DO NOT CALL THIS DIRECTLY */
2825static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
2826{
2827   HWord      fn = 0; /* address of helper fn, if required */
2828   Bool       arg1isEReg = False;
2829   AMD64SseOp op = Asse_INVALID;
2830   IRType     ty = typeOfIRExpr(env->type_env,e);
2831   vassert(e);
2832   vassert(ty == Ity_V128);
2833
2834   if (e->tag == Iex_RdTmp) {
2835      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2836   }
2837
2838   if (e->tag == Iex_Get) {
2839      HReg dst = newVRegV(env);
2840      addInstr(env, AMD64Instr_SseLdSt(
2841                       True/*load*/,
2842                       16,
2843                       dst,
2844                       AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
2845                    )
2846              );
2847      return dst;
2848   }
2849
2850   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2851      HReg        dst = newVRegV(env);
2852      AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
2853      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
2854      return dst;
2855   }
2856
2857   if (e->tag == Iex_Const) {
2858      HReg dst = newVRegV(env);
2859      vassert(e->Iex.Const.con->tag == Ico_V128);
2860      switch (e->Iex.Const.con->Ico.V128) {
2861         case 0x0000:
2862            dst = generate_zeroes_V128(env);
2863            break;
2864         case 0xFFFF:
2865            dst = generate_ones_V128(env);
2866            break;
2867         default: {
2868            AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2869            /* do push_uimm64 twice, first time for the high-order half. */
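            /* bitmask8_to_bytemask64 expands each of the 8 mask bits
               into a 0x00 or 0xFF byte, so the 16-bit Ico_V128
               immediate describes the 16 bytes of the vector, one bit
               per byte. */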
2870            push_uimm64(env, bitmask8_to_bytemask64(
2871                                (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
2872                       ));
2873            push_uimm64(env, bitmask8_to_bytemask64(
2874                                (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
2875                       ));
2876            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
2877            add_to_rsp(env, 16);
2878            break;
2879         }
2880      }
2881      return dst;
2882   }
2883
2884   if (e->tag == Iex_Unop) {
2885   switch (e->Iex.Unop.op) {
2886
2887      case Iop_NotV128: {
2888         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
2889         return do_sse_NotV128(env, arg);
2890      }
2891
2892      case Iop_CmpNEZ64x2: {
2893         /* We can use SSE2 instructions for this. */
2894         /* Ideally, we want to do a 64Ix2 comparison against zero of
2895            the operand.  Problem is no such insn exists.  Solution
2896            therefore is to do a 32Ix4 comparison instead, and bitwise-
2897            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
2898            let the not'd result of this initial comparison be a:b:c:d.
2899            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
2900            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
2901            giving the required result.
2902
2903            The required selection sequence is 2,3,0,1, which
2904            according to Intel's documentation means the pshufd
2905            literal value is 0xB1, that is,
2906            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
2907         */
2908         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
2909         HReg tmp  = generate_zeroes_V128(env);
2910         HReg dst  = newVRegV(env);
2911         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
2912         tmp = do_sse_NotV128(env, tmp);
2913         addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
2914         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
2915         return dst;
2916      }
2917
2918      case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
2919      case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
2920      case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
2921      do_CmpNEZ_vector:
2922      {
2923         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
2924         HReg tmp  = newVRegV(env);
2925         HReg zero = generate_zeroes_V128(env);
2926         HReg dst;
2927         addInstr(env, mk_vMOVsd_RR(arg, tmp));
2928         addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
2929         dst = do_sse_NotV128(env, tmp);
2930         return dst;
2931      }
2932
2933      case Iop_Recip32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
2934      case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
2935      case Iop_Sqrt32Fx4:  op = Asse_SQRTF;  goto do_32Fx4_unary;
2936      do_32Fx4_unary:
2937      {
2938         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
2939         HReg dst = newVRegV(env);
2940         addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
2941         return dst;
2942      }
2943
2944      case Iop_Sqrt64Fx2:  op = Asse_SQRTF;  goto do_64Fx2_unary;
2945      do_64Fx2_unary:
2946      {
2947         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
2948         HReg dst = newVRegV(env);
2949         addInstr(env, AMD64Instr_Sse64Fx2(op, arg, dst));
2950         return dst;
2951      }
2952
2953      case Iop_Recip32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
2954      case Iop_RSqrt32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
2955      case Iop_Sqrt32F0x4:  op = Asse_SQRTF;  goto do_32F0x4_unary;
2956      do_32F0x4_unary:
2957      {
2958         /* A bit subtle.  We have to copy the arg to the result
2959            register first, because actually doing the SSE scalar insn
2960            leaves the upper 3/4 of the destination register
2961            unchanged.  Whereas the required semantics of these
2962            primops is that the upper 3/4 is simply copied in from the
2963            argument. */
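         /* For example, sqrtss %src, %dst writes only the low 32-bit
            lane of %dst, so the preceding register-to-register copy is
            what supplies the upper three lanes from the argument.
            (Instruction name is illustrative; the actual opcode depends
            on 'op'.) */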
2964         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
2965         HReg dst = newVRegV(env);
2966         addInstr(env, mk_vMOVsd_RR(arg, dst));
2967         addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
2968         return dst;
2969      }
2970
2971      case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
2972      do_64F0x2_unary:
2973      {
2974         /* A bit subtle.  We have to copy the arg to the result
2975            register first, because actually doing the SSE scalar insn
2976            leaves the upper half of the destination register
2977            unchanged.  Whereas the required semantics of these
2978            primops is that the upper half is simply copied in from the
2979            argument. */
2980         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
2981         HReg dst = newVRegV(env);
2982         addInstr(env, mk_vMOVsd_RR(arg, dst));
2983         addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
2984         return dst;
2985      }
2986
2987      case Iop_32UtoV128: {
2988         HReg        dst     = newVRegV(env);
2989         AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
2990         AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
2991         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
2992         addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
2993         return dst;
2994      }
2995
2996      case Iop_64UtoV128: {
2997         HReg        dst  = newVRegV(env);
2998         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2999         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3000         addInstr(env, AMD64Instr_Push(rmi));
3001         addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3002         add_to_rsp(env, 8);
3003         return dst;
3004      }
3005
3006      case Iop_V256toV128_0:
3007      case Iop_V256toV128_1: {
3008         HReg vHi, vLo;
3009         iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3010         return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3011      }
3012
3013      default:
3014         break;
3015   } /* switch (e->Iex.Unop.op) */
3016   } /* if (e->tag == Iex_Unop) */
3017
3018   if (e->tag == Iex_Binop) {
3019   switch (e->Iex.Binop.op) {
3020
3021      /* FIXME: could we generate MOVQ here? */
3022      case Iop_SetV128lo64: {
3023         HReg dst  = newVRegV(env);
3024         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3025         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3026         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3027         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3028         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3029         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3030         return dst;
3031      }
3032
3033      /* FIXME: could we generate MOVD here? */
3034      case Iop_SetV128lo32: {
3035         HReg dst  = newVRegV(env);
3036         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3037         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3038         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3039         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3040         addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3041         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3042         return dst;
3043      }
3044
3045      case Iop_64HLtoV128: {
3046         HReg        rsp     = hregAMD64_RSP();
3047         AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
3048         AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
3049         AMD64RI*    qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
3050         AMD64RI*    qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
3051         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
3052         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
3053         HReg        dst = newVRegV(env);
3054         /* One store-forwarding stall coming up, oh well :-( */
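         /* (The stall arises because the 16-byte load overlaps two
            separate 8-byte stores, which typically defeats the CPU's
            store-to-load forwarding.) */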
3055         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
3056         return dst;
3057      }
3058
3059      case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3060      case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3061      case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3062      case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3063      case Iop_Add32Fx4:   op = Asse_ADDF;   goto do_32Fx4;
3064      case Iop_Div32Fx4:   op = Asse_DIVF;   goto do_32Fx4;
3065      case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
3066      case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
3067      case Iop_Mul32Fx4:   op = Asse_MULF;   goto do_32Fx4;
3068      case Iop_Sub32Fx4:   op = Asse_SUBF;   goto do_32Fx4;
3069      do_32Fx4:
3070      {
3071         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3072         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3073         HReg dst = newVRegV(env);
3074         addInstr(env, mk_vMOVsd_RR(argL, dst));
3075         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3076         return dst;
3077      }
3078
3079      case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3080      case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3081      case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3082      case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3083      case Iop_Add64Fx2:   op = Asse_ADDF;   goto do_64Fx2;
3084      case Iop_Div64Fx2:   op = Asse_DIVF;   goto do_64Fx2;
3085      case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
3086      case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
3087      case Iop_Mul64Fx2:   op = Asse_MULF;   goto do_64Fx2;
3088      case Iop_Sub64Fx2:   op = Asse_SUBF;   goto do_64Fx2;
3089      do_64Fx2:
3090      {
3091         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3092         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3093         HReg dst = newVRegV(env);
3094         addInstr(env, mk_vMOVsd_RR(argL, dst));
3095         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3096         return dst;
3097      }
3098
3099      case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3100      case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3101      case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3102      case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3103      case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
3104      case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
3105      case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
3106      case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
3107      case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
3108      case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
3109      do_32F0x4: {
3110         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3111         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3112         HReg dst = newVRegV(env);
3113         addInstr(env, mk_vMOVsd_RR(argL, dst));
3114         addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3115         return dst;
3116      }
3117
3118      case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3119      case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3120      case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3121      case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3122      case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
3123      case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
3124      case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
3125      case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
3126      case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
3127      case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
3128      do_64F0x2: {
3129         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3130         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3131         HReg dst = newVRegV(env);
3132         addInstr(env, mk_vMOVsd_RR(argL, dst));
3133         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3134         return dst;
3135      }
3136
3137      case Iop_QNarrowBin32Sto16Sx8:
3138         op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3139      case Iop_QNarrowBin16Sto8Sx16:
3140         op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3141      case Iop_QNarrowBin16Sto8Ux16:
3142         op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3143
3144      case Iop_InterleaveHI8x16:
3145         op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3146      case Iop_InterleaveHI16x8:
3147         op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3148      case Iop_InterleaveHI32x4:
3149         op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3150      case Iop_InterleaveHI64x2:
3151         op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3152
3153      case Iop_InterleaveLO8x16:
3154         op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3155      case Iop_InterleaveLO16x8:
3156         op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3157      case Iop_InterleaveLO32x4:
3158         op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3159      case Iop_InterleaveLO64x2:
3160         op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3161
3162      case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
3163      case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
3164      case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
3165      case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
3166      case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
3167      case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
3168      case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
3169      case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
3170      case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
3171      case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
3172      case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
3173      case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
3174      case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
3175      case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
3176      case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
3177      case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
3178      case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
3179      case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3180      case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3181      case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
3182      case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
3183      case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
3184      case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
3185      case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3186      case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3187      case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
3188      case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
3189      case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
3190      case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
3191      case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
3192      case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
3193      case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
3194      case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
3195      case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
3196      do_SseReRg: {
3197         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3198         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3199         HReg dst = newVRegV(env);
3200         if (arg1isEReg) {
3201            addInstr(env, mk_vMOVsd_RR(arg2, dst));
3202            addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3203         } else {
3204            addInstr(env, mk_vMOVsd_RR(arg1, dst));
3205            addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3206         }
3207         return dst;
3208      }
3209
3210      case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
3211      case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
3212      case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
3213      case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
3214      case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
3215      case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
3216      case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
3217      case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
3218      do_SseShift: {
3219         HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
3220         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3221         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3222         HReg        ereg = newVRegV(env);
3223         HReg        dst  = newVRegV(env);
3224         addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3225         addInstr(env, AMD64Instr_Push(rmi));
3226         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3227         addInstr(env, mk_vMOVsd_RR(greg, dst));
3228         addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3229         add_to_rsp(env, 16);
3230         return dst;
3231      }
3232
3233      case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
3234                           goto do_SseAssistedBinary;
3235      case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
3236                           goto do_SseAssistedBinary;
3237      case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
3238                           goto do_SseAssistedBinary;
3239      case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
3240                           goto do_SseAssistedBinary;
3241      case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
3242                           goto do_SseAssistedBinary;
3243      case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
3244                           goto do_SseAssistedBinary;
3245      case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
3246                           goto do_SseAssistedBinary;
3247      case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
3248                           goto do_SseAssistedBinary;
3249      case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
3250                           goto do_SseAssistedBinary;
3251      case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
3252                           goto do_SseAssistedBinary;
3253      case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3254                           goto do_SseAssistedBinary;
3255      case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
3256                           goto do_SseAssistedBinary;
3257      case Iop_QNarrowBin32Sto16Ux8:
3258                           fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3259                           goto do_SseAssistedBinary;
3260      case Iop_NarrowBin16to8x16:
3261                           fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3262                           goto do_SseAssistedBinary;
3263      case Iop_NarrowBin32to16x8:
3264                           fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3265                           goto do_SseAssistedBinary;
3266      do_SseAssistedBinary: {
3267         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3268            well. */
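         /* Sketch of the scheme, assuming the helper has the shape
            void fn(V128* res, V128* argL, V128* argR), as the
            h_generic_calc_* helpers are expected to: carve out a
            scratch area on the stack, 16-align a pointer into it, and
            lay it out as
               argp+0  : result (written by the helper)
               argp+16 : argL
               argp+32 : argR
            then pass the three addresses in %rdi/%rsi/%rdx, call fn,
            and reload the result from argp+0. */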
3269         vassert(fn != 0);
3270         HReg dst = newVRegV(env);
3271         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3272         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3273         HReg argp = newVRegI(env);
3274         /* subq $112, %rsp         -- make a space */

3275         sub_from_rsp(env, 112);
3276         /* leaq 48(%rsp), %r_argp  -- point into it */
3277         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3278                                        argp));
3279         /* andq $-16, %r_argp      -- 16-align the pointer */
3280         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3281                                         AMD64RMI_Imm( ~(UInt)15 ),
3282                                         argp));
3283         /* Prepare 3 arg regs:
3284            leaq 0(%r_argp), %rdi
3285            leaq 16(%r_argp), %rsi
3286            leaq 32(%r_argp), %rdx
3287         */
3288         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3289                                        hregAMD64_RDI()));
3290         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3291                                        hregAMD64_RSI()));
3292         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3293                                        hregAMD64_RDX()));
3294         /* Store the two args, at (%rsi) and (%rdx):
3295            movupd  %argL, 0(%rsi)
3296            movupd  %argR, 0(%rdx)
3297         */
3298         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3299                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3300         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3301                                          AMD64AMode_IR(0, hregAMD64_RDX())));
3302         /* call the helper */
3303         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
3304         /* fetch the result from memory, using %r_argp, which the
3305            register allocator will keep alive across the call. */
3306         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3307                                          AMD64AMode_IR(0, argp)));
3308         /* and finally, clear the space */
3309         add_to_rsp(env, 112);
3310         return dst;
3311      }
3312
3313      case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3314                         goto do_SseAssistedVectorAndScalar;
3315      case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3316                         goto do_SseAssistedVectorAndScalar;
3317      do_SseAssistedVectorAndScalar: {
3318         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3319            well. */
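         /* Same scheme as do_SseAssistedBinary above, except the second
            argument is a scalar: argp+0 holds the result, argp+16 holds
            the vector arg, and the scalar is passed directly in %rdx.
            The helper is assumed to look like
            void fn(V128* res, V128* arg, UInt n), or similar. */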
3320         vassert(fn != 0);
3321         HReg dst = newVRegV(env);
3322         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3323         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3324         HReg argp = newVRegI(env);
3325         /* subq $112, %rsp         -- make a space */
3326         sub_from_rsp(env, 112);
3327         /* leaq 48(%rsp), %r_argp  -- point into it */
3328         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3329                                        argp));
3330         /* andq $-16, %r_argp      -- 16-align the pointer */
3331         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3332                                         AMD64RMI_Imm( ~(UInt)15 ),
3333                                         argp));
3334         /* Prepare 2 vector arg regs:
3335            leaq 0(%r_argp), %rdi
3336            leaq 16(%r_argp), %rsi
3337         */
3338         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3339                                        hregAMD64_RDI()));
3340         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3341                                        hregAMD64_RSI()));
3342         /* Store the vector arg, at (%rsi):
3343            movupd  %argL, 0(%rsi)
3344         */
3345         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3346                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3347         /* And get the scalar value into rdx */
3348         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3349
3350         /* call the helper */
3351         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
3352         /* fetch the result from memory, using %r_argp, which the
3353            register allocator will keep alive across the call. */
3354         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3355                                          AMD64AMode_IR(0, argp)));
3356         /* and finally, clear the space */
3357         add_to_rsp(env, 112);
3358         return dst;
3359      }
3360
3361      default:
3362         break;
3363   } /* switch (e->Iex.Binop.op) */
3364   } /* if (e->tag == Iex_Binop) */
3365
3366   if (e->tag == Iex_Mux0X) {
3367      HReg r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
3368      HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
3369      HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
3370      HReg dst = newVRegV(env);
3371      addInstr(env, mk_vMOVsd_RR(rX,dst));
3372      addInstr(env, AMD64Instr_Test64(0xFF, r8));
3373      addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
3374      return dst;
3375   }
3376
3377   //vec_fail:
3378   vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3379              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3380   ppIRExpr(e);
3381   vpanic("iselVecExpr_wrk");
3382}
3383
3384
3385/*---------------------------------------------------------*/
3386/*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
3387/*---------------------------------------------------------*/
3388
3389static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3390                           ISelEnv* env, IRExpr* e )
3391{
3392   iselDVecExpr_wrk( rHi, rLo, env, e );
3393#  if 0
3394   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3395#  endif
3396   vassert(hregClass(*rHi) == HRcVec128);
3397   vassert(hregClass(*rLo) == HRcVec128);
3398   vassert(hregIsVirtual(*rHi));
3399   vassert(hregIsVirtual(*rLo));
3400}
3401
3402
3403/* DO NOT CALL THIS DIRECTLY */
3404static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3405                               ISelEnv* env, IRExpr* e )
3406{
3407   vassert(e);
3408   IRType ty = typeOfIRExpr(env->type_env,e);
3409   vassert(ty == Ity_V256);
3410
3411   AMD64SseOp op = Asse_INVALID;
3412
3413   /* read 256-bit IRTemp */
3414   if (e->tag == Iex_RdTmp) {
3415      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3416      return;
3417   }
3418
3419   if (e->tag == Iex_Get) {
3420      HReg        vHi  = newVRegV(env);
3421      HReg        vLo  = newVRegV(env);
3422      HReg        rbp  = hregAMD64_RBP();
3423      AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
3424      AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3425      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3426      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3427      *rHi = vHi;
3428      *rLo = vLo;
3429      return;
3430   }
3431
3432   if (e->tag == Iex_Load) {
3433      HReg        vHi  = newVRegV(env);
3434      HReg        vLo  = newVRegV(env);
3435      HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
3436      AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
3437      AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3438      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3439      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3440      *rHi = vHi;
3441      *rLo = vLo;
3442      return;
3443   }
3444
3445   if (e->tag == Iex_Const) {
3446      vassert(e->Iex.Const.con->tag == Ico_V256);
3447      switch (e->Iex.Const.con->Ico.V256) {
3448         case 0x00000000: {
3449            HReg vHi = generate_zeroes_V128(env);
3450            HReg vLo = newVRegV(env);
3451            addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3452            *rHi = vHi;
3453            *rLo = vLo;
3454            return;
3455         }
3456         default:
3457            break; /* give up; handle other V256 constants if and
                          when it becomes necessary. */
3458      }
3459   }
3460
3461   if (e->tag == Iex_Unop) {
3462   switch (e->Iex.Unop.op) {
3463
3464      case Iop_NotV256: {
3465         HReg argHi, argLo;
3466         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3467         *rHi = do_sse_NotV128(env, argHi);
3468         *rLo = do_sse_NotV128(env, argLo);
3469         return;
3470      }
3471
3472      case Iop_Recip32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
3473      case Iop_Sqrt32Fx8:  op = Asse_SQRTF;  goto do_32Fx8_unary;
3474      case Iop_RSqrt32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3475      do_32Fx8_unary:
3476      {
3477         HReg argHi, argLo;
3478         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3479         HReg dstHi = newVRegV(env);
3480         HReg dstLo = newVRegV(env);
3481         addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
3482         addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
3483         *rHi = dstHi;
3484         *rLo = dstLo;
3485         return;
3486      }
3487
3488      case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
3489      do_64Fx4_unary:
3490      {
3491         HReg argHi, argLo;
3492         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3493         HReg dstHi = newVRegV(env);
3494         HReg dstLo = newVRegV(env);
3495         addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
3496         addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
3497         *rHi = dstHi;
3498         *rLo = dstLo;
3499         return;
3500      }
3501
3502      case Iop_CmpNEZ64x4: {
3503         /* We can use SSE2 instructions for this. */
3504         /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
3505            (obviously).  See comment on Iop_CmpNEZ64x2 for
3506            explanation of what's going on here. */
3507         HReg argHi, argLo;
3508         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3509         HReg tmpHi  = generate_zeroes_V128(env);
3510         HReg tmpLo  = newVRegV(env);
3511         addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
3512         HReg dstHi  = newVRegV(env);
3513         HReg dstLo  = newVRegV(env);
3514         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
3515         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
3516         tmpHi = do_sse_NotV128(env, tmpHi);
3517         tmpLo = do_sse_NotV128(env, tmpLo);
3518         addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
3519         addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
3520         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
3521         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
3522         *rHi = dstHi;
3523         *rLo = dstLo;
3524         return;
3525      }
3526
3527      case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3528      do_CmpNEZ_vector:
3529      {
3530         HReg argHi, argLo;
3531         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3532         HReg tmpHi = newVRegV(env);
3533         HReg tmpLo = newVRegV(env);
3534         HReg zero  = generate_zeroes_V128(env);
3535         HReg dstHi, dstLo;
3536         addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
3537         addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
3538         addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
3539         addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
3540         dstHi = do_sse_NotV128(env, tmpHi);
3541         dstLo = do_sse_NotV128(env, tmpLo);
3542         *rHi = dstHi;
3543         *rLo = dstLo;
3544         return;
3545      }
3546
3547      default:
3548         break;
3549   } /* switch (e->Iex.Unop.op) */
3550   } /* if (e->tag == Iex_Unop) */
3551
3552   if (e->tag == Iex_Binop) {
3553   switch (e->Iex.Binop.op) {
3554
3555      case Iop_Add64Fx4:   op = Asse_ADDF;   goto do_64Fx4;
3556      case Iop_Sub64Fx4:   op = Asse_SUBF;   goto do_64Fx4;
3557      case Iop_Mul64Fx4:   op = Asse_MULF;   goto do_64Fx4;
3558      case Iop_Div64Fx4:   op = Asse_DIVF;   goto do_64Fx4;
3559      case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
3560      case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
3561      do_64Fx4:
3562      {
3563         HReg argLhi, argLlo, argRhi, argRlo;
3564         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3565         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3566         HReg dstHi = newVRegV(env);
3567         HReg dstLo = newVRegV(env);
3568         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3569         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3570         addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
3571         addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
3572         *rHi = dstHi;
3573         *rLo = dstLo;
3574         return;
3575      }
3576
3577      case Iop_Add32Fx8:   op = Asse_ADDF;   goto do_32Fx8;
3578      case Iop_Sub32Fx8:   op = Asse_SUBF;   goto do_32Fx8;
3579      case Iop_Mul32Fx8:   op = Asse_MULF;   goto do_32Fx8;
3580      case Iop_Div32Fx8:   op = Asse_DIVF;   goto do_32Fx8;
3581      case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
3582      case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
3583      do_32Fx8:
3584      {
3585         HReg argLhi, argLlo, argRhi, argRlo;
3586         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3587         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3588         HReg dstHi = newVRegV(env);
3589         HReg dstLo = newVRegV(env);
3590         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3591         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3592         addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
3593         addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
3594         *rHi = dstHi;
3595         *rLo = dstLo;
3596         return;
3597      }
3598
3599      case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
3600      case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
3601      case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
3602      do_SseReRg:
3603      {
3604         HReg argLhi, argLlo, argRhi, argRlo;
3605         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3606         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3607         HReg dstHi = newVRegV(env);
3608         HReg dstLo = newVRegV(env);
3609         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3610         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3611         addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
3612         addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
3613         *rHi = dstHi;
3614         *rLo = dstLo;
3615         return;
3616      }
3617
3618      case Iop_V128HLtoV256: {
3619         *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
3620         *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
3621         return;
3622      }
3623
3624      default:
3625         break;
3626   } /* switch (e->Iex.Binop.op) */
3627   } /* if (e->tag == Iex_Binop) */
3628
3629   if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
3630      HReg        rsp     = hregAMD64_RSP();
3631      HReg        vHi     = newVRegV(env);
3632      HReg        vLo     = newVRegV(env);
3633      AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
3634      AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
3635      /* arg1 is the most significant (Q3), arg4 the least (Q0) */
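      /* So the result ends up as vHi = Q3:Q2 and vLo = Q1:Q0, with the
         numerically lower quarter in the less significant half of each
         128-bit value. */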
3636      /* Get all the args into regs, before messing with the stack. */
3637      AMD64RI* q3  = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
3638      AMD64RI* q2  = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
3639      AMD64RI* q1  = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
3640      AMD64RI* q0  = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
3641      /* less significant lane (Q2) at the lower address (-16(rsp)) */
3642      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
3643      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
3644      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
3645      /* and then the lower half .. */
3646      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
3647      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
3648      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
3649      *rHi = vHi;
3650      *rLo = vLo;
3651      return;
3652   }
3653
3654   //avx_fail:
3655   vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
3656              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3657   ppIRExpr(e);
3658   vpanic("iselDVecExpr_wrk");
3659}
3660
3661
3662/*---------------------------------------------------------*/
3663/*--- ISEL: Statements                                  ---*/
3664/*---------------------------------------------------------*/
3665
3666static void iselStmt ( ISelEnv* env, IRStmt* stmt )
3667{
3668   if (vex_traceflags & VEX_TRACE_VCODE) {
3669      vex_printf("\n-- ");
3670      ppIRStmt(stmt);
3671      vex_printf("\n");
3672   }
3673
3674   switch (stmt->tag) {
3675
3676   /* --------- STORE --------- */
3677   case Ist_Store: {
3678      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
3679      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
3680      IREndness end   = stmt->Ist.Store.end;
3681
3682      if (tya != Ity_I64 || end != Iend_LE)
3683         goto stmt_fail;
3684
3685      if (tyd == Ity_I64) {
3686         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3687         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
3688         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
3689         return;
3690      }
3691      if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
3692         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3693         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
3694         addInstr(env, AMD64Instr_Store(
3695                          toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
3696                          r,am));
3697         return;
3698      }
3699      if (tyd == Ity_F64) {
3700         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3701         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
3702         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
3703         return;
3704      }
3705      if (tyd == Ity_F32) {
3706         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3707         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
3708         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
3709         return;
3710      }
3711      if (tyd == Ity_V128) {
3712         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
3713         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
3714         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
3715         return;
3716      }
3717      if (tyd == Ity_V256) {
3718         HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
3719         AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
3720         AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3721         HReg vHi, vLo;
3722         iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
3723         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
3724         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
3725         return;
3726      }
3727      break;
3728   }
3729
3730   /* --------- PUT --------- */
3731   case Ist_Put: {
3732      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
3733      if (ty == Ity_I64) {
3734         /* We're going to write to memory, so compute the RHS into an
3735            AMD64RI. */
3736         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
3737         addInstr(env,
3738                  AMD64Instr_Alu64M(
3739                     Aalu_MOV,
3740                     ri,
3741                     AMD64AMode_IR(stmt->Ist.Put.offset,
3742                                   hregAMD64_RBP())
3743                 ));
3744         return;
3745      }
3746      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
3747         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
3748         addInstr(env, AMD64Instr_Store(
3749                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
3750                          r,
3751                          AMD64AMode_IR(stmt->Ist.Put.offset,
3752                                        hregAMD64_RBP())));
3753         return;
3754      }
3755      if (ty == Ity_F32) {
3756         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
3757         AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
3758         set_SSE_rounding_default(env); /* paranoia */
3759         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
3760         return;
3761      }
3762      if (ty == Ity_F64) {
3763         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
3764         AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
3765                                         hregAMD64_RBP() );
3766         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
3767         return;
3768      }
3769      if (ty == Ity_V128) {
3770         HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
3771         AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
3772                                         hregAMD64_RBP());
3773         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
3774         return;
3775      }
3776      if (ty == Ity_V256) {
3777         HReg vHi, vLo;
3778         iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
3779         HReg        rbp  = hregAMD64_RBP();
3780         AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
3781         AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
3782         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
3783         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
3784         return;
3785      }
3786      break;
3787   }
3788
3789   /* --------- Indexed PUT --------- */
3790   case Ist_PutI: {
3791      IRPutI *puti = stmt->Ist.PutI.details;
3792
3793      AMD64AMode* am
3794         = genGuestArrayOffset(
3795              env, puti->descr,
3796                   puti->ix, puti->bias );
3797
3798      IRType ty = typeOfIRExpr(env->type_env, puti->data);
3799      if (ty == Ity_F64) {
3800         HReg val = iselDblExpr(env, puti->data);
3801         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
3802         return;
3803      }
3804      if (ty == Ity_I8) {
3805         HReg r = iselIntExpr_R(env, puti->data);
3806         addInstr(env, AMD64Instr_Store( 1, r, am ));
3807         return;
3808      }
3809      if (ty == Ity_I64) {
3810         AMD64RI* ri = iselIntExpr_RI(env, puti->data);
3811         addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
3812         return;
3813      }
3814      break;
3815   }
3816
3817   /* --------- TMP --------- */
3818   case Ist_WrTmp: {
3819      IRTemp tmp = stmt->Ist.WrTmp.tmp;
3820      IRType ty = typeOfIRTemp(env->type_env, tmp);
3821
3822      /* Optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
3823         compute it into an AMode and then use LEA.  This usually
3824         produces fewer instructions, often because (for
3825         memcheck-created IR) we get t = address-expression, with t
3826         used twice later, and so doing this naturally turns the
3827         address expression back into an AMD64 amode. */
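      /* For instance (purely illustrative), t = Add64(addr, 16) comes
         out as  leaq 16(%addr), %t , and more complex amode-shaped
         trees (base + index*scale + offset) can similarly collapse to
         a single leaq when iselIntExpr_AMode recognises them. */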
3828      if (ty == Ity_I64
3829          && stmt->Ist.WrTmp.data->tag == Iex_Binop
3830          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
3831         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
3832         HReg dst = lookupIRTemp(env, tmp);
3833         if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
3834            /* Hmm, iselIntExpr_AMode wimped out and just computed the
3835               value into a register.  Just emit a normal reg-reg move
3836               so reg-alloc can coalesce it away in the usual way. */
3837            HReg src = am->Aam.IR.reg;
3838            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
3839         } else {
3840            addInstr(env, AMD64Instr_Lea64(am,dst));
3841         }
3842         return;
3843      }
3844
3845      if (ty == Ity_I64 || ty == Ity_I32
3846          || ty == Ity_I16 || ty == Ity_I8) {
3847         AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
3848         HReg dst = lookupIRTemp(env, tmp);
3849         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
3850         return;
3851      }
3852      if (ty == Ity_I128) {
3853         HReg rHi, rLo, dstHi, dstLo;
3854         iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
3855         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
3856         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
3857         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
3858         return;
3859      }
3860      if (ty == Ity_I1) {
3861         AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
3862         HReg dst = lookupIRTemp(env, tmp);
3863         addInstr(env, AMD64Instr_Set64(cond, dst));
3864         return;
3865      }
3866      if (ty == Ity_F64) {
3867         HReg dst = lookupIRTemp(env, tmp);
3868         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
3869         addInstr(env, mk_vMOVsd_RR(src, dst));
3870         return;
3871      }
3872      if (ty == Ity_F32) {
3873         HReg dst = lookupIRTemp(env, tmp);
3874         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
3875         addInstr(env, mk_vMOVsd_RR(src, dst));
3876         return;
3877      }
3878      if (ty == Ity_V128) {
3879         HReg dst = lookupIRTemp(env, tmp);
3880         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
3881         addInstr(env, mk_vMOVsd_RR(src, dst));
3882         return;
3883      }
3884      if (ty == Ity_V256) {
3885         HReg rHi, rLo, dstHi, dstLo;
3886         iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
3887         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
3888         addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
3889         addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
3890         return;
3891      }
3892      break;
3893   }
3894
3895   /* --------- Call to DIRTY helper --------- */
3896   case Ist_Dirty: {
3897      IRType   retty;
3898      IRDirty* d = stmt->Ist.Dirty.details;
3899      Bool     passBBP = False;
3900
3901      if (d->nFxState == 0)
3902         vassert(!d->needsBBP);
3903
3904      passBBP = toBool(d->nFxState > 0 && d->needsBBP);
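      /* (passBBP means the guest state pointer -- the baseblock
         pointer held in %rbp -- is passed to the helper as an extra
         leading argument, assuming doHelperCall follows the usual VEX
         convention here.) */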
3905
3906      /* Marshal args, do the call, clear stack. */
3907      doHelperCall( env, passBBP, d->guard, d->cee, d->args );
3908
3909      /* Now figure out what to do with the returned value, if any. */
3910      if (d->tmp == IRTemp_INVALID)
3911         /* No return value.  Nothing to do. */
3912         return;
3913
3914      retty = typeOfIRTemp(env->type_env, d->tmp);
3915      if (retty == Ity_I64 || retty == Ity_I32
3916          || retty == Ity_I16 || retty == Ity_I8) {
3917         /* The returned value is in %rax.  Park it in the register
3918            associated with tmp. */
3919         HReg dst = lookupIRTemp(env, d->tmp);
3920         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
3921         return;
3922      }
3923      break;
3924   }
3925
3926   /* --------- MEM FENCE --------- */
3927   case Ist_MBE:
3928      switch (stmt->Ist.MBE.event) {
3929         case Imbe_Fence:
3930            addInstr(env, AMD64Instr_MFence());
3931            return;
3932         default:
3933            break;
3934      }
3935      break;
3936
3937   /* --------- ACAS --------- */
3938   case Ist_CAS:
3939      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
3940         /* "normal" singleton CAS */
3941         UChar  sz;
3942         IRCAS* cas = stmt->Ist.CAS.details;
3943         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
3944         /* get: cas->expd into %rax, and cas->data into %rbx */
3945         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
3946         HReg rData = iselIntExpr_R(env, cas->dataLo);
3947         HReg rExpd = iselIntExpr_R(env, cas->expdLo);
3948         HReg rOld  = lookupIRTemp(env, cas->oldLo);
3949         vassert(cas->expdHi == NULL);
3950         vassert(cas->dataHi == NULL);
3951         addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
3952         addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
3953         addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
3954         switch (ty) {
3955            case Ity_I64: sz = 8; break;
3956            case Ity_I32: sz = 4; break;
3957            case Ity_I16: sz = 2; break;
3958            case Ity_I8:  sz = 1; break;
3959            default: goto unhandled_cas;
3960         }
3961         addInstr(env, AMD64Instr_ACAS(am, sz));
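         /* After the locked cmpxchg, %rax holds the old memory value
            when the CAS failed (ZF clear); on success it still equals
            the expected value.  Hence rOld, pre-loaded with rExpd, only
            needs updating from %rax in the not-equal case. */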
3962         addInstr(env, AMD64Instr_CMov64(
3963                          Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOld));
3964         return;
3965      } else {
3966         /* double CAS */
3967         UChar  sz;
3968         IRCAS* cas = stmt->Ist.CAS.details;
3969         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
3970         /* only 32-bit and 64-bit allowed in this case */
3971         /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
3972         /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
3973         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
3974         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
3975         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
3976         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
3977         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
3978         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
3979         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
3980         switch (ty) {
3981            case Ity_I64:
3982               if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
3983                  goto unhandled_cas; /* we'd have to generate
3984                                         cmpxchg16b, but the host
3985                                         doesn't support that */
3986               sz = 8;
3987               break;
3988            case Ity_I32:
3989               sz = 4;
3990               break;
3991            default:
3992               goto unhandled_cas;
3993         }
3994         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
3995         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
3996         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
3997         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
3998         addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
3999         addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4000         addInstr(env, AMD64Instr_DACAS(am, sz));
4001         addInstr(env,
4002                  AMD64Instr_CMov64(
4003                     Acc_NZ, AMD64RM_Reg(hregAMD64_RDX()), rOldHi));
4004         addInstr(env,
4005                  AMD64Instr_CMov64(
4006                     Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOldLo));
4007         return;
4008      }
4009      unhandled_cas:
4010      break;
4011
4012   /* --------- INSTR MARK --------- */
4013   /* Doesn't generate any executable code ... */
4014   case Ist_IMark:
4015       return;
4016
4017   /* --------- ABI HINT --------- */
4018   /* These have no meaning (denotation in the IR) and so we ignore
4019      them ... if any actually made it this far. */
4020   case Ist_AbiHint:
4021       return;
4022
4023   /* --------- NO-OP --------- */
4024   case Ist_NoOp:
4025       return;
4026
4027   /* --------- EXIT --------- */
4028   case Ist_Exit: {
4029      if (stmt->Ist.Exit.dst->tag != Ico_U64)
4030         vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
4031
4032      AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
4033      AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
4034                                          hregAMD64_RBP());
4035
4036      /* Case: boring transfer to known address */
4037      if (stmt->Ist.Exit.jk == Ijk_Boring) {
4038         if (env->chainingAllowed) {
4039            /* .. almost always true .. */
4040            /* Skip the event check at the dst if this is a forwards
4041               edge. */
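            /* (Backward edges must go to the slow entry point, which
               performs the event check, so loops cannot run unbounded
               between checks; forward edges may safely use the fast
               entry point.) */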
4042            Bool toFastEP
4043               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
4044            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4045            addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
4046                                             amRIP, cc, toFastEP));
4047         } else {
4048            /* .. very occasionally .. */
4049            /* We can't use chaining, so ask for an assisted transfer,
4050               as that's the only alternative that is allowable. */
4051            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4052            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
4053         }
4054         return;
4055      }
4056
4057      /* Case: assisted transfer to arbitrary address */
4058      switch (stmt->Ist.Exit.jk) {
4059         /* Keep this list in sync with that in iselNext below */
4060         case Ijk_ClientReq:
4061         case Ijk_EmWarn:
4062         case Ijk_NoDecode:
4063         case Ijk_NoRedir:
4064         case Ijk_SigSEGV:
4065         case Ijk_SigTRAP:
4066         case Ijk_Sys_syscall:
4067         case Ijk_TInval:
4068         case Ijk_Yield:
4069         {
4070            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4071            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
4072            return;
4073         }
4074         default:
4075            break;
4076      }
4077
4078      /* Do we ever expect to see any other kind? */
4079      goto stmt_fail;
4080   }
4081
4082   default: break;
4083   }
4084  stmt_fail:
4085   ppIRStmt(stmt);
4086   vpanic("iselStmt(amd64)");
4087}
4088
4089
4090/*---------------------------------------------------------*/
4091/*--- ISEL: Basic block terminators (Nexts)             ---*/
4092/*---------------------------------------------------------*/
4093
4094static void iselNext ( ISelEnv* env,
4095                       IRExpr* next, IRJumpKind jk, Int offsIP )
4096{
4097   if (vex_traceflags & VEX_TRACE_VCODE) {
4098      vex_printf( "\n-- PUT(%d) = ", offsIP);
4099      ppIRExpr( next );
4100      vex_printf( "; exit-");
4101      ppIRJumpKind(jk);
4102      vex_printf( "\n");
4103   }
4104
4105   /* Case: boring transfer to known address */
4106   if (next->tag == Iex_Const) {
4107      IRConst* cdst = next->Iex.Const.con;
4108      vassert(cdst->tag == Ico_U64);
4109      if (jk == Ijk_Boring || jk == Ijk_Call) {
4110         /* Boring transfer to known address */
4111         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4112         if (env->chainingAllowed) {
4113            /* .. almost always true .. */
4114            /* Skip the event check at the dst if this is a forwards
4115               edge. */
4116            Bool toFastEP
4117               = ((Addr64)cdst->Ico.U64) > env->max_ga;
4118            if (0) vex_printf("%s", toFastEP ? "X" : ".");
4119            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
4120                                             amRIP, Acc_ALWAYS,
4121                                             toFastEP));
4122         } else {
4123            /* .. very occasionally .. */
4124            /* We can't use chaining, so ask for an assisted transfer,
4125               as that's the cheapest alternative that is
4126               allowable. */
4127            HReg r = iselIntExpr_R(env, next);
4128            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
4129                                               Ijk_Boring));
4130         }
4131         return;
4132      }
4133   }
4134
4135   /* Case: call/return (==boring) transfer to any address */
4136   switch (jk) {
4137      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4138         HReg        r     = iselIntExpr_R(env, next);
4139         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4140         if (env->chainingAllowed) {
4141            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
4142         } else {
4143            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
4144                                               Ijk_Boring));
4145         }
4146         return;
4147      }
4148      default:
4149         break;
4150   }
4151
4152   /* Case: assisted transfer to arbitrary address */
4153   switch (jk) {
4154      /* Keep this list in sync with that for Ist_Exit above */
4155      case Ijk_ClientReq:
4156      case Ijk_EmWarn:
4157      case Ijk_NoDecode:
4158      case Ijk_NoRedir:
4159      case Ijk_SigSEGV:
4160      case Ijk_SigTRAP:
4161      case Ijk_Sys_syscall:
4162      case Ijk_TInval:
4163      case Ijk_Yield: {
4164         HReg        r     = iselIntExpr_R(env, next);
4165         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4166         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
4167         return;
4168      }
4169      default:
4170         break;
4171   }
4172
4173   vex_printf( "\n-- PUT(%d) = ", offsIP);
4174   ppIRExpr( next );
4175   vex_printf( "; exit-");
4176   ppIRJumpKind(jk);
4177   vex_printf( "\n");
4178   vassert(0); // are we expecting any other kind?
4179}
4180
4181
4182/*---------------------------------------------------------*/
4183/*--- Insn selector top-level                           ---*/
4184/*---------------------------------------------------------*/
4185
4186/* Translate an entire SB to amd64 code. */
4187
4188HInstrArray* iselSB_AMD64 ( IRSB* bb,
4189                            VexArch      arch_host,
4190                            VexArchInfo* archinfo_host,
4191                            VexAbiInfo*  vbi/*UNUSED*/,
4192                            Int offs_Host_EvC_Counter,
4193                            Int offs_Host_EvC_FailAddr,
4194                            Bool chainingAllowed,
4195                            Bool addProfInc,
4196                            Addr64 max_ga )
4197{
4198   Int        i, j;
4199   HReg       hreg, hregHI;
4200   ISelEnv*   env;
4201   UInt       hwcaps_host = archinfo_host->hwcaps;
4202   AMD64AMode *amCounter, *amFailAddr;
4203
4204   /* sanity ... */
4205   vassert(arch_host == VexArchAMD64);
4206   vassert(0 == (hwcaps_host
4207                 & ~(VEX_HWCAPS_AMD64_SSE3
4208                     | VEX_HWCAPS_AMD64_CX16
4209                     | VEX_HWCAPS_AMD64_LZCNT
4210                     | VEX_HWCAPS_AMD64_AVX)));
4211
4212   /* Make up an initial environment to use. */
4213   env = LibVEX_Alloc(sizeof(ISelEnv));
4214   env->vreg_ctr = 0;
4215
4216   /* Set up output code array. */
4217   env->code = newHInstrArray();
4218
4219   /* Copy BB's type env. */
4220   env->type_env = bb->tyenv;
4221
4222   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
4223      change as we go along. */
4224   env->n_vregmap = bb->tyenv->types_used;
4225   env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4226   env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4227
4228   /* and finally ... */
4229   env->chainingAllowed = chainingAllowed;
4230   env->hwcaps          = hwcaps_host;
4231   env->max_ga          = max_ga;
4232
4233   /* For each IR temporary, allocate a suitably-kinded virtual
4234      register. */
4235   j = 0;
4236   for (i = 0; i < env->n_vregmap; i++) {
4237      hregHI = hreg = INVALID_HREG;
4238      switch (bb->tyenv->types[i]) {
4239         case Ity_I1:
4240         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
4241            hreg = mkHReg(j++, HRcInt64, True);
4242            break;
4243         case Ity_I128:
4244            hreg   = mkHReg(j++, HRcInt64, True);
4245            hregHI = mkHReg(j++, HRcInt64, True);
4246            break;
4247         case Ity_F32:
4248         case Ity_F64:
4249         case Ity_V128:
4250            hreg = mkHReg(j++, HRcVec128, True);
4251            break;
4252         case Ity_V256:
4253            hreg   = mkHReg(j++, HRcVec128, True);
4254            hregHI = mkHReg(j++, HRcVec128, True);
4255            break;
4256         default:
4257            ppIRType(bb->tyenv->types[i]);
4258            vpanic("iselBB(amd64): IRTemp type");
4259      }
4260      env->vregmap[i]   = hreg;
4261      env->vregmapHI[i] = hregHI;
4262   }
4263   env->vreg_ctr = j;
4264
4265   /* The very first instruction must be an event check. */
4266   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
4267   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
4268   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
4269
4270   /* Possibly a block counter increment (for profiling).  At this
4271      point we don't know the address of the counter, so just pretend
4272      it is zero.  It will have to be patched later, but before this
4273      translation is used, by a call to LibVEX_PatchProfInc. */
4274   if (addProfInc) {
4275      addInstr(env, AMD64Instr_ProfInc());
4276   }
4277
4278   /* Ok, finally we can iterate over the statements. */
4279   for (i = 0; i < bb->stmts_used; i++)
4280      if (bb->stmts[i])
4281         iselStmt(env, bb->stmts[i]);
4282
4283   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
4284
4285   /* record the number of vregs we used. */
4286   env->code->n_vregs = env->vreg_ctr;
4287   return env->code;
4288}
4289
4290
4291/*---------------------------------------------------------------*/
4292/*--- end                                   host_amd64_isel.c ---*/
4293/*---------------------------------------------------------------*/
4294