1
2/*---------------------------------------------------------------*/
3/*--- begin                                 host_amd64_isel.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2013 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_ir.h"
38#include "libvex.h"
39
40#include "ir_match.h"
41#include "main_util.h"
42#include "main_globals.h"
43#include "host_generic_regs.h"
44#include "host_generic_simd64.h"
45#include "host_generic_simd128.h"
46#include "host_generic_simd256.h"
47#include "host_generic_maddf.h"
48#include "host_amd64_defs.h"
49
50
51/*---------------------------------------------------------*/
52/*--- x87/SSE control word stuff                        ---*/
53/*---------------------------------------------------------*/
54
55/* Vex-generated code expects to run with the FPU set as follows: all
56   exceptions masked, round-to-nearest, precision = 53 bits.  This
57   corresponds to a FPU control word value of 0x027F.
58
59   Similarly the SSE control word (%mxcsr) should be 0x1F80.
60
61   %fpucw and %mxcsr should have these values on entry to
62   Vex-generated code, and should those values should be
63   unchanged at exit.
64*/
65
66#define DEFAULT_FPUCW 0x027F
67
68#define DEFAULT_MXCSR 0x1F80
69
70/* debugging only, do not use */
71/* define DEFAULT_FPUCW 0x037F */
72
73
74/*---------------------------------------------------------*/
75/*--- misc helpers                                      ---*/
76/*---------------------------------------------------------*/
77
78/* These are duplicated in guest-amd64/toIR.c */
79static IRExpr* unop ( IROp op, IRExpr* a )
80{
81   return IRExpr_Unop(op, a);
82}
83
84static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
85{
86   return IRExpr_Binop(op, a1, a2);
87}
88
89static IRExpr* bind ( Int binder )
90{
91   return IRExpr_Binder(binder);
92}
93
94static Bool isZeroU8 ( IRExpr* e )
95{
96   return e->tag == Iex_Const
97          && e->Iex.Const.con->tag == Ico_U8
98          && e->Iex.Const.con->Ico.U8 == 0;
99}
100
101
102/*---------------------------------------------------------*/
103/*--- ISelEnv                                           ---*/
104/*---------------------------------------------------------*/
105
106/* This carries around:
107
108   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
109     might encounter.  This is computed before insn selection starts,
110     and does not change.
111
112   - A mapping from IRTemp to HReg.  This tells the insn selector
113     which virtual register is associated with each IRTemp
114     temporary.  This is computed before insn selection starts, and
115     does not change.  We expect this mapping to map precisely the
116     same set of IRTemps as the type mapping does.
117
118        - vregmap   holds the primary register for the IRTemp.
119        - vregmapHI is only used for 128-bit integer-typed
120             IRTemps.  It holds the identity of a second
121             64-bit virtual HReg, which holds the high half
122             of the value.
123
124   - The host subarchitecture we are selecting insns for.
125     This is set at the start and does not change.
126
127   - The code array, that is, the insns selected so far.
128
129   - A counter, for generating new virtual registers.
130
131   - A Bool for indicating whether we may generate chain-me
132     instructions for control flow transfers, or whether we must use
133     XAssisted.
134
135   - The maximum guest address of any guest insn in this block.
136     Actually, the address of the highest-addressed byte from any insn
137     in this block.  Is set at the start and does not change.  This is
138     used for detecting jumps which are definitely forward-edges from
139     this block, and therefore can be made (chained) to the fast entry
140     point of the destination, thereby avoiding the destination's
141     event check.
142
143   Note, this is all host-independent.  (JRS 20050201: well, kinda
144   ... not completely.  Compare with ISelEnv for X86.)
145*/
146
147typedef
148   struct {
149      /* Constant -- are set at the start and do not change. */
150      IRTypeEnv*   type_env;
151
152      HReg*        vregmap;
153      HReg*        vregmapHI;
154      Int          n_vregmap;
155
156      UInt         hwcaps;
157
158      Bool         chainingAllowed;
159      Addr64       max_ga;
160
161      /* These are modified as we go along. */
162      HInstrArray* code;
163      Int          vreg_ctr;
164   }
165   ISelEnv;
166
167
168static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
169{
170   vassert(tmp >= 0);
171   vassert(tmp < env->n_vregmap);
172   return env->vregmap[tmp];
173}
174
175static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
176                               ISelEnv* env, IRTemp tmp )
177{
178   vassert(tmp >= 0);
179   vassert(tmp < env->n_vregmap);
180   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
181   *vrLO = env->vregmap[tmp];
182   *vrHI = env->vregmapHI[tmp];
183}
184
185static void addInstr ( ISelEnv* env, AMD64Instr* instr )
186{
187   addHInstr(env->code, instr);
188   if (vex_traceflags & VEX_TRACE_VCODE) {
189      ppAMD64Instr(instr, True);
190      vex_printf("\n");
191   }
192}
193
194static HReg newVRegI ( ISelEnv* env )
195{
196   HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/);
197   env->vreg_ctr++;
198   return reg;
199}
200
201static HReg newVRegV ( ISelEnv* env )
202{
203   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
204   env->vreg_ctr++;
205   return reg;
206}
207
208
209/*---------------------------------------------------------*/
210/*--- ISEL: Forward declarations                        ---*/
211/*---------------------------------------------------------*/
212
213/* These are organised as iselXXX and iselXXX_wrk pairs.  The
214   iselXXX_wrk do the real work, but are not to be called directly.
215   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
216   checks that all returned registers are virtual.  You should not
217   call the _wrk version directly.
218*/
219static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
220static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
221
222static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, IRExpr* e );
223static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, IRExpr* e );
224
225static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, IRExpr* e );
226static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, IRExpr* e );
227
228static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, IRExpr* e );
229static HReg          iselIntExpr_R       ( ISelEnv* env, IRExpr* e );
230
231static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
232static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
233
234static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
235                                          ISelEnv* env, IRExpr* e );
236static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
237                                          ISelEnv* env, IRExpr* e );
238
239static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, IRExpr* e );
240static AMD64CondCode iselCondCode        ( ISelEnv* env, IRExpr* e );
241
242static HReg          iselDblExpr_wrk     ( ISelEnv* env, IRExpr* e );
243static HReg          iselDblExpr         ( ISelEnv* env, IRExpr* e );
244
245static HReg          iselFltExpr_wrk     ( ISelEnv* env, IRExpr* e );
246static HReg          iselFltExpr         ( ISelEnv* env, IRExpr* e );
247
248static HReg          iselVecExpr_wrk     ( ISelEnv* env, IRExpr* e );
249static HReg          iselVecExpr         ( ISelEnv* env, IRExpr* e );
250
251static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
252                                        ISelEnv* env, IRExpr* e );
253static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
254                                        ISelEnv* env, IRExpr* e );
255
256
257/*---------------------------------------------------------*/
258/*--- ISEL: Misc helpers                                ---*/
259/*---------------------------------------------------------*/
260
261static Bool sane_AMode ( AMD64AMode* am )
262{
263   switch (am->tag) {
264      case Aam_IR:
265         return
266            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
267                    && (hregIsVirtual(am->Aam.IR.reg)
268                        || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
269      case Aam_IRRS:
270         return
271            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
272                    && hregIsVirtual(am->Aam.IRRS.base)
273                    && hregClass(am->Aam.IRRS.index) == HRcInt64
274                    && hregIsVirtual(am->Aam.IRRS.index) );
275      default:
276        vpanic("sane_AMode: unknown amd64 amode tag");
277   }
278}
279
280
281/* Can the lower 32 bits be signedly widened to produce the whole
282   64-bit value?  In other words, are the top 33 bits either all 0 or
283   all 1 ? */
284static Bool fitsIn32Bits ( ULong x )
285{
286   Long y0 = (Long)x;
287   Long y1 = y0;
288   y1 <<= 32;
289   y1 >>=/*s*/ 32;
290   return toBool(x == y1);
291}
292
293/* Is this a 64-bit zero expression? */
294
295static Bool isZeroU64 ( IRExpr* e )
296{
297   return e->tag == Iex_Const
298          && e->Iex.Const.con->tag == Ico_U64
299          && e->Iex.Const.con->Ico.U64 == 0ULL;
300}
301
302static Bool isZeroU32 ( IRExpr* e )
303{
304   return e->tag == Iex_Const
305          && e->Iex.Const.con->tag == Ico_U32
306          && e->Iex.Const.con->Ico.U32 == 0;
307}
308
309/* Make a int reg-reg move. */
310
311static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
312{
313   vassert(hregClass(src) == HRcInt64);
314   vassert(hregClass(dst) == HRcInt64);
315   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
316}
317
318/* Make a vector (128 bit) reg-reg move. */
319
320static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
321{
322   vassert(hregClass(src) == HRcVec128);
323   vassert(hregClass(dst) == HRcVec128);
324   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
325}
326
327/* Advance/retreat %rsp by n. */
328
329static void add_to_rsp ( ISelEnv* env, Int n )
330{
331   vassert(n > 0 && n < 256 && (n%8) == 0);
332   addInstr(env,
333            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
334                                        hregAMD64_RSP()));
335}
336
337static void sub_from_rsp ( ISelEnv* env, Int n )
338{
339   vassert(n > 0 && n < 256 && (n%8) == 0);
340   addInstr(env,
341            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
342                                        hregAMD64_RSP()));
343}
344
345/* Push 64-bit constants on the stack. */
346static void push_uimm64( ISelEnv* env, ULong uimm64 )
347{
348   /* If uimm64 can be expressed as the sign extension of its
349      lower 32 bits, we can do it the easy way. */
350   Long simm64 = (Long)uimm64;
351   if ( simm64 == ((simm64 << 32) >> 32) ) {
352      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
353   } else {
354      HReg tmp = newVRegI(env);
355      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
356      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
357   }
358}
359
360
361/* Used only in doHelperCall.  If possible, produce a single
362   instruction which computes 'e' into 'dst'.  If not possible, return
363   NULL. */
364
365static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
366                                                    HReg     dst,
367                                                    IRExpr*  e )
368{
369   /* Per comments in doHelperCall below, appearance of
370      Iex_VECRET implies ill-formed IR. */
371   vassert(e->tag != Iex_VECRET);
372
373   /* In this case we give out a copy of the BaseBlock pointer. */
374   if (UNLIKELY(e->tag == Iex_BBPTR)) {
375      return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
376   }
377
378   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
379
380   if (e->tag == Iex_Const) {
381      vassert(e->Iex.Const.con->tag == Ico_U64);
382      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
383         return AMD64Instr_Alu64R(
384                   Aalu_MOV,
385                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
386                   dst
387                );
388      } else {
389         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
390      }
391   }
392
393   if (e->tag == Iex_RdTmp) {
394      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
395      return mk_iMOVsd_RR(src, dst);
396   }
397
398   if (e->tag == Iex_Get) {
399      vassert(e->Iex.Get.ty == Ity_I64);
400      return AMD64Instr_Alu64R(
401                Aalu_MOV,
402                AMD64RMI_Mem(
403                   AMD64AMode_IR(e->Iex.Get.offset,
404                                 hregAMD64_RBP())),
405                dst);
406   }
407
408   if (e->tag == Iex_Unop
409       && e->Iex.Unop.op == Iop_32Uto64
410       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
411      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
412      return AMD64Instr_MovxLQ(False, src, dst);
413   }
414
415   if (0) { ppIRExpr(e); vex_printf("\n"); }
416
417   return NULL;
418}
419
420
421/* Do a complete function call.  |guard| is a Ity_Bit expression
422   indicating whether or not the call happens.  If guard==NULL, the
423   call is unconditional.  |retloc| is set to indicate where the
424   return value is after the call.  The caller (of this fn) must
425   generate code to add |stackAdjustAfterCall| to the stack pointer
426   after the call is done. */
427
428static
429void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
430                    /*OUT*/RetLoc* retloc,
431                    ISelEnv* env,
432                    IRExpr* guard,
433                    IRCallee* cee, IRType retTy, IRExpr** args )
434{
435   AMD64CondCode cc;
436   HReg          argregs[6];
437   HReg          tmpregs[6];
438   AMD64Instr*   fastinstrs[6];
439   UInt          n_args, i;
440
441   /* Set default returns.  We'll update them later if needed. */
442   *stackAdjustAfterCall = 0;
443   *retloc               = mk_RetLoc_INVALID();
444
445   /* These are used for cross-checking that IR-level constraints on
446      the use of IRExpr_VECRET() and IRExpr_BBPTR() are observed. */
447   UInt nVECRETs = 0;
448   UInt nBBPTRs  = 0;
449
450   /* Marshal args for a call and do the call.
451
452      This function only deals with a tiny set of possibilities, which
453      cover all helpers in practice.  The restrictions are that only
454      arguments in registers are supported, hence only 6x64 integer
455      bits in total can be passed.  In fact the only supported arg
456      type is I64.
457
458      The return type can be I{64,32,16,8} or V{128,256}.  In the
459      latter two cases, it is expected that |args| will contain the
460      special node IRExpr_VECRET(), in which case this routine
461      generates code to allocate space on the stack for the vector
462      return value.  Since we are not passing any scalars on the
463      stack, it is enough to preallocate the return space before
464      marshalling any arguments, in this case.
465
466      |args| may also contain IRExpr_BBPTR(), in which case the
467      value in %rbp is passed as the corresponding argument.
468
469      Generating code which is both efficient and correct when
470      parameters are to be passed in registers is difficult, for the
471      reasons elaborated in detail in comments attached to
472      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
473      of the method described in those comments.
474
475      The problem is split into two cases: the fast scheme and the
476      slow scheme.  In the fast scheme, arguments are computed
477      directly into the target (real) registers.  This is only safe
478      when we can be sure that computation of each argument will not
479      trash any real registers set by computation of any other
480      argument.
481
482      In the slow scheme, all args are first computed into vregs, and
483      once they are all done, they are moved to the relevant real
484      regs.  This always gives correct code, but it also gives a bunch
485      of vreg-to-rreg moves which are usually redundant but are hard
486      for the register allocator to get rid of.
487
488      To decide which scheme to use, all argument expressions are
489      first examined.  If they are all so simple that it is clear they
490      will be evaluated without use of any fixed registers, use the
491      fast scheme, else use the slow scheme.  Note also that only
492      unconditional calls may use the fast scheme, since having to
493      compute a condition expression could itself trash real
494      registers.  Note that for simplicity, in the case where
495      IRExpr_VECRET() is present, we use the slow scheme.  This is
496      motivated by the desire to avoid any possible complexity
497      w.r.t. nested calls.
498
499      Note this requires being able to examine an expression and
500      determine whether or not evaluation of it might use a fixed
501      register.  That requires knowledge of how the rest of this insn
502      selector works.  Currently just the following 3 are regarded as
503      safe -- hopefully they cover the majority of arguments in
504      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
505   */
506
507   /* Note that the cee->regparms field is meaningless on AMD64 host
508      (since there is only one calling convention) and so we always
509      ignore it. */
510   n_args = 0;
511   for (i = 0; args[i]; i++)
512      n_args++;
513
514   if (n_args > 6)
515      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
516
517   argregs[0] = hregAMD64_RDI();
518   argregs[1] = hregAMD64_RSI();
519   argregs[2] = hregAMD64_RDX();
520   argregs[3] = hregAMD64_RCX();
521   argregs[4] = hregAMD64_R8();
522   argregs[5] = hregAMD64_R9();
523
524   tmpregs[0] = tmpregs[1] = tmpregs[2] =
525   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
526
527   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
528   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
529
530   /* First decide which scheme (slow or fast) is to be used.  First
531      assume the fast scheme, and select slow if any contraindications
532      (wow) appear. */
533
534   /* We'll need space on the stack for the return value.  Avoid
535      possible complications with nested calls by using the slow
536      scheme. */
537   if (retTy == Ity_V128 || retTy == Ity_V256)
538      goto slowscheme;
539
540   if (guard) {
541      if (guard->tag == Iex_Const
542          && guard->Iex.Const.con->tag == Ico_U1
543          && guard->Iex.Const.con->Ico.U1 == True) {
544         /* unconditional */
545      } else {
546         /* Not manifestly unconditional -- be conservative. */
547         goto slowscheme;
548      }
549   }
550
551   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
552      use the slow scheme.  Because this is tentative, we can't call
553      addInstr (that is, commit to) any instructions until we're
554      handled all the arguments.  So park the resulting instructions
555      in a buffer and emit that if we're successful. */
556
557   /* FAST SCHEME */
558   /* In this loop, we process args that can be computed into the
559      destination (real) register with a single instruction, without
560      using any fixed regs.  That also includes IRExpr_BBPTR(), but
561      not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
562      never see IRExpr_VECRET() at this point, since the return-type
563      check above should ensure all those cases use the slow scheme
564      instead. */
565   vassert(n_args >= 0 && n_args <= 6);
566   for (i = 0; i < n_args; i++) {
567      IRExpr* arg = args[i];
568      if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) {
569         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
570      }
571      fastinstrs[i]
572         = iselIntExpr_single_instruction( env, argregs[i], args[i] );
573      if (fastinstrs[i] == NULL)
574         goto slowscheme;
575   }
576
577   /* Looks like we're in luck.  Emit the accumulated instructions and
578      move on to doing the call itself. */
579   for (i = 0; i < n_args; i++)
580      addInstr(env, fastinstrs[i]);
581
582   /* Fast scheme only applies for unconditional calls.  Hence: */
583   cc = Acc_ALWAYS;
584
585   goto handle_call;
586
587
588   /* SLOW SCHEME; move via temporaries */
589  slowscheme:
590   {}
591#  if 0 /* debug only */
592   if (n_args > 0) {for (i = 0; args[i]; i++) {
593   ppIRExpr(args[i]); vex_printf(" "); }
594   vex_printf("\n");}
595#  endif
596
597   /* If we have a vector return type, allocate a place for it on the
598      stack and record its address. */
599   HReg r_vecRetAddr = INVALID_HREG;
600   if (retTy == Ity_V128) {
601      r_vecRetAddr = newVRegI(env);
602      sub_from_rsp(env, 16);
603      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
604   }
605   else if (retTy == Ity_V256) {
606      r_vecRetAddr = newVRegI(env);
607      sub_from_rsp(env, 32);
608      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
609   }
610
611   vassert(n_args >= 0 && n_args <= 6);
612   for (i = 0; i < n_args; i++) {
613      IRExpr* arg = args[i];
614      if (UNLIKELY(arg->tag == Iex_BBPTR)) {
615         tmpregs[i] = newVRegI(env);
616         addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
617         nBBPTRs++;
618      }
619      else if (UNLIKELY(arg->tag == Iex_VECRET)) {
620         /* We stashed the address of the return slot earlier, so just
621            retrieve it now. */
622         vassert(!hregIsInvalid(r_vecRetAddr));
623         tmpregs[i] = r_vecRetAddr;
624         nVECRETs++;
625      }
626      else {
627         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
628         tmpregs[i] = iselIntExpr_R(env, args[i]);
629      }
630   }
631
632   /* Now we can compute the condition.  We can't do it earlier
633      because the argument computations could trash the condition
634      codes.  Be a bit clever to handle the common case where the
635      guard is 1:Bit. */
636   cc = Acc_ALWAYS;
637   if (guard) {
638      if (guard->tag == Iex_Const
639          && guard->Iex.Const.con->tag == Ico_U1
640          && guard->Iex.Const.con->Ico.U1 == True) {
641         /* unconditional -- do nothing */
642      } else {
643         cc = iselCondCode( env, guard );
644      }
645   }
646
647   /* Move the args to their final destinations. */
648   for (i = 0; i < n_args; i++) {
649      /* None of these insns, including any spill code that might
650         be generated, may alter the condition codes. */
651      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
652   }
653
654
655   /* Do final checks, set the return values, and generate the call
656      instruction proper. */
657  handle_call:
658
659   if (retTy == Ity_V128 || retTy == Ity_V256) {
660      vassert(nVECRETs == 1);
661   } else {
662      vassert(nVECRETs == 0);
663   }
664
665   vassert(nBBPTRs == 0 || nBBPTRs == 1);
666
667   vassert(*stackAdjustAfterCall == 0);
668   vassert(is_RetLoc_INVALID(*retloc));
669   switch (retTy) {
670         case Ity_INVALID:
671            /* Function doesn't return a value. */
672            *retloc = mk_RetLoc_simple(RLPri_None);
673            break;
674         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
675            *retloc = mk_RetLoc_simple(RLPri_Int);
676            break;
677         case Ity_V128:
678            *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
679            *stackAdjustAfterCall = 16;
680            break;
681         case Ity_V256:
682            *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
683            *stackAdjustAfterCall = 32;
684            break;
685         default:
686            /* IR can denote other possible return types, but we don't
687               handle those here. */
688           vassert(0);
689   }
690
691   /* Finally, generate the call itself.  This needs the *retloc value
692      set in the switch above, which is why it's at the end. */
693   addInstr(env,
694            AMD64Instr_Call(cc, Ptr_to_ULong(cee->addr), n_args, *retloc));
695}
696
697
698/* Given a guest-state array descriptor, an index expression and a
699   bias, generate an AMD64AMode holding the relevant guest state
700   offset. */
701
702static
703AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
704                                  IRExpr* off, Int bias )
705{
706   HReg tmp, roff;
707   Int  elemSz = sizeofIRType(descr->elemTy);
708   Int  nElems = descr->nElems;
709
710   /* Throw out any cases not generated by an amd64 front end.  In
711      theory there might be a day where we need to handle them -- if
712      we ever run non-amd64-guest on amd64 host. */
713
714   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
715      vpanic("genGuestArrayOffset(amd64 host)");
716
717   /* Compute off into a reg, %off.  Then return:
718
719         movq %off, %tmp
720         addq $bias, %tmp  (if bias != 0)
721         andq %tmp, 7
722         ... base(%rbp, %tmp, shift) ...
723   */
724   tmp  = newVRegI(env);
725   roff = iselIntExpr_R(env, off);
726   addInstr(env, mk_iMOVsd_RR(roff, tmp));
727   if (bias != 0) {
728      /* Make sure the bias is sane, in the sense that there are
729         no significant bits above bit 30 in it. */
730      vassert(-10000 < bias && bias < 10000);
731      addInstr(env,
732               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
733   }
734   addInstr(env,
735            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
736   vassert(elemSz == 1 || elemSz == 8);
737   return
738      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
739                                    elemSz==8 ? 3 : 0);
740}
741
742
743/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
744static
745void set_SSE_rounding_default ( ISelEnv* env )
746{
747   /* pushq $DEFAULT_MXCSR
748      ldmxcsr 0(%rsp)
749      addq $8, %rsp
750   */
751   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
752   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
753   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
754   add_to_rsp(env, 8);
755}
756
757/* Mess with the FPU's rounding mode: set to the default rounding mode
758   (DEFAULT_FPUCW). */
759static
760void set_FPU_rounding_default ( ISelEnv* env )
761{
762   /* movq $DEFAULT_FPUCW, -8(%rsp)
763      fldcw -8(%esp)
764   */
765   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
766   addInstr(env, AMD64Instr_Alu64M(
767                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
768   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
769}
770
771
772/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
773   expression denoting a value in the range 0 .. 3, indicating a round
774   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
775   have the same rounding.
776*/
777static
778void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
779{
780   /* Note: this sequence only makes sense because DEFAULT_MXCSR has
781      both rounding bits == 0.  If that wasn't the case, we couldn't
782      create a new rounding field simply by ORing the new value into
783      place. */
784
785   /* movq $3, %reg
786      andq [[mode]], %reg  -- shouldn't be needed; paranoia
787      shlq $13, %reg
788      orq $DEFAULT_MXCSR, %reg
789      pushq %reg
790      ldmxcsr 0(%esp)
791      addq $8, %rsp
792   */
793   HReg        reg      = newVRegI(env);
794   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
795   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
796   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
797                                   iselIntExpr_RMI(env, mode), reg));
798   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
799   addInstr(env, AMD64Instr_Alu64R(
800                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
801   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
802   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
803   add_to_rsp(env, 8);
804}
805
806
807/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
808   expression denoting a value in the range 0 .. 3, indicating a round
809   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
810   the same rounding.
811*/
812static
813void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
814{
815   HReg rrm  = iselIntExpr_R(env, mode);
816   HReg rrm2 = newVRegI(env);
817   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
818
819   /* movq  %rrm, %rrm2
820      andq  $3, %rrm2   -- shouldn't be needed; paranoia
821      shlq  $10, %rrm2
822      orq   $DEFAULT_FPUCW, %rrm2
823      movq  %rrm2, -8(%rsp)
824      fldcw -8(%esp)
825   */
826   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
827   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
828   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
829   addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
830                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
831   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
832                                   AMD64RI_Reg(rrm2), m8_rsp));
833   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
834}
835
836
837/* Generate all-zeroes into a new vector register.
838*/
839static HReg generate_zeroes_V128 ( ISelEnv* env )
840{
841   HReg dst = newVRegV(env);
842   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
843   return dst;
844}
845
846/* Generate all-ones into a new vector register.
847*/
848static HReg generate_ones_V128 ( ISelEnv* env )
849{
850   HReg dst = newVRegV(env);
851   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
852   return dst;
853}
854
855
856/* Generate !src into a new vector register.  Amazing that there isn't
857   a less crappy way to do this.
858*/
859static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
860{
861   HReg dst = generate_ones_V128(env);
862   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
863   return dst;
864}
865
866
867/* Expand the given byte into a 64-bit word, by cloning each bit
868   8 times. */
869static ULong bitmask8_to_bytemask64 ( UShort w8 )
870{
871   vassert(w8 == (w8 & 0xFF));
872   ULong w64 = 0;
873   Int i;
874   for (i = 0; i < 8; i++) {
875      if (w8 & (1<<i))
876         w64 |= (0xFFULL << (8 * i));
877   }
878   return w64;
879}
880
881
882/*---------------------------------------------------------*/
883/*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
884/*---------------------------------------------------------*/
885
886/* Select insns for an integer-typed expression, and add them to the
887   code list.  Return a reg holding the result.  This reg will be a
888   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
889   want to modify it, ask for a new vreg, copy it in there, and modify
890   the copy.  The register allocator will do its best to map both
891   vregs to the same real register, so the copies will often disappear
892   later in the game.
893
894   This should handle expressions of 64, 32, 16 and 8-bit type.  All
895   results are returned in a 64-bit register.  For 32-, 16- and 8-bit
896   expressions, the upper 32/48/56 bits are arbitrary, so you should
897   mask or sign extend partial values if necessary.
898*/
899
900static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
901{
902   HReg r = iselIntExpr_R_wrk(env, e);
903   /* sanity checks ... */
904#  if 0
905   vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
906#  endif
907   vassert(hregClass(r) == HRcInt64);
908   vassert(hregIsVirtual(r));
909   return r;
910}
911
912/* DO NOT CALL THIS DIRECTLY ! */
913static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
914{
915   /* Used for unary/binary SIMD64 ops. */
916   HWord fn = 0;
917   Bool second_is_UInt;
918
919   MatchInfo mi;
920   DECLARE_PATTERN(p_1Uto8_64to1);
921   DECLARE_PATTERN(p_LDle8_then_8Uto64);
922   DECLARE_PATTERN(p_LDle16_then_16Uto64);
923
924   IRType ty = typeOfIRExpr(env->type_env,e);
925   switch (ty) {
926      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
927      default: vassert(0);
928   }
929
930   switch (e->tag) {
931
932   /* --------- TEMP --------- */
933   case Iex_RdTmp: {
934      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
935   }
936
937   /* --------- LOAD --------- */
938   case Iex_Load: {
939      HReg dst = newVRegI(env);
940      AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
941
942      /* We can't handle big-endian loads, nor load-linked. */
943      if (e->Iex.Load.end != Iend_LE)
944         goto irreducible;
945
946      if (ty == Ity_I64) {
947         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
948                                         AMD64RMI_Mem(amode), dst) );
949         return dst;
950      }
951      if (ty == Ity_I32) {
952         addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
953         return dst;
954      }
955      if (ty == Ity_I16) {
956         addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
957         return dst;
958      }
959      if (ty == Ity_I8) {
960         addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
961         return dst;
962      }
963      break;
964   }
965
966   /* --------- BINARY OP --------- */
967   case Iex_Binop: {
968      AMD64AluOp   aluOp;
969      AMD64ShiftOp shOp;
970
971      /* Pattern: Sub64(0,x) */
972      /*     and: Sub32(0,x) */
973      if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
974          || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
975         HReg dst = newVRegI(env);
976         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
977         addInstr(env, mk_iMOVsd_RR(reg,dst));
978         addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
979         return dst;
980      }
981
982      /* Is it an addition or logical style op? */
983      switch (e->Iex.Binop.op) {
984         case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
985            aluOp = Aalu_ADD; break;
986         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
987            aluOp = Aalu_SUB; break;
988         case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
989            aluOp = Aalu_AND; break;
990         case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
991            aluOp = Aalu_OR; break;
992         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
993            aluOp = Aalu_XOR; break;
994         case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
995            aluOp = Aalu_MUL; break;
996         default:
997            aluOp = Aalu_INVALID; break;
998      }
999      /* For commutative ops we assume any literal
1000         values are on the second operand. */
1001      if (aluOp != Aalu_INVALID) {
1002         HReg dst      = newVRegI(env);
1003         HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
1004         AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1005         addInstr(env, mk_iMOVsd_RR(reg,dst));
1006         addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1007         return dst;
1008      }
1009
1010      /* Perhaps a shift op? */
1011      switch (e->Iex.Binop.op) {
1012         case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1013            shOp = Ash_SHL; break;
1014         case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1015            shOp = Ash_SHR; break;
1016         case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1017            shOp = Ash_SAR; break;
1018         default:
1019            shOp = Ash_INVALID; break;
1020      }
1021      if (shOp != Ash_INVALID) {
1022         HReg dst = newVRegI(env);
1023
1024         /* regL = the value to be shifted */
1025         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1026         addInstr(env, mk_iMOVsd_RR(regL,dst));
1027
1028         /* Do any necessary widening for 32/16/8 bit operands */
1029         switch (e->Iex.Binop.op) {
1030            case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1031               break;
1032            case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1033               break;
1034            case Iop_Shr8:
1035               addInstr(env, AMD64Instr_Alu64R(
1036                                Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1037               break;
1038            case Iop_Shr16:
1039               addInstr(env, AMD64Instr_Alu64R(
1040                                Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1041               break;
1042            case Iop_Shr32:
1043               addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1044               break;
1045            case Iop_Sar8:
1046               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1047               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1048               break;
1049            case Iop_Sar16:
1050               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1051               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1052               break;
1053            case Iop_Sar32:
1054               addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1055               break;
1056            default:
1057               ppIROp(e->Iex.Binop.op);
1058               vassert(0);
1059         }
1060
1061         /* Now consider the shift amount.  If it's a literal, we
1062            can do a much better job than the general case. */
1063         if (e->Iex.Binop.arg2->tag == Iex_Const) {
1064            /* assert that the IR is well-typed */
1065            Int nshift;
1066            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1067            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1068            vassert(nshift >= 0);
1069            if (nshift > 0)
1070               /* Can't allow nshift==0 since that means %cl */
1071               addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1072         } else {
1073            /* General case; we have to force the amount into %cl. */
1074            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1075            addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1076            addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1077         }
1078         return dst;
1079      }
1080
1081      /* Deal with 64-bit SIMD binary ops */
1082      second_is_UInt = False;
1083      switch (e->Iex.Binop.op) {
1084         case Iop_Add8x8:
1085            fn = (HWord)h_generic_calc_Add8x8; break;
1086         case Iop_Add16x4:
1087            fn = (HWord)h_generic_calc_Add16x4; break;
1088         case Iop_Add32x2:
1089            fn = (HWord)h_generic_calc_Add32x2; break;
1090
1091         case Iop_Avg8Ux8:
1092            fn = (HWord)h_generic_calc_Avg8Ux8; break;
1093         case Iop_Avg16Ux4:
1094            fn = (HWord)h_generic_calc_Avg16Ux4; break;
1095
1096         case Iop_CmpEQ8x8:
1097            fn = (HWord)h_generic_calc_CmpEQ8x8; break;
1098         case Iop_CmpEQ16x4:
1099            fn = (HWord)h_generic_calc_CmpEQ16x4; break;
1100         case Iop_CmpEQ32x2:
1101            fn = (HWord)h_generic_calc_CmpEQ32x2; break;
1102
1103         case Iop_CmpGT8Sx8:
1104            fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
1105         case Iop_CmpGT16Sx4:
1106            fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
1107         case Iop_CmpGT32Sx2:
1108            fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
1109
1110         case Iop_InterleaveHI8x8:
1111            fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
1112         case Iop_InterleaveLO8x8:
1113            fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
1114         case Iop_InterleaveHI16x4:
1115            fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
1116         case Iop_InterleaveLO16x4:
1117            fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
1118         case Iop_InterleaveHI32x2:
1119            fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
1120         case Iop_InterleaveLO32x2:
1121            fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
1122         case Iop_CatOddLanes16x4:
1123            fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1124         case Iop_CatEvenLanes16x4:
1125            fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1126         case Iop_Perm8x8:
1127            fn = (HWord)h_generic_calc_Perm8x8; break;
1128
1129         case Iop_Max8Ux8:
1130            fn = (HWord)h_generic_calc_Max8Ux8; break;
1131         case Iop_Max16Sx4:
1132            fn = (HWord)h_generic_calc_Max16Sx4; break;
1133         case Iop_Min8Ux8:
1134            fn = (HWord)h_generic_calc_Min8Ux8; break;
1135         case Iop_Min16Sx4:
1136            fn = (HWord)h_generic_calc_Min16Sx4; break;
1137
1138         case Iop_Mul16x4:
1139            fn = (HWord)h_generic_calc_Mul16x4; break;
1140         case Iop_Mul32x2:
1141            fn = (HWord)h_generic_calc_Mul32x2; break;
1142         case Iop_MulHi16Sx4:
1143            fn = (HWord)h_generic_calc_MulHi16Sx4; break;
1144         case Iop_MulHi16Ux4:
1145            fn = (HWord)h_generic_calc_MulHi16Ux4; break;
1146
1147         case Iop_QAdd8Sx8:
1148            fn = (HWord)h_generic_calc_QAdd8Sx8; break;
1149         case Iop_QAdd16Sx4:
1150            fn = (HWord)h_generic_calc_QAdd16Sx4; break;
1151         case Iop_QAdd8Ux8:
1152            fn = (HWord)h_generic_calc_QAdd8Ux8; break;
1153         case Iop_QAdd16Ux4:
1154            fn = (HWord)h_generic_calc_QAdd16Ux4; break;
1155
1156         case Iop_QNarrowBin32Sto16Sx4:
1157            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1158         case Iop_QNarrowBin16Sto8Sx8:
1159            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1160         case Iop_QNarrowBin16Sto8Ux8:
1161            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1162         case Iop_NarrowBin16to8x8:
1163            fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1164         case Iop_NarrowBin32to16x4:
1165            fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1166
1167         case Iop_QSub8Sx8:
1168            fn = (HWord)h_generic_calc_QSub8Sx8; break;
1169         case Iop_QSub16Sx4:
1170            fn = (HWord)h_generic_calc_QSub16Sx4; break;
1171         case Iop_QSub8Ux8:
1172            fn = (HWord)h_generic_calc_QSub8Ux8; break;
1173         case Iop_QSub16Ux4:
1174            fn = (HWord)h_generic_calc_QSub16Ux4; break;
1175
1176         case Iop_Sub8x8:
1177            fn = (HWord)h_generic_calc_Sub8x8; break;
1178         case Iop_Sub16x4:
1179            fn = (HWord)h_generic_calc_Sub16x4; break;
1180         case Iop_Sub32x2:
1181            fn = (HWord)h_generic_calc_Sub32x2; break;
1182
1183         case Iop_ShlN32x2:
1184            fn = (HWord)h_generic_calc_ShlN32x2;
1185            second_is_UInt = True;
1186            break;
1187         case Iop_ShlN16x4:
1188            fn = (HWord)h_generic_calc_ShlN16x4;
1189            second_is_UInt = True;
1190            break;
1191         case Iop_ShlN8x8:
1192            fn = (HWord)h_generic_calc_ShlN8x8;
1193            second_is_UInt = True;
1194            break;
1195         case Iop_ShrN32x2:
1196            fn = (HWord)h_generic_calc_ShrN32x2;
1197            second_is_UInt = True;
1198            break;
1199         case Iop_ShrN16x4:
1200            fn = (HWord)h_generic_calc_ShrN16x4;
1201            second_is_UInt = True;
1202            break;
1203         case Iop_SarN32x2:
1204            fn = (HWord)h_generic_calc_SarN32x2;
1205            second_is_UInt = True;
1206            break;
1207         case Iop_SarN16x4:
1208            fn = (HWord)h_generic_calc_SarN16x4;
1209            second_is_UInt = True;
1210            break;
1211         case Iop_SarN8x8:
1212            fn = (HWord)h_generic_calc_SarN8x8;
1213            second_is_UInt = True;
1214            break;
1215
1216         default:
1217            fn = (HWord)0; break;
1218      }
1219      if (fn != (HWord)0) {
1220         /* Note: the following assumes all helpers are of signature
1221               ULong fn ( ULong, ULong ), and they are
1222            not marked as regparm functions.
1223         */
1224         HReg dst  = newVRegI(env);
1225         HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1226         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1227         if (second_is_UInt)
1228            addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1229         addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1230         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1231         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1232                                        mk_RetLoc_simple(RLPri_Int) ));
1233         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1234         return dst;
1235      }
1236
1237      /* Handle misc other ops. */
1238
1239      if (e->Iex.Binop.op == Iop_Max32U) {
1240         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1241         HReg dst  = newVRegI(env);
1242         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1243         addInstr(env, mk_iMOVsd_RR(src1, dst));
1244         addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1245         addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
1246         return dst;
1247      }
1248
1249      if (e->Iex.Binop.op == Iop_DivModS64to32
1250          || e->Iex.Binop.op == Iop_DivModU64to32) {
1251         /* 64 x 32 -> (32(rem),32(div)) division */
1252         /* Get the 64-bit operand into edx:eax, and the other into
1253            any old R/M. */
1254         HReg      rax     = hregAMD64_RAX();
1255         HReg      rdx     = hregAMD64_RDX();
1256         HReg      dst     = newVRegI(env);
1257         Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1258         AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1259         /* Compute the left operand into a reg, and then
1260            put the top half in edx and the bottom in eax. */
1261         HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1262         addInstr(env, mk_iMOVsd_RR(left64, rdx));
1263         addInstr(env, mk_iMOVsd_RR(left64, rax));
1264         addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1265         addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1266	 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1267	 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1268         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1269         addInstr(env, mk_iMOVsd_RR(rax, dst));
1270         addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1271         return dst;
1272      }
1273
1274      if (e->Iex.Binop.op == Iop_32HLto64) {
1275         HReg hi32  = newVRegI(env);
1276         HReg lo32  = newVRegI(env);
1277         HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1278         HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1279         addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1280         addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1281         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1282	 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1283         addInstr(env, AMD64Instr_Alu64R(
1284                          Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1285         return hi32;
1286      }
1287
1288      if (e->Iex.Binop.op == Iop_16HLto32) {
1289         HReg hi16  = newVRegI(env);
1290         HReg lo16  = newVRegI(env);
1291         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1292         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1293         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1294         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1295         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1296         addInstr(env, AMD64Instr_Alu64R(
1297                          Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1298         addInstr(env, AMD64Instr_Alu64R(
1299                          Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1300         return hi16;
1301      }
1302
1303      if (e->Iex.Binop.op == Iop_8HLto16) {
1304         HReg hi8  = newVRegI(env);
1305         HReg lo8  = newVRegI(env);
1306         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1307         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1308         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1309         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1310         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1311         addInstr(env, AMD64Instr_Alu64R(
1312                          Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1313         addInstr(env, AMD64Instr_Alu64R(
1314                          Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1315         return hi8;
1316      }
1317
1318      if (e->Iex.Binop.op == Iop_MullS32
1319          || e->Iex.Binop.op == Iop_MullS16
1320          || e->Iex.Binop.op == Iop_MullS8
1321          || e->Iex.Binop.op == Iop_MullU32
1322          || e->Iex.Binop.op == Iop_MullU16
1323          || e->Iex.Binop.op == Iop_MullU8) {
1324         HReg a32   = newVRegI(env);
1325         HReg b32   = newVRegI(env);
1326         HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1327         HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1328         Int          shift  = 0;
1329         AMD64ShiftOp shr_op = Ash_SHR;
1330         switch (e->Iex.Binop.op) {
1331            case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1332            case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1333            case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
1334            case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1335            case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1336            case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
1337            default: vassert(0);
1338         }
1339
1340         addInstr(env, mk_iMOVsd_RR(a32s, a32));
1341         addInstr(env, mk_iMOVsd_RR(b32s, b32));
1342         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1343         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1344         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
1345         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
1346         addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1347         return b32;
1348      }
1349
1350      if (e->Iex.Binop.op == Iop_CmpF64) {
1351         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1352         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1353         HReg dst = newVRegI(env);
1354         addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1355         /* Mask out irrelevant parts of the result so as to conform
1356            to the CmpF64 definition. */
1357         addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1358         return dst;
1359      }
1360
1361      if (e->Iex.Binop.op == Iop_F64toI32S
1362          || e->Iex.Binop.op == Iop_F64toI64S) {
1363         Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1364         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1365         HReg dst = newVRegI(env);
1366         set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1367         addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1368         set_SSE_rounding_default(env);
1369         return dst;
1370      }
1371
1372      break;
1373   }
1374
1375   /* --------- UNARY OP --------- */
1376   case Iex_Unop: {
1377
1378      /* 1Uto8(64to1(expr64)) */
1379      {
1380         DEFINE_PATTERN( p_1Uto8_64to1,
1381                         unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1382         if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1383            IRExpr* expr64 = mi.bindee[0];
1384            HReg    dst    = newVRegI(env);
1385            HReg    src    = iselIntExpr_R(env, expr64);
1386            addInstr(env, mk_iMOVsd_RR(src,dst) );
1387            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1388                                            AMD64RMI_Imm(1), dst));
1389            return dst;
1390         }
1391      }
1392
1393      /* 8Uto64(LDle(expr64)) */
1394      {
1395         DEFINE_PATTERN(p_LDle8_then_8Uto64,
1396                        unop(Iop_8Uto64,
1397                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1398         if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1399            HReg dst = newVRegI(env);
1400            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1401            addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1402            return dst;
1403         }
1404      }
1405
1406      /* 16Uto64(LDle(expr64)) */
1407      {
1408         DEFINE_PATTERN(p_LDle16_then_16Uto64,
1409                        unop(Iop_16Uto64,
1410                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1411         if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1412            HReg dst = newVRegI(env);
1413            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1414            addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1415            return dst;
1416         }
1417      }
1418
1419      /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1420         Use 32 bit arithmetic and let the default zero-extend rule
1421         do the 32Uto64 for free. */
1422      if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1423         IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1424         IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1425         IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1426         AMD64AluOp aluOp = Aalu_INVALID;
1427         switch (opi) {
1428            case Iop_Add32: aluOp = Aalu_ADD; break;
1429            case Iop_Sub32: aluOp = Aalu_SUB; break;
1430            case Iop_And32: aluOp = Aalu_AND; break;
1431            case Iop_Or32:  aluOp = Aalu_OR;  break;
1432            case Iop_Xor32: aluOp = Aalu_XOR; break;
1433            default: break;
1434         }
1435         if (aluOp != Aalu_INVALID) {
1436            /* For commutative ops we assume any literal values are on
1437               the second operand. */
1438            HReg dst      = newVRegI(env);
1439            HReg reg      = iselIntExpr_R(env, argL);
1440            AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1441            addInstr(env, mk_iMOVsd_RR(reg,dst));
1442            addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1443            return dst;
1444         }
1445         /* just fall through to normal handling for Iop_32Uto64 */
1446      }
1447
1448      /* Fallback cases */
1449      switch (e->Iex.Unop.op) {
1450         case Iop_32Uto64:
1451         case Iop_32Sto64: {
1452            HReg dst = newVRegI(env);
1453            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1454            addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1455                                            src, dst) );
1456            return dst;
1457         }
1458         case Iop_128HIto64: {
1459            HReg rHi, rLo;
1460            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1461            return rHi; /* and abandon rLo */
1462         }
1463         case Iop_128to64: {
1464            HReg rHi, rLo;
1465            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1466            return rLo; /* and abandon rHi */
1467         }
1468         case Iop_8Uto16:
1469         case Iop_8Uto32:
1470         case Iop_8Uto64:
1471         case Iop_16Uto64:
1472         case Iop_16Uto32: {
1473            HReg dst     = newVRegI(env);
1474            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1475            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1476                                   || e->Iex.Unop.op==Iop_16Uto64 );
1477            UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
1478            addInstr(env, mk_iMOVsd_RR(src,dst) );
1479            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1480                                            AMD64RMI_Imm(mask), dst));
1481            return dst;
1482         }
1483         case Iop_8Sto16:
1484         case Iop_8Sto64:
1485         case Iop_8Sto32:
1486         case Iop_16Sto32:
1487         case Iop_16Sto64: {
1488            HReg dst     = newVRegI(env);
1489            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1490            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1491                                   || e->Iex.Unop.op==Iop_16Sto64 );
1492            UInt amt     = srcIs16 ? 48 : 56;
1493            addInstr(env, mk_iMOVsd_RR(src,dst) );
1494            addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1495            addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1496            return dst;
1497         }
1498 	 case Iop_Not8:
1499 	 case Iop_Not16:
1500         case Iop_Not32:
1501         case Iop_Not64: {
1502            HReg dst = newVRegI(env);
1503            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1504            addInstr(env, mk_iMOVsd_RR(src,dst) );
1505            addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1506            return dst;
1507         }
1508         case Iop_16HIto8:
1509         case Iop_32HIto16:
1510         case Iop_64HIto32: {
1511            HReg dst  = newVRegI(env);
1512            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1513            Int shift = 0;
1514            switch (e->Iex.Unop.op) {
1515               case Iop_16HIto8:  shift = 8;  break;
1516               case Iop_32HIto16: shift = 16; break;
1517               case Iop_64HIto32: shift = 32; break;
1518               default: vassert(0);
1519            }
1520            addInstr(env, mk_iMOVsd_RR(src,dst) );
1521            addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1522            return dst;
1523         }
1524         case Iop_1Uto64:
1525         case Iop_1Uto32:
1526         case Iop_1Uto8: {
1527            HReg dst           = newVRegI(env);
1528            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1529            addInstr(env, AMD64Instr_Set64(cond,dst));
1530            return dst;
1531         }
1532         case Iop_1Sto8:
1533         case Iop_1Sto16:
1534         case Iop_1Sto32:
1535         case Iop_1Sto64: {
1536            /* could do better than this, but for now ... */
1537            HReg dst           = newVRegI(env);
1538            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1539            addInstr(env, AMD64Instr_Set64(cond,dst));
1540            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1541            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1542            return dst;
1543         }
1544         case Iop_Ctz64: {
1545            /* Count trailing zeroes, implemented by amd64 'bsfq' */
1546            HReg dst = newVRegI(env);
1547            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1548            addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1549            return dst;
1550         }
1551         case Iop_Clz64: {
1552            /* Count leading zeroes.  Do 'bsrq' to establish the index
1553               of the highest set bit, and subtract that value from
1554               63. */
1555            HReg tmp = newVRegI(env);
1556            HReg dst = newVRegI(env);
1557            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1558            addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1559            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1560                                            AMD64RMI_Imm(63), dst));
1561            addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1562                                            AMD64RMI_Reg(tmp), dst));
1563            return dst;
1564         }
1565
1566         case Iop_CmpwNEZ64: {
1567            HReg dst = newVRegI(env);
1568            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1569            addInstr(env, mk_iMOVsd_RR(src,dst));
1570            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1571            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1572                                            AMD64RMI_Reg(src), dst));
1573            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1574            return dst;
1575         }
1576
1577         case Iop_CmpwNEZ32: {
1578            HReg src = newVRegI(env);
1579            HReg dst = newVRegI(env);
1580            HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1581            addInstr(env, mk_iMOVsd_RR(pre,src));
1582            addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1583            addInstr(env, mk_iMOVsd_RR(src,dst));
1584            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1585            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1586                                            AMD64RMI_Reg(src), dst));
1587            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1588            return dst;
1589         }
1590
1591         case Iop_Left8:
1592         case Iop_Left16:
1593         case Iop_Left32:
1594         case Iop_Left64: {
1595            HReg dst = newVRegI(env);
1596            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1597            addInstr(env, mk_iMOVsd_RR(src, dst));
1598            addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1599            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1600            return dst;
1601         }
1602
1603         case Iop_V128to32: {
1604            HReg        dst     = newVRegI(env);
1605            HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
1606            AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1607            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1608            addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1609            return dst;
1610         }
1611
1612         /* V128{HI}to64 */
1613         case Iop_V128HIto64:
1614         case Iop_V128to64: {
1615            HReg dst = newVRegI(env);
1616            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
1617            HReg rsp = hregAMD64_RSP();
1618            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1619            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1620            AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1621            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1622                                             16, vec, m16_rsp));
1623            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1624                                             AMD64RMI_Mem(off_rsp), dst ));
1625            return dst;
1626         }
1627
1628         case Iop_V256to64_0: case Iop_V256to64_1:
1629         case Iop_V256to64_2: case Iop_V256to64_3: {
1630            HReg vHi, vLo, vec;
1631            iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1632            /* Do the first part of the selection by deciding which of
1633               the 128 bit registers do look at, and second part using
1634               the same scheme as for V128{HI}to64 above. */
1635            Int off = 0;
1636            switch (e->Iex.Unop.op) {
1637               case Iop_V256to64_0: vec = vLo; off = -16; break;
1638               case Iop_V256to64_1: vec = vLo; off =  -8; break;
1639               case Iop_V256to64_2: vec = vHi; off = -16; break;
1640               case Iop_V256to64_3: vec = vHi; off =  -8; break;
1641               default: vassert(0);
1642            }
1643            HReg        dst     = newVRegI(env);
1644            HReg        rsp     = hregAMD64_RSP();
1645            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1646            AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1647            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1648                                             16, vec, m16_rsp));
1649            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1650                                             AMD64RMI_Mem(off_rsp), dst ));
1651            return dst;
1652         }
1653
1654         /* ReinterpF64asI64(e) */
1655         /* Given an IEEE754 double, produce an I64 with the same bit
1656            pattern. */
1657         case Iop_ReinterpF64asI64: {
1658            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1659            HReg        dst    = newVRegI(env);
1660            HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
1661            /* paranoia */
1662            set_SSE_rounding_default(env);
1663            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1664            addInstr(env, AMD64Instr_Alu64R(
1665                             Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1666            return dst;
1667         }
1668
1669         /* ReinterpF32asI32(e) */
1670         /* Given an IEEE754 single, produce an I64 with the same bit
1671            pattern in the lower half. */
1672         case Iop_ReinterpF32asI32: {
1673            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1674            HReg        dst    = newVRegI(env);
1675            HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
1676            /* paranoia */
1677            set_SSE_rounding_default(env);
1678            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1679            addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1680            return dst;
1681         }
1682
1683         case Iop_16to8:
1684         case Iop_32to8:
1685         case Iop_64to8:
1686         case Iop_32to16:
1687         case Iop_64to16:
1688         case Iop_64to32:
1689            /* These are no-ops. */
1690            return iselIntExpr_R(env, e->Iex.Unop.arg);
1691
1692         case Iop_GetMSBs8x8: {
1693            /* Note: the following assumes the helper is of
1694               signature
1695                  UInt fn ( ULong ), and is not a regparm fn.
1696            */
1697            HReg dst = newVRegI(env);
1698            HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1699            fn = (HWord)h_generic_calc_GetMSBs8x8;
1700            addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1701            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1702                                           1, mk_RetLoc_simple(RLPri_Int) ));
1703            /* MovxLQ is not exactly the right thing here.  We just
1704               need to get the bottom 8 bits of RAX into dst, and zero
1705               out everything else.  Assuming that the helper returns
1706               a UInt with the top 24 bits zeroed out, it'll do,
1707               though. */
1708            addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1709            return dst;
1710         }
1711
1712         case Iop_GetMSBs8x16: {
1713            /* Note: the following assumes the helper is of signature
1714                  UInt fn ( ULong w64hi, ULong w64Lo ),
1715               and is not a regparm fn. */
1716            HReg dst = newVRegI(env);
1717            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1718            HReg rsp = hregAMD64_RSP();
1719            fn = (HWord)h_generic_calc_GetMSBs8x16;
1720            AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
1721            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1722            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1723                                             16, vec, m16_rsp));
1724            /* hi 64 bits into RDI -- the first arg */
1725            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1726                                             AMD64RMI_Mem(m8_rsp),
1727                                             hregAMD64_RDI() )); /* 1st arg */
1728            /* lo 64 bits into RSI -- the 2nd arg */
1729            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1730                                             AMD64RMI_Mem(m16_rsp),
1731                                             hregAMD64_RSI() )); /* 2nd arg */
1732            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1733                                           2, mk_RetLoc_simple(RLPri_Int) ));
1734            /* MovxLQ is not exactly the right thing here.  We just
1735               need to get the bottom 16 bits of RAX into dst, and zero
1736               out everything else.  Assuming that the helper returns
1737               a UInt with the top 16 bits zeroed out, it'll do,
1738               though. */
1739            addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1740            return dst;
1741         }
1742
1743         default:
1744            break;
1745      }
1746
1747      /* Deal with unary 64-bit SIMD ops. */
1748      switch (e->Iex.Unop.op) {
1749         case Iop_CmpNEZ32x2:
1750            fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1751         case Iop_CmpNEZ16x4:
1752            fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1753         case Iop_CmpNEZ8x8:
1754            fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1755         default:
1756            fn = (HWord)0; break;
1757      }
1758      if (fn != (HWord)0) {
1759         /* Note: the following assumes all helpers are of
1760            signature
1761               ULong fn ( ULong ), and they are
1762            not marked as regparm functions.
1763         */
1764         HReg dst = newVRegI(env);
1765         HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1766         addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1767         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1768                                        mk_RetLoc_simple(RLPri_Int) ));
1769         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1770         return dst;
1771      }
1772
1773      break;
1774   }
1775
1776   /* --------- GET --------- */
1777   case Iex_Get: {
1778      if (ty == Ity_I64) {
1779         HReg dst = newVRegI(env);
1780         addInstr(env, AMD64Instr_Alu64R(
1781                          Aalu_MOV,
1782                          AMD64RMI_Mem(
1783                             AMD64AMode_IR(e->Iex.Get.offset,
1784                                           hregAMD64_RBP())),
1785                          dst));
1786         return dst;
1787      }
1788      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1789         HReg dst = newVRegI(env);
1790         addInstr(env, AMD64Instr_LoadEX(
1791                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1792                          False,
1793                          AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1794                          dst));
1795         return dst;
1796      }
1797      break;
1798   }
1799
1800   case Iex_GetI: {
1801      AMD64AMode* am
1802         = genGuestArrayOffset(
1803              env, e->Iex.GetI.descr,
1804                   e->Iex.GetI.ix, e->Iex.GetI.bias );
1805      HReg dst = newVRegI(env);
1806      if (ty == Ity_I8) {
1807         addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1808         return dst;
1809      }
1810      if (ty == Ity_I64) {
1811         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1812         return dst;
1813      }
1814      break;
1815   }
1816
1817   /* --------- CCALL --------- */
1818   case Iex_CCall: {
1819      HReg    dst = newVRegI(env);
1820      vassert(ty == e->Iex.CCall.retty);
1821
1822      /* be very restrictive for now.  Only 64-bit ints allowed for
1823         args, and 64 or 32 bits for return type. */
1824      if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1825         goto irreducible;
1826
1827      /* Marshal args, do the call. */
1828      UInt   addToSp = 0;
1829      RetLoc rloc    = mk_RetLoc_INVALID();
1830      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1831                    e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1832      vassert(is_sane_RetLoc(rloc));
1833      vassert(rloc.pri == RLPri_Int);
1834      vassert(addToSp == 0);
1835
1836      /* Move to dst, and zero out the top 32 bits if the result type is
1837         Ity_I32.  Probably overkill, but still .. */
1838      if (e->Iex.CCall.retty == Ity_I64)
1839         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1840      else
1841         addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1842
1843      return dst;
1844   }
1845
1846   /* --------- LITERAL --------- */
1847   /* 64/32/16/8-bit literals */
1848   case Iex_Const:
1849      if (ty == Ity_I64) {
1850         HReg r = newVRegI(env);
1851         addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1852         return r;
1853      } else {
1854         AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1855         HReg      r   = newVRegI(env);
1856         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1857         return r;
1858      }
1859
1860   /* --------- MULTIPLEX --------- */
1861   case Iex_ITE: { // VFD
1862      if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1863          && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1864         HReg     r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1865         AMD64RM* r0  = iselIntExpr_RM(env, e->Iex.ITE.iffalse);
1866         HReg     dst = newVRegI(env);
1867         addInstr(env, mk_iMOVsd_RR(r1,dst));
1868         AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1869         addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1870         return dst;
1871      }
1872      break;
1873   }
1874
1875   /* --------- TERNARY OP --------- */
1876   case Iex_Triop: {
1877      IRTriop *triop = e->Iex.Triop.details;
1878      /* C3210 flags following FPU partial remainder (fprem), both
1879         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1880      if (triop->op == Iop_PRemC3210F64
1881          || triop->op == Iop_PRem1C3210F64) {
1882         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1883         HReg        arg1   = iselDblExpr(env, triop->arg2);
1884         HReg        arg2   = iselDblExpr(env, triop->arg3);
1885         HReg        dst    = newVRegI(env);
1886         addInstr(env, AMD64Instr_A87Free(2));
1887
1888         /* one arg -> top of x87 stack */
1889         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1890         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1891
1892         /* other arg -> top of x87 stack */
1893         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1894         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1895
1896         switch (triop->op) {
1897            case Iop_PRemC3210F64:
1898               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1899               break;
1900            case Iop_PRem1C3210F64:
1901               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1902               break;
1903            default:
1904               vassert(0);
1905         }
1906         /* Ignore the result, and instead make off with the FPU's
1907	    C3210 flags (in the status word). */
1908         addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1909         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1910         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1911         return dst;
1912      }
1913      break;
1914   }
1915
1916   default:
1917   break;
1918   } /* switch (e->tag) */
1919
1920   /* We get here if no pattern matched. */
1921  irreducible:
1922   ppIRExpr(e);
1923   vpanic("iselIntExpr_R(amd64): cannot reduce tree");
1924}
1925
1926
1927/*---------------------------------------------------------*/
1928/*--- ISEL: Integer expression auxiliaries              ---*/
1929/*---------------------------------------------------------*/
1930
1931/* --------------------- AMODEs --------------------- */
1932
1933/* Return an AMode which computes the value of the specified
1934   expression, possibly also adding insns to the code list as a
1935   result.  The expression may only be a 32-bit one.
1936*/
1937
1938static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
1939{
1940   AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
1941   vassert(sane_AMode(am));
1942   return am;
1943}
1944
1945/* DO NOT CALL THIS DIRECTLY ! */
1946static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
1947{
1948   MatchInfo mi;
1949   DECLARE_PATTERN(p_complex);
1950   IRType ty = typeOfIRExpr(env->type_env,e);
1951   vassert(ty == Ity_I64);
1952
1953   /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
1954   /*              bind0        bind1  bind2   bind3   */
1955   DEFINE_PATTERN(p_complex,
1956      binop( Iop_Add64,
1957             binop( Iop_Add64,
1958                    bind(0),
1959                    binop(Iop_Shl64, bind(1), bind(2))
1960                  ),
1961             bind(3)
1962           )
1963   );
1964   if (matchIRExpr(&mi, p_complex, e)) {
1965      IRExpr* expr1  = mi.bindee[0];
1966      IRExpr* expr2  = mi.bindee[1];
1967      IRExpr* imm8   = mi.bindee[2];
1968      IRExpr* simm32 = mi.bindee[3];
1969      if (imm8->tag == Iex_Const
1970          && imm8->Iex.Const.con->tag == Ico_U8
1971          && imm8->Iex.Const.con->Ico.U8 < 4
1972          /* imm8 is OK, now check simm32 */
1973          && simm32->tag == Iex_Const
1974          && simm32->Iex.Const.con->tag == Ico_U64
1975          && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
1976         UInt shift = imm8->Iex.Const.con->Ico.U8;
1977         UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
1978         HReg r1 = iselIntExpr_R(env, expr1);
1979         HReg r2 = iselIntExpr_R(env, expr2);
1980         vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
1981         return AMD64AMode_IRRS(offset, r1, r2, shift);
1982      }
1983   }
1984
1985   /* Add64(expr1, Shl64(expr2, imm)) */
1986   if (e->tag == Iex_Binop
1987       && e->Iex.Binop.op == Iop_Add64
1988       && e->Iex.Binop.arg2->tag == Iex_Binop
1989       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
1990       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1991       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1992      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1993      if (shift == 1 || shift == 2 || shift == 3) {
1994         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1995         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1996         return AMD64AMode_IRRS(0, r1, r2, shift);
1997      }
1998   }
1999
2000   /* Add64(expr,i) */
2001   if (e->tag == Iex_Binop
2002       && e->Iex.Binop.op == Iop_Add64
2003       && e->Iex.Binop.arg2->tag == Iex_Const
2004       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2005       && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2006      HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2007      return AMD64AMode_IR(
2008                toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2009                r1
2010             );
2011   }
2012
2013   /* Doesn't match anything in particular.  Generate it into
2014      a register and use that. */
2015   {
2016      HReg r1 = iselIntExpr_R(env, e);
2017      return AMD64AMode_IR(0, r1);
2018   }
2019}
2020
2021
2022/* --------------------- RMIs --------------------- */
2023
2024/* Similarly, calculate an expression into an X86RMI operand.  As with
2025   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
2026
2027static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
2028{
2029   AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2030   /* sanity checks ... */
2031   switch (rmi->tag) {
2032      case Armi_Imm:
2033         return rmi;
2034      case Armi_Reg:
2035         vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2036         vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2037         return rmi;
2038      case Armi_Mem:
2039         vassert(sane_AMode(rmi->Armi.Mem.am));
2040         return rmi;
2041      default:
2042         vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2043   }
2044}
2045
2046/* DO NOT CALL THIS DIRECTLY ! */
2047static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
2048{
2049   IRType ty = typeOfIRExpr(env->type_env,e);
2050   vassert(ty == Ity_I64 || ty == Ity_I32
2051           || ty == Ity_I16 || ty == Ity_I8);
2052
2053   /* special case: immediate 64/32/16/8 */
2054   if (e->tag == Iex_Const) {
2055      switch (e->Iex.Const.con->tag) {
2056        case Ico_U64:
2057           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2058              return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2059           }
2060           break;
2061         case Ico_U32:
2062            return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2063         case Ico_U16:
2064            return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2065         case Ico_U8:
2066            return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2067         default:
2068            vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2069      }
2070   }
2071
2072   /* special case: 64-bit GET */
2073   if (e->tag == Iex_Get && ty == Ity_I64) {
2074      return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2075                                        hregAMD64_RBP()));
2076   }
2077
2078   /* special case: 64-bit load from memory */
2079   if (e->tag == Iex_Load && ty == Ity_I64
2080       && e->Iex.Load.end == Iend_LE) {
2081      AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2082      return AMD64RMI_Mem(am);
2083   }
2084
2085   /* default case: calculate into a register and return that */
2086   {
2087      HReg r = iselIntExpr_R ( env, e );
2088      return AMD64RMI_Reg(r);
2089   }
2090}
2091
2092
2093/* --------------------- RIs --------------------- */
2094
2095/* Calculate an expression into an AMD64RI operand.  As with
2096   iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2097   bits. */
2098
2099static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
2100{
2101   AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2102   /* sanity checks ... */
2103   switch (ri->tag) {
2104      case Ari_Imm:
2105         return ri;
2106      case Ari_Reg:
2107         vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2108         vassert(hregIsVirtual(ri->Ari.Reg.reg));
2109         return ri;
2110      default:
2111         vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2112   }
2113}
2114
2115/* DO NOT CALL THIS DIRECTLY ! */
2116static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
2117{
2118   IRType ty = typeOfIRExpr(env->type_env,e);
2119   vassert(ty == Ity_I64 || ty == Ity_I32
2120           || ty == Ity_I16 || ty == Ity_I8);
2121
2122   /* special case: immediate */
2123   if (e->tag == Iex_Const) {
2124      switch (e->Iex.Const.con->tag) {
2125        case Ico_U64:
2126           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2127              return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2128           }
2129           break;
2130         case Ico_U32:
2131            return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2132         case Ico_U16:
2133            return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2134         case Ico_U8:
2135            return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2136         default:
2137            vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2138      }
2139   }
2140
2141   /* default case: calculate into a register and return that */
2142   {
2143      HReg r = iselIntExpr_R ( env, e );
2144      return AMD64RI_Reg(r);
2145   }
2146}
2147
2148
2149/* --------------------- RMs --------------------- */
2150
2151/* Similarly, calculate an expression into an AMD64RM operand.  As
2152   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2153   bits.  */
2154
2155static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
2156{
2157   AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2158   /* sanity checks ... */
2159   switch (rm->tag) {
2160      case Arm_Reg:
2161         vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2162         vassert(hregIsVirtual(rm->Arm.Reg.reg));
2163         return rm;
2164      case Arm_Mem:
2165         vassert(sane_AMode(rm->Arm.Mem.am));
2166         return rm;
2167      default:
2168         vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2169   }
2170}
2171
2172/* DO NOT CALL THIS DIRECTLY ! */
2173static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
2174{
2175   IRType ty = typeOfIRExpr(env->type_env,e);
2176   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2177
2178   /* special case: 64-bit GET */
2179   if (e->tag == Iex_Get && ty == Ity_I64) {
2180      return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2181                                       hregAMD64_RBP()));
2182   }
2183
2184   /* special case: load from memory */
2185
2186   /* default case: calculate into a register and return that */
2187   {
2188      HReg r = iselIntExpr_R ( env, e );
2189      return AMD64RM_Reg(r);
2190   }
2191}
2192
2193
2194/* --------------------- CONDCODE --------------------- */
2195
2196/* Generate code to evaluated a bit-typed expression, returning the
2197   condition code which would correspond when the expression would
2198   notionally have returned 1. */
2199
2200static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
2201{
2202   /* Uh, there's nothing we can sanity check here, unfortunately. */
2203   return iselCondCode_wrk(env,e);
2204}
2205
2206/* DO NOT CALL THIS DIRECTLY ! */
2207static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
2208{
2209   MatchInfo mi;
2210
2211   vassert(e);
2212   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2213
2214   /* var */
2215   if (e->tag == Iex_RdTmp) {
2216      HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2217      HReg dst = newVRegI(env);
2218      addInstr(env, mk_iMOVsd_RR(r64,dst));
2219      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2220      return Acc_NZ;
2221   }
2222
2223   /* Constant 1:Bit */
2224   if (e->tag == Iex_Const) {
2225      HReg r;
2226      vassert(e->Iex.Const.con->tag == Ico_U1);
2227      vassert(e->Iex.Const.con->Ico.U1 == True
2228              || e->Iex.Const.con->Ico.U1 == False);
2229      r = newVRegI(env);
2230      addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2231      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2232      return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2233   }
2234
2235   /* Not1(...) */
2236   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2237      /* Generate code for the arg, and negate the test condition */
2238      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2239   }
2240
2241   /* --- patterns rooted at: 64to1 --- */
2242
2243   /* 64to1 */
2244   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2245      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2246      addInstr(env, AMD64Instr_Test64(1,reg));
2247      return Acc_NZ;
2248   }
2249
2250   /* --- patterns rooted at: 32to1 --- */
2251
2252   /* 32to1 */
2253   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2254      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2255      addInstr(env, AMD64Instr_Test64(1,reg));
2256      return Acc_NZ;
2257   }
2258
2259   /* --- patterns rooted at: CmpNEZ8 --- */
2260
2261   /* CmpNEZ8(x) */
2262   if (e->tag == Iex_Unop
2263       && e->Iex.Unop.op == Iop_CmpNEZ8) {
2264      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2265      addInstr(env, AMD64Instr_Test64(0xFF,r));
2266      return Acc_NZ;
2267   }
2268
2269   /* --- patterns rooted at: CmpNEZ16 --- */
2270
2271   /* CmpNEZ16(x) */
2272   if (e->tag == Iex_Unop
2273       && e->Iex.Unop.op == Iop_CmpNEZ16) {
2274      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2275      addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2276      return Acc_NZ;
2277   }
2278
2279   /* --- patterns rooted at: CmpNEZ32 --- */
2280
2281   /* CmpNEZ32(x) */
2282   if (e->tag == Iex_Unop
2283       && e->Iex.Unop.op == Iop_CmpNEZ32) {
2284      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2285      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2286      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2287      return Acc_NZ;
2288   }
2289
2290   /* --- patterns rooted at: CmpNEZ64 --- */
2291
2292   /* CmpNEZ64(Or64(x,y)) */
2293   {
2294      DECLARE_PATTERN(p_CmpNEZ64_Or64);
2295      DEFINE_PATTERN(p_CmpNEZ64_Or64,
2296                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
2297      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
2298         HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
2299         AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
2300         HReg      tmp  = newVRegI(env);
2301         addInstr(env, mk_iMOVsd_RR(r0, tmp));
2302         addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
2303         return Acc_NZ;
2304      }
2305   }
2306
2307   /* CmpNEZ64(x) */
2308   if (e->tag == Iex_Unop
2309       && e->Iex.Unop.op == Iop_CmpNEZ64) {
2310      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2311      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2312      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2313      return Acc_NZ;
2314   }
2315
2316   /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2317
2318   /* CmpEQ8 / CmpNE8 */
2319   if (e->tag == Iex_Binop
2320       && (e->Iex.Binop.op == Iop_CmpEQ8
2321           || e->Iex.Binop.op == Iop_CmpNE8
2322           || e->Iex.Binop.op == Iop_CasCmpEQ8
2323           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2324      if (isZeroU8(e->Iex.Binop.arg2)) {
2325         HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2326         addInstr(env, AMD64Instr_Test64(0xFF,r1));
2327         switch (e->Iex.Binop.op) {
2328            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2329            case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2330            default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2331         }
2332      } else {
2333         HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2334         AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2335         HReg      r    = newVRegI(env);
2336         addInstr(env, mk_iMOVsd_RR(r1,r));
2337         addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2338         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2339         switch (e->Iex.Binop.op) {
2340            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2341            case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2342            default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2343         }
2344      }
2345   }
2346
2347   /* CmpEQ16 / CmpNE16 */
2348   if (e->tag == Iex_Binop
2349       && (e->Iex.Binop.op == Iop_CmpEQ16
2350           || e->Iex.Binop.op == Iop_CmpNE16
2351           || e->Iex.Binop.op == Iop_CasCmpEQ16
2352           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2353      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2354      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2355      HReg      r    = newVRegI(env);
2356      addInstr(env, mk_iMOVsd_RR(r1,r));
2357      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2358      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2359      switch (e->Iex.Binop.op) {
2360         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2361         case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2362         default: vpanic("iselCondCode(amd64): CmpXX16");
2363      }
2364   }
2365
2366   /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2367      Saves a "movq %rax, %tmp" compared to the default route. */
2368   if (e->tag == Iex_Binop
2369       && e->Iex.Binop.op == Iop_CmpNE64
2370       && e->Iex.Binop.arg1->tag == Iex_CCall
2371       && e->Iex.Binop.arg2->tag == Iex_Const) {
2372      IRExpr* cal = e->Iex.Binop.arg1;
2373      IRExpr* con = e->Iex.Binop.arg2;
2374      HReg    tmp = newVRegI(env);
2375      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2376      vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2377      vassert(con->Iex.Const.con->tag == Ico_U64);
2378      /* Marshal args, do the call. */
2379      UInt   addToSp = 0;
2380      RetLoc rloc    = mk_RetLoc_INVALID();
2381      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2382                    cal->Iex.CCall.cee,
2383                    cal->Iex.CCall.retty, cal->Iex.CCall.args );
2384      vassert(is_sane_RetLoc(rloc));
2385      vassert(rloc.pri == RLPri_Int);
2386      vassert(addToSp == 0);
2387      /* */
2388      addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2389      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2390                                      AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2391      return Acc_NZ;
2392   }
2393
2394   /* Cmp*64*(x,y) */
2395   if (e->tag == Iex_Binop
2396       && (e->Iex.Binop.op == Iop_CmpEQ64
2397           || e->Iex.Binop.op == Iop_CmpNE64
2398           || e->Iex.Binop.op == Iop_CmpLT64S
2399           || e->Iex.Binop.op == Iop_CmpLT64U
2400           || e->Iex.Binop.op == Iop_CmpLE64S
2401           || e->Iex.Binop.op == Iop_CmpLE64U
2402           || e->Iex.Binop.op == Iop_CasCmpEQ64
2403           || e->Iex.Binop.op == Iop_CasCmpNE64
2404           || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2405      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2406      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2407      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2408      switch (e->Iex.Binop.op) {
2409         case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2410         case Iop_CmpNE64:
2411         case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2412	 case Iop_CmpLT64S: return Acc_L;
2413	 case Iop_CmpLT64U: return Acc_B;
2414	 case Iop_CmpLE64S: return Acc_LE;
2415         case Iop_CmpLE64U: return Acc_BE;
2416         default: vpanic("iselCondCode(amd64): CmpXX64");
2417      }
2418   }
2419
2420   /* Cmp*32*(x,y) */
2421   if (e->tag == Iex_Binop
2422       && (e->Iex.Binop.op == Iop_CmpEQ32
2423           || e->Iex.Binop.op == Iop_CmpNE32
2424           || e->Iex.Binop.op == Iop_CmpLT32S
2425           || e->Iex.Binop.op == Iop_CmpLT32U
2426           || e->Iex.Binop.op == Iop_CmpLE32S
2427           || e->Iex.Binop.op == Iop_CmpLE32U
2428           || e->Iex.Binop.op == Iop_CasCmpEQ32
2429           || e->Iex.Binop.op == Iop_CasCmpNE32
2430           || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2431      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2432      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2433      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2434      switch (e->Iex.Binop.op) {
2435         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2436         case Iop_CmpNE32:
2437         case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2438	 case Iop_CmpLT32S: return Acc_L;
2439	 case Iop_CmpLT32U: return Acc_B;
2440	 case Iop_CmpLE32S: return Acc_LE;
2441         case Iop_CmpLE32U: return Acc_BE;
2442         default: vpanic("iselCondCode(amd64): CmpXX32");
2443      }
2444   }
2445
2446   ppIRExpr(e);
2447   vpanic("iselCondCode(amd64)");
2448}
2449
2450
2451/*---------------------------------------------------------*/
2452/*--- ISEL: Integer expressions (128 bit)               ---*/
2453/*---------------------------------------------------------*/
2454
2455/* Compute a 128-bit value into a register pair, which is returned as
2456   the first two parameters.  As with iselIntExpr_R, these may be
2457   either real or virtual regs; in any case they must not be changed
2458   by subsequent code emitted by the caller.  */
2459
2460static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2461                             ISelEnv* env, IRExpr* e )
2462{
2463   iselInt128Expr_wrk(rHi, rLo, env, e);
2464#  if 0
2465   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2466#  endif
2467   vassert(hregClass(*rHi) == HRcInt64);
2468   vassert(hregIsVirtual(*rHi));
2469   vassert(hregClass(*rLo) == HRcInt64);
2470   vassert(hregIsVirtual(*rLo));
2471}
2472
2473/* DO NOT CALL THIS DIRECTLY ! */
2474static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2475                                 ISelEnv* env, IRExpr* e )
2476{
2477   vassert(e);
2478   vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2479
2480   /* read 128-bit IRTemp */
2481   if (e->tag == Iex_RdTmp) {
2482      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2483      return;
2484   }
2485
2486   /* --------- BINARY ops --------- */
2487   if (e->tag == Iex_Binop) {
2488      switch (e->Iex.Binop.op) {
2489         /* 64 x 64 -> 128 multiply */
2490         case Iop_MullU64:
2491         case Iop_MullS64: {
2492            /* get one operand into %rax, and the other into a R/M.
2493               Need to make an educated guess about which is better in
2494               which. */
2495            HReg     tLo    = newVRegI(env);
2496            HReg     tHi    = newVRegI(env);
2497            Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
2498            AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2499            HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2500            addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2501            addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2502            /* Result is now in RDX:RAX.  Tell the caller. */
2503            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2504            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2505            *rHi = tHi;
2506            *rLo = tLo;
2507            return;
2508         }
2509
2510         /* 128 x 64 -> (64(rem),64(div)) division */
2511         case Iop_DivModU128to64:
2512         case Iop_DivModS128to64: {
2513            /* Get the 128-bit operand into rdx:rax, and the other into
2514               any old R/M. */
2515            HReg sHi, sLo;
2516            HReg     tLo     = newVRegI(env);
2517            HReg     tHi     = newVRegI(env);
2518            Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2519            AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2520            iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2521            addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2522            addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2523            addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2524            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2525            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2526            *rHi = tHi;
2527            *rLo = tLo;
2528            return;
2529         }
2530
2531         /* 64HLto128(e1,e2) */
2532         case Iop_64HLto128:
2533            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2534            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2535            return;
2536
2537         default:
2538            break;
2539      }
2540   } /* if (e->tag == Iex_Binop) */
2541
2542   ppIRExpr(e);
2543   vpanic("iselInt128Expr");
2544}
2545
2546
2547/*---------------------------------------------------------*/
2548/*--- ISEL: Floating point expressions (32 bit)         ---*/
2549/*---------------------------------------------------------*/
2550
2551/* Nothing interesting here; really just wrappers for
2552   64-bit stuff. */
2553
2554static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
2555{
2556   HReg r = iselFltExpr_wrk( env, e );
2557#  if 0
2558   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2559#  endif
2560   vassert(hregClass(r) == HRcVec128);
2561   vassert(hregIsVirtual(r));
2562   return r;
2563}
2564
2565/* DO NOT CALL THIS DIRECTLY */
2566static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
2567{
2568   IRType ty = typeOfIRExpr(env->type_env,e);
2569   vassert(ty == Ity_F32);
2570
2571   if (e->tag == Iex_RdTmp) {
2572      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2573   }
2574
2575   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2576      AMD64AMode* am;
2577      HReg res = newVRegV(env);
2578      vassert(e->Iex.Load.ty == Ity_F32);
2579      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2580      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2581      return res;
2582   }
2583
2584   if (e->tag == Iex_Binop
2585       && e->Iex.Binop.op == Iop_F64toF32) {
2586      /* Although the result is still held in a standard SSE register,
2587         we need to round it to reflect the loss of accuracy/range
2588         entailed in casting it to a 32-bit float. */
2589      HReg dst = newVRegV(env);
2590      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2591      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2592      addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2593      set_SSE_rounding_default( env );
2594      return dst;
2595   }
2596
2597   if (e->tag == Iex_Get) {
2598      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2599                                       hregAMD64_RBP() );
2600      HReg res = newVRegV(env);
2601      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2602      return res;
2603   }
2604
2605   if (e->tag == Iex_Unop
2606       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2607       /* Given an I32, produce an IEEE754 float with the same bit
2608          pattern. */
2609       HReg        dst    = newVRegV(env);
2610       HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
2611       AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2612       addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2613       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2614       return dst;
2615   }
2616
2617   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2618      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2619      HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
2620      HReg        dst    = newVRegV(env);
2621
2622      /* rf now holds the value to be rounded.  The first thing to do
2623         is set the FPU's rounding mode accordingly. */
2624
2625      /* Set host x87 rounding mode */
2626      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2627
2628      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2629      addInstr(env, AMD64Instr_A87Free(1));
2630      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2631      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2632      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2633      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2634
2635      /* Restore default x87 rounding. */
2636      set_FPU_rounding_default( env );
2637
2638      return dst;
2639   }
2640
2641   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2642      /* Sigh ... very rough code.  Could do much better. */
2643      /* Get the 128-bit literal 00---0 10---0 into a register
2644         and xor it with the value to be negated. */
2645      HReg r1  = newVRegI(env);
2646      HReg dst = newVRegV(env);
2647      HReg tmp = newVRegV(env);
2648      HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2649      AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2650      addInstr(env, mk_vMOVsd_RR(src,tmp));
2651      addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2652      addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2653      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2654      addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2655      addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2656      add_to_rsp(env, 16);
2657      return dst;
2658   }
2659
2660   if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2661      IRQop *qop = e->Iex.Qop.details;
2662      HReg dst  = newVRegV(env);
2663      HReg argX = iselFltExpr(env, qop->arg2);
2664      HReg argY = iselFltExpr(env, qop->arg3);
2665      HReg argZ = iselFltExpr(env, qop->arg4);
2666      /* XXXROUNDINGFIXME */
2667      /* set roundingmode here */
2668      /* subq $16, %rsp         -- make a space*/
2669      sub_from_rsp(env, 16);
2670      /* Prepare 4 arg regs:
2671         leaq 0(%rsp), %rdi
2672         leaq 4(%rsp), %rsi
2673         leaq 8(%rsp), %rdx
2674         leaq 12(%rsp), %rcx
2675      */
2676      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2677                                     hregAMD64_RDI()));
2678      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2679                                     hregAMD64_RSI()));
2680      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2681                                     hregAMD64_RDX()));
2682      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2683                                     hregAMD64_RCX()));
2684      /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2685         movss  %argX, 0(%rsi)
2686         movss  %argY, 0(%rdx)
2687         movss  %argZ, 0(%rcx)
2688         */
2689      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2690                                       AMD64AMode_IR(0, hregAMD64_RSI())));
2691      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2692                                       AMD64AMode_IR(0, hregAMD64_RDX())));
2693      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2694                                       AMD64AMode_IR(0, hregAMD64_RCX())));
2695      /* call the helper */
2696      addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2697                                     (ULong)(HWord)h_generic_calc_MAddF32,
2698                                     4, mk_RetLoc_simple(RLPri_None) ));
2699      /* fetch the result from memory, using %r_argp, which the
2700         register allocator will keep alive across the call. */
2701      addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2702                                       AMD64AMode_IR(0, hregAMD64_RSP())));
2703      /* and finally, clear the space */
2704      add_to_rsp(env, 16);
2705      return dst;
2706   }
2707
2708   ppIRExpr(e);
2709   vpanic("iselFltExpr_wrk");
2710}
2711
2712
2713/*---------------------------------------------------------*/
2714/*--- ISEL: Floating point expressions (64 bit)         ---*/
2715/*---------------------------------------------------------*/
2716
2717/* Compute a 64-bit floating point value into the lower half of an xmm
2718   register, the identity of which is returned.  As with
2719   iselIntExpr_R, the returned reg will be virtual, and it must not be
2720   changed by subsequent code emitted by the caller.
2721*/
2722
2723/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2724
2725    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2726    ----                  ---------   -----------   -----------
2727    signalling NaN        u           2047 (max)    .0uuuuu---u
2728                                                    (with at least
2729                                                     one 1 bit)
2730    quiet NaN             u           2047 (max)    .1uuuuu---u
2731
2732    negative infinity     1           2047 (max)    .000000---0
2733
2734    positive infinity     0           2047 (max)    .000000---0
2735
2736    negative zero         1           0             .000000---0
2737
2738    positive zero         0           0             .000000---0
2739*/
2740
2741static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
2742{
2743   HReg r = iselDblExpr_wrk( env, e );
2744#  if 0
2745   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2746#  endif
2747   vassert(hregClass(r) == HRcVec128);
2748   vassert(hregIsVirtual(r));
2749   return r;
2750}
2751
2752/* DO NOT CALL THIS DIRECTLY */
2753static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
2754{
2755   IRType ty = typeOfIRExpr(env->type_env,e);
2756   vassert(e);
2757   vassert(ty == Ity_F64);
2758
2759   if (e->tag == Iex_RdTmp) {
2760      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2761   }
2762
2763   if (e->tag == Iex_Const) {
2764      union { ULong u64; Double f64; } u;
2765      HReg res = newVRegV(env);
2766      HReg tmp = newVRegI(env);
2767      vassert(sizeof(u) == 8);
2768      vassert(sizeof(u.u64) == 8);
2769      vassert(sizeof(u.f64) == 8);
2770
2771      if (e->Iex.Const.con->tag == Ico_F64) {
2772         u.f64 = e->Iex.Const.con->Ico.F64;
2773      }
2774      else if (e->Iex.Const.con->tag == Ico_F64i) {
2775         u.u64 = e->Iex.Const.con->Ico.F64i;
2776      }
2777      else
2778         vpanic("iselDblExpr(amd64): const");
2779
2780      addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2781      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2782      addInstr(env, AMD64Instr_SseLdSt(
2783                       True/*load*/, 8, res,
2784                       AMD64AMode_IR(0, hregAMD64_RSP())
2785              ));
2786      add_to_rsp(env, 8);
2787      return res;
2788   }
2789
2790   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2791      AMD64AMode* am;
2792      HReg res = newVRegV(env);
2793      vassert(e->Iex.Load.ty == Ity_F64);
2794      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2795      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2796      return res;
2797   }
2798
2799   if (e->tag == Iex_Get) {
2800      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2801                                      hregAMD64_RBP() );
2802      HReg res = newVRegV(env);
2803      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2804      return res;
2805   }
2806
2807   if (e->tag == Iex_GetI) {
2808      AMD64AMode* am
2809         = genGuestArrayOffset(
2810              env, e->Iex.GetI.descr,
2811                   e->Iex.GetI.ix, e->Iex.GetI.bias );
2812      HReg res = newVRegV(env);
2813      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2814      return res;
2815   }
2816
2817   if (e->tag == Iex_Triop) {
2818      IRTriop *triop = e->Iex.Triop.details;
2819      AMD64SseOp op = Asse_INVALID;
2820      switch (triop->op) {
2821         case Iop_AddF64: op = Asse_ADDF; break;
2822         case Iop_SubF64: op = Asse_SUBF; break;
2823         case Iop_MulF64: op = Asse_MULF; break;
2824         case Iop_DivF64: op = Asse_DIVF; break;
2825         default: break;
2826      }
2827      if (op != Asse_INVALID) {
2828         HReg dst  = newVRegV(env);
2829         HReg argL = iselDblExpr(env, triop->arg2);
2830         HReg argR = iselDblExpr(env, triop->arg3);
2831         addInstr(env, mk_vMOVsd_RR(argL, dst));
2832         /* XXXROUNDINGFIXME */
2833         /* set roundingmode here */
2834         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2835         return dst;
2836      }
2837   }
2838
2839   if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2840      IRQop *qop = e->Iex.Qop.details;
2841      HReg dst  = newVRegV(env);
2842      HReg argX = iselDblExpr(env, qop->arg2);
2843      HReg argY = iselDblExpr(env, qop->arg3);
2844      HReg argZ = iselDblExpr(env, qop->arg4);
2845      /* XXXROUNDINGFIXME */
2846      /* set roundingmode here */
2847      /* subq $32, %rsp         -- make a space*/
2848      sub_from_rsp(env, 32);
2849      /* Prepare 4 arg regs:
2850         leaq 0(%rsp), %rdi
2851         leaq 8(%rsp), %rsi
2852         leaq 16(%rsp), %rdx
2853         leaq 24(%rsp), %rcx
2854      */
2855      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2856                                     hregAMD64_RDI()));
2857      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2858                                     hregAMD64_RSI()));
2859      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2860                                     hregAMD64_RDX()));
2861      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2862                                     hregAMD64_RCX()));
2863      /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2864         movsd  %argX, 0(%rsi)
2865         movsd  %argY, 0(%rdx)
2866         movsd  %argZ, 0(%rcx)
2867         */
2868      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2869                                       AMD64AMode_IR(0, hregAMD64_RSI())));
2870      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2871                                       AMD64AMode_IR(0, hregAMD64_RDX())));
2872      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2873                                       AMD64AMode_IR(0, hregAMD64_RCX())));
2874      /* call the helper */
2875      addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2876                                     (ULong)(HWord)h_generic_calc_MAddF64,
2877                                     4, mk_RetLoc_simple(RLPri_None) ));
2878      /* fetch the result from memory, using %r_argp, which the
2879         register allocator will keep alive across the call. */
2880      addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2881                                       AMD64AMode_IR(0, hregAMD64_RSP())));
2882      /* and finally, clear the space */
2883      add_to_rsp(env, 32);
2884      return dst;
2885   }
2886
2887   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2888      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2889      HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
2890      HReg        dst    = newVRegV(env);
2891
2892      /* rf now holds the value to be rounded.  The first thing to do
2893         is set the FPU's rounding mode accordingly. */
2894
2895      /* Set host x87 rounding mode */
2896      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2897
2898      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2899      addInstr(env, AMD64Instr_A87Free(1));
2900      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2901      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2902      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2903      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2904
2905      /* Restore default x87 rounding. */
2906      set_FPU_rounding_default( env );
2907
2908      return dst;
2909   }
2910
2911   IRTriop *triop = e->Iex.Triop.details;
2912   if (e->tag == Iex_Triop
2913       && (triop->op == Iop_ScaleF64
2914           || triop->op == Iop_AtanF64
2915           || triop->op == Iop_Yl2xF64
2916           || triop->op == Iop_Yl2xp1F64
2917           || triop->op == Iop_PRemF64
2918           || triop->op == Iop_PRem1F64)
2919      ) {
2920      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2921      HReg        arg1   = iselDblExpr(env, triop->arg2);
2922      HReg        arg2   = iselDblExpr(env, triop->arg3);
2923      HReg        dst    = newVRegV(env);
2924      Bool     arg2first = toBool(triop->op == Iop_ScaleF64
2925                                  || triop->op == Iop_PRemF64
2926                                  || triop->op == Iop_PRem1F64);
2927      addInstr(env, AMD64Instr_A87Free(2));
2928
2929      /* one arg -> top of x87 stack */
2930      addInstr(env, AMD64Instr_SseLdSt(
2931                       False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
2932      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2933
2934      /* other arg -> top of x87 stack */
2935      addInstr(env, AMD64Instr_SseLdSt(
2936                       False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
2937      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2938
2939      /* do it */
2940      /* XXXROUNDINGFIXME */
2941      /* set roundingmode here */
2942      switch (triop->op) {
2943         case Iop_ScaleF64:
2944            addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
2945            break;
2946         case Iop_AtanF64:
2947            addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
2948            break;
2949         case Iop_Yl2xF64:
2950            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
2951            break;
2952         case Iop_Yl2xp1F64:
2953            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
2954            break;
2955         case Iop_PRemF64:
2956            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
2957            break;
2958         case Iop_PRem1F64:
2959            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
2960            break;
2961         default:
2962            vassert(0);
2963      }
2964
2965      /* save result */
2966      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2967      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2968      return dst;
2969   }
2970
2971   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
2972      HReg dst = newVRegV(env);
2973      HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
2974      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2975      addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
2976      set_SSE_rounding_default( env );
2977      return dst;
2978   }
2979
2980   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
2981      HReg dst = newVRegV(env);
2982      HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2983      set_SSE_rounding_default( env );
2984      addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
2985      return dst;
2986   }
2987
2988   if (e->tag == Iex_Unop
2989       && (e->Iex.Unop.op == Iop_NegF64
2990           || e->Iex.Unop.op == Iop_AbsF64)) {
2991      /* Sigh ... very rough code.  Could do much better. */
2992      /* Get the 128-bit literal 00---0 10---0 into a register
2993         and xor/nand it with the value to be negated. */
2994      HReg r1  = newVRegI(env);
2995      HReg dst = newVRegV(env);
2996      HReg tmp = newVRegV(env);
2997      HReg src = iselDblExpr(env, e->Iex.Unop.arg);
2998      AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2999      addInstr(env, mk_vMOVsd_RR(src,tmp));
3000      addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3001      addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3002      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3003      addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3004
3005      if (e->Iex.Unop.op == Iop_NegF64)
3006         addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3007      else
3008         addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3009
3010      add_to_rsp(env, 16);
3011      return dst;
3012   }
3013
3014   if (e->tag == Iex_Binop) {
3015      A87FpOp fpop = Afp_INVALID;
3016      switch (e->Iex.Binop.op) {
3017         case Iop_SqrtF64: fpop = Afp_SQRT; break;
3018         case Iop_SinF64:  fpop = Afp_SIN;  break;
3019         case Iop_CosF64:  fpop = Afp_COS;  break;
3020         case Iop_TanF64:  fpop = Afp_TAN;  break;
3021         case Iop_2xm1F64: fpop = Afp_2XM1; break;
3022         default: break;
3023      }
3024      if (fpop != Afp_INVALID) {
3025         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3026         HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
3027         HReg        dst    = newVRegV(env);
3028         Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3029         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3030         addInstr(env, AMD64Instr_A87Free(nNeeded));
3031         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3032         /* XXXROUNDINGFIXME */
3033         /* set roundingmode here */
3034         /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3035            codes.  I don't think that matters, since this insn
3036            selector never generates such an instruction intervening
3037            between an flag-setting instruction and a flag-using
3038            instruction. */
3039         addInstr(env, AMD64Instr_A87FpOp(fpop));
3040         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3041         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3042         return dst;
3043      }
3044   }
3045
3046   if (e->tag == Iex_Unop) {
3047      switch (e->Iex.Unop.op) {
3048//..          case Iop_I32toF64: {
3049//..             HReg dst = newVRegF(env);
3050//..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3051//..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3052//..             set_FPU_rounding_default(env);
3053//..             addInstr(env, X86Instr_FpLdStI(
3054//..                              True/*load*/, 4, dst,
3055//..                              X86AMode_IR(0, hregX86_ESP())));
3056//..             add_to_esp(env, 4);
3057//..             return dst;
3058//..          }
3059         case Iop_ReinterpI64asF64: {
3060            /* Given an I64, produce an IEEE754 double with the same
3061               bit pattern. */
3062            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3063            HReg        dst    = newVRegV(env);
3064            AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
3065            /* paranoia */
3066            set_SSE_rounding_default(env);
3067            addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3068            addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3069            return dst;
3070         }
3071         case Iop_F32toF64: {
3072            HReg f32;
3073            HReg f64 = newVRegV(env);
3074            /* this shouldn't be necessary, but be paranoid ... */
3075            set_SSE_rounding_default(env);
3076            f32 = iselFltExpr(env, e->Iex.Unop.arg);
3077            addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3078            return f64;
3079         }
3080         default:
3081            break;
3082      }
3083   }
3084
3085   /* --------- MULTIPLEX --------- */
3086   if (e->tag == Iex_ITE) { // VFD
3087      HReg r1, r0, dst;
3088      vassert(ty == Ity_F64);
3089      vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3090      r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
3091      r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
3092      dst = newVRegV(env);
3093      addInstr(env, mk_vMOVsd_RR(r1,dst));
3094      AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3095      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3096      return dst;
3097   }
3098
3099   ppIRExpr(e);
3100   vpanic("iselDblExpr_wrk");
3101}
3102
3103
3104/*---------------------------------------------------------*/
3105/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3106/*---------------------------------------------------------*/
3107
3108static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
3109{
3110   HReg r = iselVecExpr_wrk( env, e );
3111#  if 0
3112   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3113#  endif
3114   vassert(hregClass(r) == HRcVec128);
3115   vassert(hregIsVirtual(r));
3116   return r;
3117}
3118
3119
3120/* DO NOT CALL THIS DIRECTLY */
3121static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
3122{
3123   HWord      fn = 0; /* address of helper fn, if required */
3124   Bool       arg1isEReg = False;
3125   AMD64SseOp op = Asse_INVALID;
3126   IRType     ty = typeOfIRExpr(env->type_env,e);
3127   vassert(e);
3128   vassert(ty == Ity_V128);
3129
3130   if (e->tag == Iex_RdTmp) {
3131      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3132   }
3133
3134   if (e->tag == Iex_Get) {
3135      HReg dst = newVRegV(env);
3136      addInstr(env, AMD64Instr_SseLdSt(
3137                       True/*load*/,
3138                       16,
3139                       dst,
3140                       AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3141                    )
3142              );
3143      return dst;
3144   }
3145
3146   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3147      HReg        dst = newVRegV(env);
3148      AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3149      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3150      return dst;
3151   }
3152
3153   if (e->tag == Iex_Const) {
3154      HReg dst = newVRegV(env);
3155      vassert(e->Iex.Const.con->tag == Ico_V128);
3156      switch (e->Iex.Const.con->Ico.V128) {
3157         case 0x0000:
3158            dst = generate_zeroes_V128(env);
3159            break;
3160         case 0xFFFF:
3161            dst = generate_ones_V128(env);
3162            break;
3163         default: {
3164            AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3165            /* do push_uimm64 twice, first time for the high-order half. */
3166            push_uimm64(env, bitmask8_to_bytemask64(
3167                                (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3168                       ));
3169            push_uimm64(env, bitmask8_to_bytemask64(
3170                                (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3171                       ));
3172            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3173            add_to_rsp(env, 16);
3174            break;
3175         }
3176      }
3177      return dst;
3178   }
3179
3180   if (e->tag == Iex_Unop) {
3181   switch (e->Iex.Unop.op) {
3182
3183      case Iop_NotV128: {
3184         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3185         return do_sse_NotV128(env, arg);
3186      }
3187
3188      case Iop_CmpNEZ64x2: {
3189         /* We can use SSE2 instructions for this. */
3190         /* Ideally, we want to do a 64Ix2 comparison against zero of
3191            the operand.  Problem is no such insn exists.  Solution
3192            therefore is to do a 32Ix4 comparison instead, and bitwise-
3193            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3194            let the not'd result of this initial comparison be a:b:c:d.
3195            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3196            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3197            giving the required result.
3198
3199            The required selection sequence is 2,3,0,1, which
3200            according to Intel's documentation means the pshufd
3201            literal value is 0xB1, that is,
3202            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3203         */
3204         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3205         HReg tmp  = generate_zeroes_V128(env);
3206         HReg dst  = newVRegV(env);
3207         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3208         tmp = do_sse_NotV128(env, tmp);
3209         addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3210         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3211         return dst;
3212      }
3213
3214      case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3215      case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3216      case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3217      do_CmpNEZ_vector:
3218      {
3219         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3220         HReg tmp  = newVRegV(env);
3221         HReg zero = generate_zeroes_V128(env);
3222         HReg dst;
3223         addInstr(env, mk_vMOVsd_RR(arg, tmp));
3224         addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3225         dst = do_sse_NotV128(env, tmp);
3226         return dst;
3227      }
3228
3229      case Iop_Recip32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
3230      case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3231      case Iop_Sqrt32Fx4:  op = Asse_SQRTF;  goto do_32Fx4_unary;
3232      do_32Fx4_unary:
3233      {
3234         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3235         HReg dst = newVRegV(env);
3236         addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3237         return dst;
3238      }
3239
3240      case Iop_Sqrt64Fx2:  op = Asse_SQRTF;  goto do_64Fx2_unary;
3241      do_64Fx2_unary:
3242      {
3243         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3244         HReg dst = newVRegV(env);
3245         addInstr(env, AMD64Instr_Sse64Fx2(op, arg, dst));
3246         return dst;
3247      }
3248
3249      case Iop_Recip32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
3250      case Iop_RSqrt32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3251      case Iop_Sqrt32F0x4:  op = Asse_SQRTF;  goto do_32F0x4_unary;
3252      do_32F0x4_unary:
3253      {
3254         /* A bit subtle.  We have to copy the arg to the result
3255            register first, because actually doing the SSE scalar insn
3256            leaves the upper 3/4 of the destination register
3257            unchanged.  Whereas the required semantics of these
3258            primops is that the upper 3/4 is simply copied in from the
3259            argument. */
3260         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3261         HReg dst = newVRegV(env);
3262         addInstr(env, mk_vMOVsd_RR(arg, dst));
3263         addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3264         return dst;
3265      }
3266
3267      case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
3268      do_64F0x2_unary:
3269      {
3270         /* A bit subtle.  We have to copy the arg to the result
3271            register first, because actually doing the SSE scalar insn
3272            leaves the upper half of the destination register
3273            unchanged.  Whereas the required semantics of these
3274            primops is that the upper half is simply copied in from the
3275            argument. */
3276         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3277         HReg dst = newVRegV(env);
3278         addInstr(env, mk_vMOVsd_RR(arg, dst));
3279         addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3280         return dst;
3281      }
3282
3283      case Iop_32UtoV128: {
3284         HReg        dst     = newVRegV(env);
3285         AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3286         AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
3287         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3288         addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3289         return dst;
3290      }
3291
3292      case Iop_64UtoV128: {
3293         HReg        dst  = newVRegV(env);
3294         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3295         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3296         addInstr(env, AMD64Instr_Push(rmi));
3297         addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3298         add_to_rsp(env, 8);
3299         return dst;
3300      }
3301
3302      case Iop_V256toV128_0:
3303      case Iop_V256toV128_1: {
3304         HReg vHi, vLo;
3305         iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3306         return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3307      }
3308
3309      default:
3310         break;
3311   } /* switch (e->Iex.Unop.op) */
3312   } /* if (e->tag == Iex_Unop) */
3313
3314   if (e->tag == Iex_Binop) {
3315   switch (e->Iex.Binop.op) {
3316
3317      /* FIXME: could we generate MOVQ here? */
3318      case Iop_SetV128lo64: {
3319         HReg dst  = newVRegV(env);
3320         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3321         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3322         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3323         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3324         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3325         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3326         return dst;
3327      }
3328
3329      /* FIXME: could we generate MOVD here? */
3330      case Iop_SetV128lo32: {
3331         HReg dst  = newVRegV(env);
3332         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3333         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3334         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3335         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3336         addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3337         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3338         return dst;
3339      }
3340
3341      case Iop_64HLtoV128: {
3342         HReg        rsp     = hregAMD64_RSP();
3343         AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
3344         AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
3345         AMD64RI*    qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
3346         AMD64RI*    qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
3347         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
3348         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
3349         HReg        dst = newVRegV(env);
3350         /* One store-forwarding stall coming up, oh well :-( */
3351         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
3352         return dst;
3353      }
3354
3355      case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3356      case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3357      case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3358      case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3359      case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
3360      case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
3361      do_32Fx4:
3362      {
3363         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3364         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3365         HReg dst = newVRegV(env);
3366         addInstr(env, mk_vMOVsd_RR(argL, dst));
3367         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3368         return dst;
3369      }
3370
3371      case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3372      case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3373      case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3374      case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3375      case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
3376      case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
3377      do_64Fx2:
3378      {
3379         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3380         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3381         HReg dst = newVRegV(env);
3382         addInstr(env, mk_vMOVsd_RR(argL, dst));
3383         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3384         return dst;
3385      }
3386
3387      case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3388      case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3389      case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3390      case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3391      case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
3392      case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
3393      case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
3394      case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
3395      case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
3396      case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
3397      do_32F0x4: {
3398         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3399         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3400         HReg dst = newVRegV(env);
3401         addInstr(env, mk_vMOVsd_RR(argL, dst));
3402         addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3403         return dst;
3404      }
3405
3406      case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3407      case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3408      case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3409      case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3410      case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
3411      case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
3412      case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
3413      case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
3414      case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
3415      case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
3416      do_64F0x2: {
3417         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3418         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3419         HReg dst = newVRegV(env);
3420         addInstr(env, mk_vMOVsd_RR(argL, dst));
3421         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3422         return dst;
3423      }
3424
3425      case Iop_QNarrowBin32Sto16Sx8:
3426         op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3427      case Iop_QNarrowBin16Sto8Sx16:
3428         op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3429      case Iop_QNarrowBin16Sto8Ux16:
3430         op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3431
3432      case Iop_InterleaveHI8x16:
3433         op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3434      case Iop_InterleaveHI16x8:
3435         op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3436      case Iop_InterleaveHI32x4:
3437         op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3438      case Iop_InterleaveHI64x2:
3439         op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3440
3441      case Iop_InterleaveLO8x16:
3442         op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3443      case Iop_InterleaveLO16x8:
3444         op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3445      case Iop_InterleaveLO32x4:
3446         op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3447      case Iop_InterleaveLO64x2:
3448         op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3449
3450      case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
3451      case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
3452      case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
3453      case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
3454      case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
3455      case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
3456      case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
3457      case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
3458      case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
3459      case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
3460      case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
3461      case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
3462      case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
3463      case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
3464      case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
3465      case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
3466      case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
3467      case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3468      case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3469      case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
3470      case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
3471      case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
3472      case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
3473      case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3474      case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3475      case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
3476      case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
3477      case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
3478      case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
3479      case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
3480      case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
3481      case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
3482      case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
3483      case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
3484      do_SseReRg: {
3485         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3486         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3487         HReg dst = newVRegV(env);
3488         if (arg1isEReg) {
3489            addInstr(env, mk_vMOVsd_RR(arg2, dst));
3490            addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3491         } else {
3492            addInstr(env, mk_vMOVsd_RR(arg1, dst));
3493            addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3494         }
3495         return dst;
3496      }
3497
3498      case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
3499      case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
3500      case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
3501      case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
3502      case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
3503      case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
3504      case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
3505      case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
3506      do_SseShift: {
3507         HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
3508         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3509         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3510         HReg        ereg = newVRegV(env);
3511         HReg        dst  = newVRegV(env);
3512         addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3513         addInstr(env, AMD64Instr_Push(rmi));
3514         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3515         addInstr(env, mk_vMOVsd_RR(greg, dst));
3516         addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3517         add_to_rsp(env, 16);
3518         return dst;
3519      }
3520
3521      case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
3522                           goto do_SseAssistedBinary;
3523      case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
3524                           goto do_SseAssistedBinary;
3525      case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
3526                           goto do_SseAssistedBinary;
3527      case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
3528                           goto do_SseAssistedBinary;
3529      case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
3530                           goto do_SseAssistedBinary;
3531      case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
3532                           goto do_SseAssistedBinary;
3533      case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
3534                           goto do_SseAssistedBinary;
3535      case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
3536                           goto do_SseAssistedBinary;
3537      case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
3538                           goto do_SseAssistedBinary;
3539      case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
3540                           goto do_SseAssistedBinary;
3541      case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3542                           goto do_SseAssistedBinary;
3543      case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
3544                           goto do_SseAssistedBinary;
3545      case Iop_QNarrowBin32Sto16Ux8:
3546                           fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3547                           goto do_SseAssistedBinary;
3548      case Iop_NarrowBin16to8x16:
3549                           fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3550                           goto do_SseAssistedBinary;
3551      case Iop_NarrowBin32to16x8:
3552                           fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3553                           goto do_SseAssistedBinary;
3554      do_SseAssistedBinary: {
3555         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3556            well. */
3557         vassert(fn != 0);
3558         HReg dst = newVRegV(env);
3559         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3560         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3561         HReg argp = newVRegI(env);
3562         /* subq $112, %rsp         -- make a space*/
3563         sub_from_rsp(env, 112);
3564         /* leaq 48(%rsp), %r_argp  -- point into it */
3565         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3566                                        argp));
3567         /* andq $-16, %r_argp      -- 16-align the pointer */
3568         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3569                                         AMD64RMI_Imm( ~(UInt)15 ),
3570                                         argp));
3571         /* Prepare 3 arg regs:
3572            leaq 0(%r_argp), %rdi
3573            leaq 16(%r_argp), %rsi
3574            leaq 32(%r_argp), %rdx
3575         */
3576         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3577                                        hregAMD64_RDI()));
3578         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3579                                        hregAMD64_RSI()));
3580         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3581                                        hregAMD64_RDX()));
3582         /* Store the two args, at (%rsi) and (%rdx):
3583            movupd  %argL, 0(%rsi)
3584            movupd  %argR, 0(%rdx)
3585         */
3586         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3587                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3588         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3589                                          AMD64AMode_IR(0, hregAMD64_RDX())));
3590         /* call the helper */
3591         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3592                                        3, mk_RetLoc_simple(RLPri_None) ));
3593         /* fetch the result from memory, using %r_argp, which the
3594            register allocator will keep alive across the call. */
3595         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3596                                          AMD64AMode_IR(0, argp)));
3597         /* and finally, clear the space */
3598         add_to_rsp(env, 112);
3599         return dst;
3600      }
3601
3602      case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3603                         goto do_SseAssistedVectorAndScalar;
3604      case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3605                         goto do_SseAssistedVectorAndScalar;
3606      do_SseAssistedVectorAndScalar: {
3607         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3608            well. */
3609         vassert(fn != 0);
3610         HReg dst = newVRegV(env);
3611         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3612         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3613         HReg argp = newVRegI(env);
3614         /* subq $112, %rsp         -- make a space*/
3615         sub_from_rsp(env, 112);
3616         /* leaq 48(%rsp), %r_argp  -- point into it */
3617         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3618                                        argp));
3619         /* andq $-16, %r_argp      -- 16-align the pointer */
3620         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3621                                         AMD64RMI_Imm( ~(UInt)15 ),
3622                                         argp));
3623         /* Prepare 2 vector arg regs:
3624            leaq 0(%r_argp), %rdi
3625            leaq 16(%r_argp), %rsi
3626         */
3627         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3628                                        hregAMD64_RDI()));
3629         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3630                                        hregAMD64_RSI()));
3631         /* Store the vector arg, at (%rsi):
3632            movupd  %argL, 0(%rsi)
3633         */
3634         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3635                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3636         /* And get the scalar value into rdx */
3637         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3638
3639         /* call the helper */
3640         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3641                                        3, mk_RetLoc_simple(RLPri_None) ));
3642         /* fetch the result from memory, using %r_argp, which the
3643            register allocator will keep alive across the call. */
3644         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3645                                          AMD64AMode_IR(0, argp)));
3646         /* and finally, clear the space */
3647         add_to_rsp(env, 112);
3648         return dst;
3649      }
3650
3651      default:
3652         break;
3653   } /* switch (e->Iex.Binop.op) */
3654   } /* if (e->tag == Iex_Binop) */
3655
3656   if (e->tag == Iex_Triop) {
3657   IRTriop *triop = e->Iex.Triop.details;
3658   switch (triop->op) {
3659
3660      case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3661      case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3662      case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3663      case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3664      do_64Fx2_w_rm:
3665      {
3666         HReg argL = iselVecExpr(env, triop->arg2);
3667         HReg argR = iselVecExpr(env, triop->arg3);
3668         HReg dst = newVRegV(env);
3669         addInstr(env, mk_vMOVsd_RR(argL, dst));
3670         /* XXXROUNDINGFIXME */
3671         /* set roundingmode here */
3672         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3673         return dst;
3674      }
3675
3676      case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3677      case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3678      case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3679      case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3680      do_32Fx4_w_rm:
3681      {
3682         HReg argL = iselVecExpr(env, triop->arg2);
3683         HReg argR = iselVecExpr(env, triop->arg3);
3684         HReg dst = newVRegV(env);
3685         addInstr(env, mk_vMOVsd_RR(argL, dst));
3686         /* XXXROUNDINGFIXME */
3687         /* set roundingmode here */
3688         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3689         return dst;
3690      }
3691
3692      default:
3693         break;
3694   } /* switch (triop->op) */
3695   } /* if (e->tag == Iex_Triop) */
3696
3697   if (e->tag == Iex_ITE) { // VFD
3698      HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
3699      HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
3700      HReg dst = newVRegV(env);
3701      addInstr(env, mk_vMOVsd_RR(r1,dst));
3702      AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3703      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3704      return dst;
3705   }
3706
3707   //vec_fail:
3708   vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3709              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3710   ppIRExpr(e);
3711   vpanic("iselVecExpr_wrk");
3712}
3713
3714
3715/*---------------------------------------------------------*/
3716/*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
3717/*---------------------------------------------------------*/
3718
3719static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3720                           ISelEnv* env, IRExpr* e )
3721{
3722   iselDVecExpr_wrk( rHi, rLo, env, e );
3723#  if 0
3724   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3725#  endif
3726   vassert(hregClass(*rHi) == HRcVec128);
3727   vassert(hregClass(*rLo) == HRcVec128);
3728   vassert(hregIsVirtual(*rHi));
3729   vassert(hregIsVirtual(*rLo));
3730}
3731
3732
3733/* DO NOT CALL THIS DIRECTLY */
3734static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3735                               ISelEnv* env, IRExpr* e )
3736{
3737   HWord fn = 0; /* address of helper fn, if required */
3738   vassert(e);
3739   IRType ty = typeOfIRExpr(env->type_env,e);
3740   vassert(ty == Ity_V256);
3741
3742   AMD64SseOp op = Asse_INVALID;
3743
3744   /* read 256-bit IRTemp */
3745   if (e->tag == Iex_RdTmp) {
3746      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3747      return;
3748   }
3749
3750   if (e->tag == Iex_Get) {
3751      HReg        vHi  = newVRegV(env);
3752      HReg        vLo  = newVRegV(env);
3753      HReg        rbp  = hregAMD64_RBP();
3754      AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
3755      AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3756      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3757      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3758      *rHi = vHi;
3759      *rLo = vLo;
3760      return;
3761   }
3762
3763   if (e->tag == Iex_Load) {
3764      HReg        vHi  = newVRegV(env);
3765      HReg        vLo  = newVRegV(env);
3766      HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
3767      AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
3768      AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3769      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3770      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3771      *rHi = vHi;
3772      *rLo = vLo;
3773      return;
3774   }
3775
3776   if (e->tag == Iex_Const) {
3777      vassert(e->Iex.Const.con->tag == Ico_V256);
3778      switch (e->Iex.Const.con->Ico.V256) {
3779         case 0x00000000: {
3780            HReg vHi = generate_zeroes_V128(env);
3781            HReg vLo = newVRegV(env);
3782            addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3783            *rHi = vHi;
3784            *rLo = vLo;
3785            return;
3786         }
3787         default:
3788            break; /* give up.   Until such time as is necessary. */
3789      }
3790   }
3791
3792   if (e->tag == Iex_Unop) {
3793   switch (e->Iex.Unop.op) {
3794
3795      case Iop_NotV256: {
3796         HReg argHi, argLo;
3797         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3798         *rHi = do_sse_NotV128(env, argHi);
3799         *rLo = do_sse_NotV128(env, argLo);
3800         return;
3801      }
3802
3803      case Iop_Recip32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
3804      case Iop_Sqrt32Fx8:  op = Asse_SQRTF;  goto do_32Fx8_unary;
3805      case Iop_RSqrt32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3806      do_32Fx8_unary:
3807      {
3808         HReg argHi, argLo;
3809         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3810         HReg dstHi = newVRegV(env);
3811         HReg dstLo = newVRegV(env);
3812         addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
3813         addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
3814         *rHi = dstHi;
3815         *rLo = dstLo;
3816         return;
3817      }
3818
3819      case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
3820      do_64Fx4_unary:
3821      {
3822         HReg argHi, argLo;
3823         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3824         HReg dstHi = newVRegV(env);
3825         HReg dstLo = newVRegV(env);
3826         addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
3827         addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
3828         *rHi = dstHi;
3829         *rLo = dstLo;
3830         return;
3831      }
3832
3833      case Iop_CmpNEZ64x4: {
3834         /* We can use SSE2 instructions for this. */
3835         /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
3836            (obviously).  See comment on Iop_CmpNEZ64x2 for
3837            explanation of what's going on here. */
3838         HReg argHi, argLo;
3839         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3840         HReg tmpHi  = generate_zeroes_V128(env);
3841         HReg tmpLo  = newVRegV(env);
3842         addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
3843         HReg dstHi  = newVRegV(env);
3844         HReg dstLo  = newVRegV(env);
3845         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
3846         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
3847         tmpHi = do_sse_NotV128(env, tmpHi);
3848         tmpLo = do_sse_NotV128(env, tmpLo);
3849         addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
3850         addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
3851         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
3852         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
3853         *rHi = dstHi;
3854         *rLo = dstLo;
3855         return;
3856      }
3857
3858      case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3859      case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3860      case Iop_CmpNEZ8x32: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3861      do_CmpNEZ_vector:
3862      {
3863         HReg argHi, argLo;
3864         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3865         HReg tmpHi = newVRegV(env);
3866         HReg tmpLo = newVRegV(env);
3867         HReg zero  = generate_zeroes_V128(env);
3868         HReg dstHi, dstLo;
3869         addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
3870         addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
3871         addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
3872         addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
3873         dstHi = do_sse_NotV128(env, tmpHi);
3874         dstLo = do_sse_NotV128(env, tmpLo);
3875         *rHi = dstHi;
3876         *rLo = dstLo;
3877         return;
3878      }
3879
3880      default:
3881         break;
3882   } /* switch (e->Iex.Unop.op) */
3883   } /* if (e->tag == Iex_Unop) */
3884
3885   if (e->tag == Iex_Binop) {
3886   switch (e->Iex.Binop.op) {
3887
3888      case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
3889      case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
3890      do_64Fx4:
3891      {
3892         HReg argLhi, argLlo, argRhi, argRlo;
3893         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3894         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3895         HReg dstHi = newVRegV(env);
3896         HReg dstLo = newVRegV(env);
3897         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3898         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3899         addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
3900         addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
3901         *rHi = dstHi;
3902         *rLo = dstLo;
3903         return;
3904      }
3905
3906      case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
3907      case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
3908      do_32Fx8:
3909      {
3910         HReg argLhi, argLlo, argRhi, argRlo;
3911         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3912         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3913         HReg dstHi = newVRegV(env);
3914         HReg dstLo = newVRegV(env);
3915         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3916         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3917         addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
3918         addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
3919         *rHi = dstHi;
3920         *rLo = dstLo;
3921         return;
3922      }
3923
3924      case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
3925      case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
3926      case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
3927      case Iop_Add8x32:    op = Asse_ADD8;     goto do_SseReRg;
3928      case Iop_Add16x16:   op = Asse_ADD16;    goto do_SseReRg;
3929      case Iop_Add32x8:    op = Asse_ADD32;    goto do_SseReRg;
3930      case Iop_Add64x4:    op = Asse_ADD64;    goto do_SseReRg;
3931      case Iop_QAdd8Sx32:  op = Asse_QADD8S;   goto do_SseReRg;
3932      case Iop_QAdd16Sx16: op = Asse_QADD16S;  goto do_SseReRg;
3933      case Iop_QAdd8Ux32:  op = Asse_QADD8U;   goto do_SseReRg;
3934      case Iop_QAdd16Ux16: op = Asse_QADD16U;  goto do_SseReRg;
3935      case Iop_Avg8Ux32:   op = Asse_AVG8U;    goto do_SseReRg;
3936      case Iop_Avg16Ux16:  op = Asse_AVG16U;   goto do_SseReRg;
3937      case Iop_CmpEQ8x32:  op = Asse_CMPEQ8;   goto do_SseReRg;
3938      case Iop_CmpEQ16x16: op = Asse_CMPEQ16;  goto do_SseReRg;
3939      case Iop_CmpEQ32x8:  op = Asse_CMPEQ32;  goto do_SseReRg;
3940      case Iop_CmpGT8Sx32: op = Asse_CMPGT8S;  goto do_SseReRg;
3941      case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
3942      case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
3943      case Iop_Max16Sx16:  op = Asse_MAX16S;   goto do_SseReRg;
3944      case Iop_Max8Ux32:   op = Asse_MAX8U;    goto do_SseReRg;
3945      case Iop_Min16Sx16:  op = Asse_MIN16S;   goto do_SseReRg;
3946      case Iop_Min8Ux32:   op = Asse_MIN8U;    goto do_SseReRg;
3947      case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
3948      case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
3949      case Iop_Mul16x16:   op = Asse_MUL16;    goto do_SseReRg;
3950      case Iop_Sub8x32:    op = Asse_SUB8;     goto do_SseReRg;
3951      case Iop_Sub16x16:   op = Asse_SUB16;    goto do_SseReRg;
3952      case Iop_Sub32x8:    op = Asse_SUB32;    goto do_SseReRg;
3953      case Iop_Sub64x4:    op = Asse_SUB64;    goto do_SseReRg;
3954      case Iop_QSub8Sx32:  op = Asse_QSUB8S;   goto do_SseReRg;
3955      case Iop_QSub16Sx16: op = Asse_QSUB16S;  goto do_SseReRg;
3956      case Iop_QSub8Ux32:  op = Asse_QSUB8U;   goto do_SseReRg;
3957      case Iop_QSub16Ux16: op = Asse_QSUB16U;  goto do_SseReRg;
3958      do_SseReRg:
3959      {
3960         HReg argLhi, argLlo, argRhi, argRlo;
3961         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3962         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3963         HReg dstHi = newVRegV(env);
3964         HReg dstLo = newVRegV(env);
3965         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3966         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3967         addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
3968         addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
3969         *rHi = dstHi;
3970         *rLo = dstLo;
3971         return;
3972      }
3973
3974      case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
3975      case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
3976      case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
3977      case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
3978      case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
3979      case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
3980      case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
3981      case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
3982      do_SseShift: {
3983         HReg gregHi, gregLo;
3984         iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
3985         AMD64RMI*   rmi   = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3986         AMD64AMode* rsp0  = AMD64AMode_IR(0, hregAMD64_RSP());
3987         HReg        ereg  = newVRegV(env);
3988         HReg        dstHi = newVRegV(env);
3989         HReg        dstLo = newVRegV(env);
3990         addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3991         addInstr(env, AMD64Instr_Push(rmi));
3992         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3993         addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
3994         addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
3995         addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
3996         addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
3997         add_to_rsp(env, 16);
3998         *rHi = dstHi;
3999         *rLo = dstLo;
4000         return;
4001      }
4002
4003      case Iop_V128HLtoV256: {
4004         *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4005         *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4006         return;
4007      }
4008
4009      case Iop_Mul32x8:    fn = (HWord)h_generic_calc_Mul32x4;
4010                           goto do_SseAssistedBinary;
4011      case Iop_Max32Sx8:   fn = (HWord)h_generic_calc_Max32Sx4;
4012                           goto do_SseAssistedBinary;
4013      case Iop_Min32Sx8:   fn = (HWord)h_generic_calc_Min32Sx4;
4014                           goto do_SseAssistedBinary;
4015      case Iop_Max32Ux8:   fn = (HWord)h_generic_calc_Max32Ux4;
4016                           goto do_SseAssistedBinary;
4017      case Iop_Min32Ux8:   fn = (HWord)h_generic_calc_Min32Ux4;
4018                           goto do_SseAssistedBinary;
4019      case Iop_Max16Ux16:  fn = (HWord)h_generic_calc_Max16Ux8;
4020                           goto do_SseAssistedBinary;
4021      case Iop_Min16Ux16:  fn = (HWord)h_generic_calc_Min16Ux8;
4022                           goto do_SseAssistedBinary;
4023      case Iop_Max8Sx32:   fn = (HWord)h_generic_calc_Max8Sx16;
4024                           goto do_SseAssistedBinary;
4025      case Iop_Min8Sx32:   fn = (HWord)h_generic_calc_Min8Sx16;
4026                           goto do_SseAssistedBinary;
4027      case Iop_CmpEQ64x4:  fn = (HWord)h_generic_calc_CmpEQ64x2;
4028                           goto do_SseAssistedBinary;
4029      case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4030                           goto do_SseAssistedBinary;
4031      do_SseAssistedBinary: {
4032         /* RRRufff!  RRRufff code is what we're generating here.  Oh
4033            well. */
4034         vassert(fn != 0);
4035         HReg dstHi = newVRegV(env);
4036         HReg dstLo = newVRegV(env);
4037         HReg argLhi, argLlo, argRhi, argRlo;
4038         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4039         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4040         HReg argp = newVRegI(env);
4041         /* subq $160, %rsp         -- make a space*/
4042         sub_from_rsp(env, 160);
4043         /* leaq 48(%rsp), %r_argp  -- point into it */
4044         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4045                                        argp));
4046         /* andq $-16, %r_argp      -- 16-align the pointer */
4047         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4048                                         AMD64RMI_Imm( ~(UInt)15 ),
4049                                         argp));
4050         /* Prepare 3 arg regs:
4051            leaq 0(%r_argp), %rdi
4052            leaq 16(%r_argp), %rsi
4053            leaq 32(%r_argp), %rdx
4054         */
4055         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4056                                        hregAMD64_RDI()));
4057         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4058                                        hregAMD64_RSI()));
4059         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4060                                        hregAMD64_RDX()));
4061         /* Store the two high args, at (%rsi) and (%rdx):
4062            movupd  %argLhi, 0(%rsi)
4063            movupd  %argRhi, 0(%rdx)
4064         */
4065         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4066                                          AMD64AMode_IR(0, hregAMD64_RSI())));
4067         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4068                                          AMD64AMode_IR(0, hregAMD64_RDX())));
4069         /* Store the two low args, at 48(%rsi) and 48(%rdx):
4070            movupd  %argLlo, 48(%rsi)
4071            movupd  %argRlo, 48(%rdx)
4072         */
4073         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4074                                          AMD64AMode_IR(48, hregAMD64_RSI())));
4075         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4076                                          AMD64AMode_IR(48, hregAMD64_RDX())));
4077         /* call the helper */
4078         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4079                                        mk_RetLoc_simple(RLPri_None) ));
4080         /* Prepare 3 arg regs:
4081            leaq 48(%r_argp), %rdi
4082            leaq 64(%r_argp), %rsi
4083            leaq 80(%r_argp), %rdx
4084         */
4085         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4086                                        hregAMD64_RDI()));
4087         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4088                                        hregAMD64_RSI()));
4089         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4090                                        hregAMD64_RDX()));
4091         /* call the helper */
4092         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4093                                        mk_RetLoc_simple(RLPri_None) ));
4094         /* fetch the result from memory, using %r_argp, which the
4095            register allocator will keep alive across the call. */
4096         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4097                                          AMD64AMode_IR(0, argp)));
4098         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4099                                          AMD64AMode_IR(48, argp)));
4100         /* and finally, clear the space */
4101         add_to_rsp(env, 160);
4102         *rHi = dstHi;
4103         *rLo = dstLo;
4104         return;
4105      }
4106
4107      case Iop_Perm32x8:   fn = (HWord)h_generic_calc_Perm32x8;
4108                           goto do_SseAssistedBinary256;
4109      do_SseAssistedBinary256: {
4110         /* RRRufff!  RRRufff code is what we're generating here.  Oh
4111            well. */
4112         vassert(fn != 0);
4113         HReg dstHi = newVRegV(env);
4114         HReg dstLo = newVRegV(env);
4115         HReg argLhi, argLlo, argRhi, argRlo;
4116         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4117         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4118         HReg argp = newVRegI(env);
4119         /* subq $160, %rsp         -- make a space*/
4120         sub_from_rsp(env, 160);
4121         /* leaq 48(%rsp), %r_argp  -- point into it */
4122         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4123                                        argp));
4124         /* andq $-16, %r_argp      -- 16-align the pointer */
4125         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4126                                         AMD64RMI_Imm( ~(UInt)15 ),
4127                                         argp));
4128         /* Prepare 3 arg regs:
4129            leaq 0(%r_argp), %rdi
4130            leaq 32(%r_argp), %rsi
4131            leaq 64(%r_argp), %rdx
4132         */
4133         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4134                                        hregAMD64_RDI()));
4135         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4136                                        hregAMD64_RSI()));
4137         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4138                                        hregAMD64_RDX()));
4139         /* Store the two args, at (%rsi) and (%rdx):
4140            movupd  %argLlo, 0(%rsi)
4141            movupd  %argLhi, 16(%rsi)
4142            movupd  %argRlo, 0(%rdx)
4143            movupd  %argRhi, 16(%rdx)
4144         */
4145         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4146                                          AMD64AMode_IR(0, hregAMD64_RSI())));
4147         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4148                                          AMD64AMode_IR(16, hregAMD64_RSI())));
4149         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4150                                          AMD64AMode_IR(0, hregAMD64_RDX())));
4151         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4152                                          AMD64AMode_IR(16, hregAMD64_RDX())));
4153         /* call the helper */
4154         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4155                                        mk_RetLoc_simple(RLPri_None) ));
4156         /* fetch the result from memory, using %r_argp, which the
4157            register allocator will keep alive across the call. */
4158         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4159                                          AMD64AMode_IR(0, argp)));
4160         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4161                                          AMD64AMode_IR(16, argp)));
4162         /* and finally, clear the space */
4163         add_to_rsp(env, 160);
4164         *rHi = dstHi;
4165         *rLo = dstLo;
4166         return;
4167      }
4168
4169      default:
4170         break;
4171   } /* switch (e->Iex.Binop.op) */
4172   } /* if (e->tag == Iex_Binop) */
4173
4174   if (e->tag == Iex_Triop) {
4175   IRTriop *triop = e->Iex.Triop.details;
4176   switch (triop->op) {
4177
4178      case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4179      case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4180      case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4181      case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4182      do_64Fx4_w_rm:
4183      {
4184         HReg argLhi, argLlo, argRhi, argRlo;
4185         iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4186         iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4187         HReg dstHi = newVRegV(env);
4188         HReg dstLo = newVRegV(env);
4189         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4190         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4191         /* XXXROUNDINGFIXME */
4192         /* set roundingmode here */
4193         addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4194         addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4195         *rHi = dstHi;
4196         *rLo = dstLo;
4197         return;
4198      }
4199
4200      case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4201      case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4202      case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4203      case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4204      do_32Fx8_w_rm:
4205      {
4206         HReg argLhi, argLlo, argRhi, argRlo;
4207         iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4208         iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4209         HReg dstHi = newVRegV(env);
4210         HReg dstLo = newVRegV(env);
4211         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4212         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4213         /* XXXROUNDINGFIXME */
4214         /* set roundingmode here */
4215         addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4216         addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4217         *rHi = dstHi;
4218         *rLo = dstLo;
4219         return;
4220      }
4221
4222      default:
4223         break;
4224   } /* switch (triop->op) */
4225   } /* if (e->tag == Iex_Triop) */
4226
4227
4228   if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4229      HReg        rsp     = hregAMD64_RSP();
4230      HReg        vHi     = newVRegV(env);
4231      HReg        vLo     = newVRegV(env);
4232      AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
4233      AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
4234      /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4235      /* Get all the args into regs, before messing with the stack. */
4236      AMD64RI* q3  = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
4237      AMD64RI* q2  = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
4238      AMD64RI* q1  = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
4239      AMD64RI* q0  = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
4240      /* less significant lane (Q2) at the lower address (-16(rsp)) */
4241      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
4242      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
4243      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
4244      /* and then the lower half .. */
4245      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
4246      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
4247      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
4248      *rHi = vHi;
4249      *rLo = vLo;
4250      return;
4251   }
4252
4253   if (e->tag == Iex_ITE) {
4254      HReg r1Hi, r1Lo, r0Hi, r0Lo;
4255      iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4256      iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4257      HReg dstHi = newVRegV(env);
4258      HReg dstLo = newVRegV(env);
4259      addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4260      addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4261      AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4262      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4263      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4264      *rHi = dstHi;
4265      *rLo = dstLo;
4266      return;
4267   }
4268
4269   //avx_fail:
4270   vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4271              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4272   ppIRExpr(e);
4273   vpanic("iselDVecExpr_wrk");
4274}
4275
4276
4277/*---------------------------------------------------------*/
4278/*--- ISEL: Statements                                  ---*/
4279/*---------------------------------------------------------*/
4280
4281static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4282{
4283   if (vex_traceflags & VEX_TRACE_VCODE) {
4284      vex_printf("\n-- ");
4285      ppIRStmt(stmt);
4286      vex_printf("\n");
4287   }
4288
4289   switch (stmt->tag) {
4290
4291   /* --------- STORE --------- */
4292   case Ist_Store: {
4293      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4294      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4295      IREndness end   = stmt->Ist.Store.end;
4296
4297      if (tya != Ity_I64 || end != Iend_LE)
4298         goto stmt_fail;
4299
4300      if (tyd == Ity_I64) {
4301         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4302         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4303         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4304         return;
4305      }
4306      if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4307         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4308         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4309         addInstr(env, AMD64Instr_Store(
4310                          toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4311                          r,am));
4312         return;
4313      }
4314      if (tyd == Ity_F64) {
4315         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4316         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4317         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4318         return;
4319      }
4320      if (tyd == Ity_F32) {
4321         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4322         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4323         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4324         return;
4325      }
4326      if (tyd == Ity_V128) {
4327         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4328         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4329         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4330         return;
4331      }
4332      if (tyd == Ity_V256) {
4333         HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
4334         AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
4335         AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4336         HReg vHi, vLo;
4337         iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4338         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4339         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4340         return;
4341      }
4342      break;
4343   }
4344
4345   /* --------- PUT --------- */
4346   case Ist_Put: {
4347      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4348      if (ty == Ity_I64) {
4349         /* We're going to write to memory, so compute the RHS into an
4350            AMD64RI. */
4351         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4352         addInstr(env,
4353                  AMD64Instr_Alu64M(
4354                     Aalu_MOV,
4355                     ri,
4356                     AMD64AMode_IR(stmt->Ist.Put.offset,
4357                                   hregAMD64_RBP())
4358                 ));
4359         return;
4360      }
4361      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4362         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4363         addInstr(env, AMD64Instr_Store(
4364                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4365                          r,
4366                          AMD64AMode_IR(stmt->Ist.Put.offset,
4367                                        hregAMD64_RBP())));
4368         return;
4369      }
4370      if (ty == Ity_F32) {
4371         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4372         AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4373         set_SSE_rounding_default(env); /* paranoia */
4374         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4375         return;
4376      }
4377      if (ty == Ity_F64) {
4378         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4379         AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4380                                         hregAMD64_RBP() );
4381         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4382         return;
4383      }
4384      if (ty == Ity_V128) {
4385         HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
4386         AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
4387                                         hregAMD64_RBP());
4388         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4389         return;
4390      }
4391      if (ty == Ity_V256) {
4392         HReg vHi, vLo;
4393         iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4394         HReg        rbp  = hregAMD64_RBP();
4395         AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
4396         AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4397         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4398         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4399         return;
4400      }
4401      break;
4402   }
4403
4404   /* --------- Indexed PUT --------- */
4405   case Ist_PutI: {
4406      IRPutI *puti = stmt->Ist.PutI.details;
4407
4408      AMD64AMode* am
4409         = genGuestArrayOffset(
4410              env, puti->descr,
4411                   puti->ix, puti->bias );
4412
4413      IRType ty = typeOfIRExpr(env->type_env, puti->data);
4414      if (ty == Ity_F64) {
4415         HReg val = iselDblExpr(env, puti->data);
4416         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4417         return;
4418      }
4419      if (ty == Ity_I8) {
4420         HReg r = iselIntExpr_R(env, puti->data);
4421         addInstr(env, AMD64Instr_Store( 1, r, am ));
4422         return;
4423      }
4424      if (ty == Ity_I64) {
4425         AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4426         addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4427         return;
4428      }
4429      break;
4430   }
4431
4432   /* --------- TMP --------- */
4433   case Ist_WrTmp: {
4434      IRTemp tmp = stmt->Ist.WrTmp.tmp;
4435      IRType ty = typeOfIRTemp(env->type_env, tmp);
4436
4437      /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4438         compute it into an AMode and then use LEA.  This usually
4439         produces fewer instructions, often because (for memcheck
4440         created IR) we get t = address-expression, (t is later used
4441         twice) and so doing this naturally turns address-expression
4442         back into an AMD64 amode. */
4443      if (ty == Ity_I64
4444          && stmt->Ist.WrTmp.data->tag == Iex_Binop
4445          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4446         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4447         HReg dst = lookupIRTemp(env, tmp);
4448         if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4449            /* Hmm, iselIntExpr_AMode wimped out and just computed the
4450               value into a register.  Just emit a normal reg-reg move
4451               so reg-alloc can coalesce it away in the usual way. */
4452            HReg src = am->Aam.IR.reg;
4453            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4454         } else {
4455            addInstr(env, AMD64Instr_Lea64(am,dst));
4456         }
4457         return;
4458      }
4459
4460      if (ty == Ity_I64 || ty == Ity_I32
4461          || ty == Ity_I16 || ty == Ity_I8) {
4462         AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4463         HReg dst = lookupIRTemp(env, tmp);
4464         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4465         return;
4466      }
4467      if (ty == Ity_I128) {
4468         HReg rHi, rLo, dstHi, dstLo;
4469         iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4470         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4471         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4472         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4473         return;
4474      }
4475      if (ty == Ity_I1) {
4476         AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4477         HReg dst = lookupIRTemp(env, tmp);
4478         addInstr(env, AMD64Instr_Set64(cond, dst));
4479         return;
4480      }
4481      if (ty == Ity_F64) {
4482         HReg dst = lookupIRTemp(env, tmp);
4483         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4484         addInstr(env, mk_vMOVsd_RR(src, dst));
4485         return;
4486      }
4487      if (ty == Ity_F32) {
4488         HReg dst = lookupIRTemp(env, tmp);
4489         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4490         addInstr(env, mk_vMOVsd_RR(src, dst));
4491         return;
4492      }
4493      if (ty == Ity_V128) {
4494         HReg dst = lookupIRTemp(env, tmp);
4495         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4496         addInstr(env, mk_vMOVsd_RR(src, dst));
4497         return;
4498      }
4499      if (ty == Ity_V256) {
4500         HReg rHi, rLo, dstHi, dstLo;
4501         iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4502         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4503         addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4504         addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4505         return;
4506      }
4507      break;
4508   }
4509
4510   /* --------- Call to DIRTY helper --------- */
4511   case Ist_Dirty: {
4512      IRDirty* d = stmt->Ist.Dirty.details;
4513
4514      /* Figure out the return type, if any. */
4515      IRType retty = Ity_INVALID;
4516      if (d->tmp != IRTemp_INVALID)
4517         retty = typeOfIRTemp(env->type_env, d->tmp);
4518
4519      /* Throw out any return types we don't know about. */
4520      Bool retty_ok = False;
4521      switch (retty) {
4522         case Ity_INVALID: /* function doesn't return anything */
4523         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4524         case Ity_V128: case Ity_V256:
4525            retty_ok = True; break;
4526         default:
4527            break;
4528      }
4529      if (!retty_ok)
4530         break; /* will go to stmt_fail: */
4531
4532      /* Marshal args, do the call, and set the return value to
4533         0x555..555 if this is a conditional call that returns a value
4534         and the call is skipped. */
4535      UInt   addToSp = 0;
4536      RetLoc rloc    = mk_RetLoc_INVALID();
4537      doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4538      vassert(is_sane_RetLoc(rloc));
4539
4540      /* Now figure out what to do with the returned value, if any. */
4541      switch (retty) {
4542         case Ity_INVALID: {
4543            /* No return value.  Nothing to do. */
4544            vassert(d->tmp == IRTemp_INVALID);
4545            vassert(rloc.pri == RLPri_None);
4546            vassert(addToSp == 0);
4547            return;
4548         }
4549         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4550            /* The returned value is in %rax.  Park it in the register
4551               associated with tmp. */
4552            vassert(rloc.pri == RLPri_Int);
4553            vassert(addToSp == 0);
4554            HReg dst = lookupIRTemp(env, d->tmp);
4555            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4556            return;
4557         }
4558         case Ity_V128: {
4559            /* The returned value is on the stack, and rloc.spOff
4560               tells us where.  Fish it off the stack and then move
4561               the stack pointer upwards to clear it, as directed by
4562               doHelperCall. */
4563            vassert(rloc.pri == RLPri_V128SpRel);
4564            vassert(addToSp >= 16);
4565            HReg        dst = lookupIRTemp(env, d->tmp);
4566            AMD64AMode* am  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4567            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4568            add_to_rsp(env, addToSp);
4569            return;
4570         }
4571         case Ity_V256: {
4572            /* See comments for Ity_V128. */
4573            vassert(rloc.pri == RLPri_V256SpRel);
4574            vassert(addToSp >= 32);
4575            HReg        dstLo, dstHi;
4576            lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4577            AMD64AMode* amLo  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4578            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4579            AMD64AMode* amHi  = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4580            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4581            add_to_rsp(env, addToSp);
4582            return;
4583         }
4584         default:
4585            /*NOTREACHED*/
4586            vassert(0);
4587      }
4588      break;
4589   }
4590
4591   /* --------- MEM FENCE --------- */
4592   case Ist_MBE:
4593      switch (stmt->Ist.MBE.event) {
4594         case Imbe_Fence:
4595            addInstr(env, AMD64Instr_MFence());
4596            return;
4597         default:
4598            break;
4599      }
4600      break;
4601
4602   /* --------- ACAS --------- */
4603   case Ist_CAS:
4604      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4605         /* "normal" singleton CAS */
4606         UChar  sz;
4607         IRCAS* cas = stmt->Ist.CAS.details;
4608         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4609         /* get: cas->expd into %rax, and cas->data into %rbx */
4610         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4611         HReg rData = iselIntExpr_R(env, cas->dataLo);
4612         HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4613         HReg rOld  = lookupIRTemp(env, cas->oldLo);
4614         vassert(cas->expdHi == NULL);
4615         vassert(cas->dataHi == NULL);
4616         addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4617         addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4618         addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4619         switch (ty) {
4620            case Ity_I64: sz = 8; break;
4621            case Ity_I32: sz = 4; break;
4622            case Ity_I16: sz = 2; break;
4623            case Ity_I8:  sz = 1; break;
4624            default: goto unhandled_cas;
4625         }
4626         addInstr(env, AMD64Instr_ACAS(am, sz));
4627         addInstr(env, AMD64Instr_CMov64(
4628                          Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOld));
4629         return;
4630      } else {
4631         /* double CAS */
4632         UChar  sz;
4633         IRCAS* cas = stmt->Ist.CAS.details;
4634         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4635         /* only 32-bit and 64-bit allowed in this case */
4636         /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4637         /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4638         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4639         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4640         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4641         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4642         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4643         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4644         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4645         switch (ty) {
4646            case Ity_I64:
4647               if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4648                  goto unhandled_cas; /* we'd have to generate
4649                                         cmpxchg16b, but the host
4650                                         doesn't support that */
4651               sz = 8;
4652               break;
4653            case Ity_I32:
4654               sz = 4;
4655               break;
4656            default:
4657               goto unhandled_cas;
4658         }
4659         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4660         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4661         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4662         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4663         addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4664         addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4665         addInstr(env, AMD64Instr_DACAS(am, sz));
4666         addInstr(env,
4667                  AMD64Instr_CMov64(
4668                     Acc_NZ, AMD64RM_Reg(hregAMD64_RDX()), rOldHi));
4669         addInstr(env,
4670                  AMD64Instr_CMov64(
4671                     Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOldLo));
4672         return;
4673      }
4674      unhandled_cas:
4675      break;
4676
4677   /* --------- INSTR MARK --------- */
4678   /* Doesn't generate any executable code ... */
4679   case Ist_IMark:
4680       return;
4681
4682   /* --------- ABI HINT --------- */
4683   /* These have no meaning (denotation in the IR) and so we ignore
4684      them ... if any actually made it this far. */
4685   case Ist_AbiHint:
4686       return;
4687
4688   /* --------- NO-OP --------- */
4689   case Ist_NoOp:
4690       return;
4691
4692   /* --------- EXIT --------- */
4693   case Ist_Exit: {
4694      if (stmt->Ist.Exit.dst->tag != Ico_U64)
4695         vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
4696
4697      AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
4698      AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
4699                                          hregAMD64_RBP());
4700
4701      /* Case: boring transfer to known address */
4702      if (stmt->Ist.Exit.jk == Ijk_Boring) {
4703         if (env->chainingAllowed) {
4704            /* .. almost always true .. */
4705            /* Skip the event check at the dst if this is a forwards
4706               edge. */
4707            Bool toFastEP
4708               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
4709            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4710            addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
4711                                             amRIP, cc, toFastEP));
4712         } else {
4713            /* .. very occasionally .. */
4714            /* We can't use chaining, so ask for an assisted transfer,
4715               as that's the only alternative that is allowable. */
4716            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4717            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
4718         }
4719         return;
4720      }
4721
4722      /* Case: assisted transfer to arbitrary address */
4723      switch (stmt->Ist.Exit.jk) {
4724         /* Keep this list in sync with that in iselNext below */
4725         case Ijk_ClientReq:
4726         case Ijk_EmWarn:
4727         case Ijk_NoDecode:
4728         case Ijk_NoRedir:
4729         case Ijk_SigSEGV:
4730         case Ijk_SigTRAP:
4731         case Ijk_Sys_syscall:
4732         case Ijk_InvalICache:
4733         case Ijk_Yield:
4734         {
4735            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4736            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
4737            return;
4738         }
4739         default:
4740            break;
4741      }
4742
4743      /* Do we ever expect to see any other kind? */
4744      goto stmt_fail;
4745   }
4746
4747   default: break;
4748   }
4749  stmt_fail:
4750   ppIRStmt(stmt);
4751   vpanic("iselStmt(amd64)");
4752}
4753
4754
4755/*---------------------------------------------------------*/
4756/*--- ISEL: Basic block terminators (Nexts)             ---*/
4757/*---------------------------------------------------------*/
4758
4759static void iselNext ( ISelEnv* env,
4760                       IRExpr* next, IRJumpKind jk, Int offsIP )
4761{
4762   if (vex_traceflags & VEX_TRACE_VCODE) {
4763      vex_printf( "\n-- PUT(%d) = ", offsIP);
4764      ppIRExpr( next );
4765      vex_printf( "; exit-");
4766      ppIRJumpKind(jk);
4767      vex_printf( "\n");
4768   }
4769
4770   /* Case: boring transfer to known address */
4771   if (next->tag == Iex_Const) {
4772      IRConst* cdst = next->Iex.Const.con;
4773      vassert(cdst->tag == Ico_U64);
4774      if (jk == Ijk_Boring || jk == Ijk_Call) {
4775         /* Boring transfer to known address */
4776         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4777         if (env->chainingAllowed) {
4778            /* .. almost always true .. */
4779            /* Skip the event check at the dst if this is a forwards
4780               edge. */
4781            Bool toFastEP
4782               = ((Addr64)cdst->Ico.U64) > env->max_ga;
4783            if (0) vex_printf("%s", toFastEP ? "X" : ".");
4784            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
4785                                             amRIP, Acc_ALWAYS,
4786                                             toFastEP));
4787         } else {
4788            /* .. very occasionally .. */
4789            /* We can't use chaining, so ask for an indirect transfer,
4790               as that's the cheapest alternative that is
4791               allowable. */
4792            HReg r = iselIntExpr_R(env, next);
4793            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
4794                                               Ijk_Boring));
4795         }
4796         return;
4797      }
4798   }
4799
4800   /* Case: call/return (==boring) transfer to any address */
4801   switch (jk) {
4802      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4803         HReg        r     = iselIntExpr_R(env, next);
4804         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4805         if (env->chainingAllowed) {
4806            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
4807         } else {
4808            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
4809                                               Ijk_Boring));
4810         }
4811         return;
4812      }
4813      default:
4814         break;
4815   }
4816
4817   /* Case: assisted transfer to arbitrary address */
4818   switch (jk) {
4819      /* Keep this list in sync with that for Ist_Exit above */
4820      case Ijk_ClientReq:
4821      case Ijk_EmWarn:
4822      case Ijk_NoDecode:
4823      case Ijk_NoRedir:
4824      case Ijk_SigSEGV:
4825      case Ijk_SigTRAP:
4826      case Ijk_Sys_syscall:
4827      case Ijk_InvalICache:
4828      case Ijk_Yield: {
4829         HReg        r     = iselIntExpr_R(env, next);
4830         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4831         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
4832         return;
4833      }
4834      default:
4835         break;
4836   }
4837
4838   vex_printf( "\n-- PUT(%d) = ", offsIP);
4839   ppIRExpr( next );
4840   vex_printf( "; exit-");
4841   ppIRJumpKind(jk);
4842   vex_printf( "\n");
4843   vassert(0); // are we expecting any other kind?
4844}
4845
4846
4847/*---------------------------------------------------------*/
4848/*--- Insn selector top-level                           ---*/
4849/*---------------------------------------------------------*/
4850
4851/* Translate an entire SB to amd64 code. */
4852
4853HInstrArray* iselSB_AMD64 ( IRSB* bb,
4854                            VexArch      arch_host,
4855                            VexArchInfo* archinfo_host,
4856                            VexAbiInfo*  vbi/*UNUSED*/,
4857                            Int offs_Host_EvC_Counter,
4858                            Int offs_Host_EvC_FailAddr,
4859                            Bool chainingAllowed,
4860                            Bool addProfInc,
4861                            Addr64 max_ga )
4862{
4863   Int        i, j;
4864   HReg       hreg, hregHI;
4865   ISelEnv*   env;
4866   UInt       hwcaps_host = archinfo_host->hwcaps;
4867   AMD64AMode *amCounter, *amFailAddr;
4868
4869   /* sanity ... */
4870   vassert(arch_host == VexArchAMD64);
4871   vassert(0 == (hwcaps_host
4872                 & ~(VEX_HWCAPS_AMD64_SSE3
4873                     | VEX_HWCAPS_AMD64_CX16
4874                     | VEX_HWCAPS_AMD64_LZCNT
4875                     | VEX_HWCAPS_AMD64_AVX
4876                     | VEX_HWCAPS_AMD64_RDTSCP
4877                     | VEX_HWCAPS_AMD64_BMI
4878                     | VEX_HWCAPS_AMD64_AVX2)));
4879
4880   /* Make up an initial environment to use. */
4881   env = LibVEX_Alloc(sizeof(ISelEnv));
4882   env->vreg_ctr = 0;
4883
4884   /* Set up output code array. */
4885   env->code = newHInstrArray();
4886
4887   /* Copy BB's type env. */
4888   env->type_env = bb->tyenv;
4889
4890   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
4891      change as we go along. */
4892   env->n_vregmap = bb->tyenv->types_used;
4893   env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4894   env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
4895
4896   /* and finally ... */
4897   env->chainingAllowed = chainingAllowed;
4898   env->hwcaps          = hwcaps_host;
4899   env->max_ga          = max_ga;
4900
4901   /* For each IR temporary, allocate a suitably-kinded virtual
4902      register. */
4903   j = 0;
4904   for (i = 0; i < env->n_vregmap; i++) {
4905      hregHI = hreg = INVALID_HREG;
4906      switch (bb->tyenv->types[i]) {
4907         case Ity_I1:
4908         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
4909            hreg = mkHReg(j++, HRcInt64, True);
4910            break;
4911         case Ity_I128:
4912            hreg   = mkHReg(j++, HRcInt64, True);
4913            hregHI = mkHReg(j++, HRcInt64, True);
4914            break;
4915         case Ity_F32:
4916         case Ity_F64:
4917         case Ity_V128:
4918            hreg = mkHReg(j++, HRcVec128, True);
4919            break;
4920         case Ity_V256:
4921            hreg   = mkHReg(j++, HRcVec128, True);
4922            hregHI = mkHReg(j++, HRcVec128, True);
4923            break;
4924         default:
4925            ppIRType(bb->tyenv->types[i]);
4926            vpanic("iselBB(amd64): IRTemp type");
4927      }
4928      env->vregmap[i]   = hreg;
4929      env->vregmapHI[i] = hregHI;
4930   }
4931   env->vreg_ctr = j;
4932
4933   /* The very first instruction must be an event check. */
4934   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
4935   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
4936   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
4937
4938   /* Possibly a block counter increment (for profiling).  At this
4939      point we don't know the address of the counter, so just pretend
4940      it is zero.  It will have to be patched later, but before this
4941      translation is used, by a call to LibVEX_patchProfCtr. */
4942   if (addProfInc) {
4943      addInstr(env, AMD64Instr_ProfInc());
4944   }
4945
4946   /* Ok, finally we can iterate over the statements. */
4947   for (i = 0; i < bb->stmts_used; i++)
4948      if (bb->stmts[i])
4949         iselStmt(env, bb->stmts[i]);
4950
4951   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
4952
4953   /* record the number of vregs we used. */
4954   env->code->n_vregs = env->vreg_ctr;
4955   return env->code;
4956}
4957
4958
4959/*---------------------------------------------------------------*/
4960/*--- end                                   host_amd64_isel.c ---*/
4961/*---------------------------------------------------------------*/
4962