1
2/*---------------------------------------------------------------*/
3/*--- begin                                 host_amd64_isel.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2017 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_ir.h"
38#include "libvex.h"
39
40#include "ir_match.h"
41#include "main_util.h"
42#include "main_globals.h"
43#include "host_generic_regs.h"
44#include "host_generic_simd64.h"
45#include "host_generic_simd128.h"
46#include "host_generic_simd256.h"
47#include "host_generic_maddf.h"
48#include "host_amd64_defs.h"
49
50
51/*---------------------------------------------------------*/
52/*--- x87/SSE control word stuff                        ---*/
53/*---------------------------------------------------------*/
54
55/* Vex-generated code expects to run with the FPU set as follows: all
56   exceptions masked, round-to-nearest, precision = 53 bits.  This
57   corresponds to an FPU control word value of 0x027F.
58
59   Similarly the SSE control word (%mxcsr) should be 0x1F80.
60
61   %fpucw and %mxcsr should have these values on entry to
62   Vex-generated code, and those values should be unchanged
63   at exit.
64*/
65
66#define DEFAULT_FPUCW 0x027F
67
68#define DEFAULT_MXCSR 0x1F80
69
70/* debugging only, do not use */
71/* define DEFAULT_FPUCW 0x037F */
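
/* A hedged decomposition of these defaults, for orientation only (bit
   positions per the usual x87 control-word / MXCSR layouts, not taken
   from this file):

      DEFAULT_FPUCW = 0x027F : exception mask bits 5:0 all set,
                               PC (bits 9:8)  = 10b -> 53-bit precision,
                               RC (bits 11:10) = 00b -> round to nearest.

      DEFAULT_MXCSR = 0x1F80 : exception mask bits 12:7 all set,
                               RC (bits 14:13) = 00b -> round to nearest,
                               FTZ and DAZ clear.
*/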
72
73
74/*---------------------------------------------------------*/
75/*--- misc helpers                                      ---*/
76/*---------------------------------------------------------*/
77
78/* These are duplicated in guest-amd64/toIR.c */
79static IRExpr* unop ( IROp op, IRExpr* a )
80{
81   return IRExpr_Unop(op, a);
82}
83
84static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
85{
86   return IRExpr_Binop(op, a1, a2);
87}
88
89static IRExpr* bind ( Int binder )
90{
91   return IRExpr_Binder(binder);
92}
93
94static Bool isZeroU8 ( IRExpr* e )
95{
96   return e->tag == Iex_Const
97          && e->Iex.Const.con->tag == Ico_U8
98          && e->Iex.Const.con->Ico.U8 == 0;
99}
100
101
102/*---------------------------------------------------------*/
103/*--- ISelEnv                                           ---*/
104/*---------------------------------------------------------*/
105
106/* This carries around:
107
108   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
109     might encounter.  This is computed before insn selection starts,
110     and does not change.
111
112   - A mapping from IRTemp to HReg.  This tells the insn selector
113     which virtual register is associated with each IRTemp
114     temporary.  This is computed before insn selection starts, and
115     does not change.  We expect this mapping to map precisely the
116     same set of IRTemps as the type mapping does.
117
118        - vregmap   holds the primary register for the IRTemp.
119        - vregmapHI is only used for 128-bit integer-typed
120             IRTemps.  It holds the identity of a second
121             64-bit virtual HReg, which holds the high half
122             of the value.
123
124   - The host subarchitecture we are selecting insns for.
125     This is set at the start and does not change.
126
127   - The code array, that is, the insns selected so far.
128
129   - A counter, for generating new virtual registers.
130
131   - A Bool for indicating whether we may generate chain-me
132     instructions for control flow transfers, or whether we must use
133     XAssisted.
134
135   - The maximum guest address of any guest insn in this block.
136     Actually, the address of the highest-addressed byte from any insn
137     in this block.  It is set at the start and does not change.  This is
138     used for detecting jumps which are definitely forward-edges from
139     this block, and therefore can be made (chained) to the fast entry
140     point of the destination, thereby avoiding the destination's
141     event check.
142
143   Note, this is all host-independent.  (JRS 20050201: well, kinda
144   ... not completely.  Compare with ISelEnv for X86.)
145*/
146
147typedef
148   struct {
149      /* Constants -- set at the start and do not change. */
150      IRTypeEnv*   type_env;
151
152      HReg*        vregmap;
153      HReg*        vregmapHI;
154      Int          n_vregmap;
155
156      UInt         hwcaps;
157
158      Bool         chainingAllowed;
159      Addr64       max_ga;
160
161      /* These are modified as we go along. */
162      HInstrArray* code;
163      Int          vreg_ctr;
164   }
165   ISelEnv;
166
167
168static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
169{
170   vassert(tmp >= 0);
171   vassert(tmp < env->n_vregmap);
172   return env->vregmap[tmp];
173}
174
175static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
176                               ISelEnv* env, IRTemp tmp )
177{
178   vassert(tmp >= 0);
179   vassert(tmp < env->n_vregmap);
180   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
181   *vrLO = env->vregmap[tmp];
182   *vrHI = env->vregmapHI[tmp];
183}
184
185static void addInstr ( ISelEnv* env, AMD64Instr* instr )
186{
187   addHInstr(env->code, instr);
188   if (vex_traceflags & VEX_TRACE_VCODE) {
189      ppAMD64Instr(instr, True);
190      vex_printf("\n");
191   }
192}
193
194static HReg newVRegI ( ISelEnv* env )
195{
196   HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
197   env->vreg_ctr++;
198   return reg;
199}
200
201static HReg newVRegV ( ISelEnv* env )
202{
203   HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
204   env->vreg_ctr++;
205   return reg;
206}
207
208
209/*---------------------------------------------------------*/
210/*--- ISEL: Forward declarations                        ---*/
211/*---------------------------------------------------------*/
212
213/* These are organised as iselXXX and iselXXX_wrk pairs.  The
214   iselXXX_wrk functions do the real work, but must not be called
215   directly.  For each XXX, iselXXX calls its iselXXX_wrk
216   counterpart and then checks that all returned registers are
217   virtual.
218*/
219static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
220static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );
221
222static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, const IRExpr* e );
223static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, const IRExpr* e );
224
225static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, const IRExpr* e );
226static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, const IRExpr* e );
227
228static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, const IRExpr* e );
229static HReg          iselIntExpr_R       ( ISelEnv* env, const IRExpr* e );
230
231static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
232static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );
233
234static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
235                                          ISelEnv* env, const IRExpr* e );
236static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
237                                          ISelEnv* env, const IRExpr* e );
238
239static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, const IRExpr* e );
240static AMD64CondCode iselCondCode        ( ISelEnv* env, const IRExpr* e );
241
242static HReg          iselDblExpr_wrk     ( ISelEnv* env, const IRExpr* e );
243static HReg          iselDblExpr         ( ISelEnv* env, const IRExpr* e );
244
245static HReg          iselFltExpr_wrk     ( ISelEnv* env, const IRExpr* e );
246static HReg          iselFltExpr         ( ISelEnv* env, const IRExpr* e );
247
248static HReg          iselVecExpr_wrk     ( ISelEnv* env, const IRExpr* e );
249static HReg          iselVecExpr         ( ISelEnv* env, const IRExpr* e );
250
251static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
252                                        ISelEnv* env, const IRExpr* e );
253static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
254                                        ISelEnv* env, const IRExpr* e );
255
256
257/*---------------------------------------------------------*/
258/*--- ISEL: Misc helpers                                ---*/
259/*---------------------------------------------------------*/
260
261static Bool sane_AMode ( AMD64AMode* am )
262{
263   switch (am->tag) {
264      case Aam_IR:
265         return
266            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
267                    && (hregIsVirtual(am->Aam.IR.reg)
268                        || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
269      case Aam_IRRS:
270         return
271            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
272                    && hregIsVirtual(am->Aam.IRRS.base)
273                    && hregClass(am->Aam.IRRS.index) == HRcInt64
274                    && hregIsVirtual(am->Aam.IRRS.index) );
275      default:
276        vpanic("sane_AMode: unknown amd64 amode tag");
277   }
278}
279
280
281/* Can the lower 32 bits be signedly widened to produce the whole
282   64-bit value?  In other words, are the top 33 bits either all 0 or
283   all 1? */
284static Bool fitsIn32Bits ( ULong x )
285{
286   Long y1;
287   y1 = x << 32;
288   y1 >>=/*s*/ 32;
289   return toBool(x == y1);
290}
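
/* Illustrative sanity checks (not part of the original code):

      fitsIn32Bits(0x000000007FFFFFFFULL) == True    -- top 33 bits all 0
      fitsIn32Bits(0xFFFFFFFF80000000ULL) == True    -- top 33 bits all 1
      fitsIn32Bits(0x0000000080000000ULL) == False   -- bit 31 set, bit 32 clear
*/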
291
292/* Is this a 64-bit zero expression? */
293
294static Bool isZeroU64 ( IRExpr* e )
295{
296   return e->tag == Iex_Const
297          && e->Iex.Const.con->tag == Ico_U64
298          && e->Iex.Const.con->Ico.U64 == 0ULL;
299}
300
301static Bool isZeroU32 ( IRExpr* e )
302{
303   return e->tag == Iex_Const
304          && e->Iex.Const.con->tag == Ico_U32
305          && e->Iex.Const.con->Ico.U32 == 0;
306}
307
308/* Make an int reg-reg move. */
309
310static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
311{
312   vassert(hregClass(src) == HRcInt64);
313   vassert(hregClass(dst) == HRcInt64);
314   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
315}
316
317/* Make a vector (128 bit) reg-reg move. */
318
319static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
320{
321   vassert(hregClass(src) == HRcVec128);
322   vassert(hregClass(dst) == HRcVec128);
323   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
324}
325
326/* Advance/retreat %rsp by n. */
327
328static void add_to_rsp ( ISelEnv* env, Int n )
329{
330   vassert(n > 0 && n < 256 && (n%8) == 0);
331   addInstr(env,
332            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
333                                        hregAMD64_RSP()));
334}
335
336static void sub_from_rsp ( ISelEnv* env, Int n )
337{
338   vassert(n > 0 && n < 256 && (n%8) == 0);
339   addInstr(env,
340            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
341                                        hregAMD64_RSP()));
342}
343
344/* Push a 64-bit constant on the stack. */
345static void push_uimm64( ISelEnv* env, ULong uimm64 )
346{
347   /* If uimm64 can be expressed as the sign extension of its
348      lower 32 bits, we can do it the easy way. */
349   Long simm64 = (Long)uimm64;
350   if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
351      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
352   } else {
353      HReg tmp = newVRegI(env);
354      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
355      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
356   }
357}
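
/* For example (illustration only): 0xFFFFFFFFFFFFFFFFULL sign-extends from
   its low 32 bits, so a single pushq of the imm32 0xFFFFFFFF suffices
   (pushq sign-extends its 32-bit immediate to 64 bits); by contrast,
   0x0000000080000000ULL does not, so it is first materialised into a
   temporary with Imm64 and pushed from there. */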
358
359
360/* Used only in doHelperCall.  If possible, produce a single
361   instruction which computes 'e' into 'dst'.  If not possible, return
362   NULL. */
363
364static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
365                                                    HReg     dst,
366                                                    IRExpr*  e )
367{
368   /* Per comments in doHelperCall below, appearance of
369      Iex_VECRET implies ill-formed IR. */
370   vassert(e->tag != Iex_VECRET);
371
372   /* In this case we give out a copy of the BaseBlock pointer. */
373   if (UNLIKELY(e->tag == Iex_GSPTR)) {
374      return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
375   }
376
377   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
378
379   if (e->tag == Iex_Const) {
380      vassert(e->Iex.Const.con->tag == Ico_U64);
381      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
382         return AMD64Instr_Alu64R(
383                   Aalu_MOV,
384                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
385                   dst
386                );
387      } else {
388         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
389      }
390   }
391
392   if (e->tag == Iex_RdTmp) {
393      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
394      return mk_iMOVsd_RR(src, dst);
395   }
396
397   if (e->tag == Iex_Get) {
398      vassert(e->Iex.Get.ty == Ity_I64);
399      return AMD64Instr_Alu64R(
400                Aalu_MOV,
401                AMD64RMI_Mem(
402                   AMD64AMode_IR(e->Iex.Get.offset,
403                                 hregAMD64_RBP())),
404                dst);
405   }
406
407   if (e->tag == Iex_Unop
408       && e->Iex.Unop.op == Iop_32Uto64
409       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
410      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
411      return AMD64Instr_MovxLQ(False, src, dst);
412   }
413
414   if (0) { ppIRExpr(e); vex_printf("\n"); }
415
416   return NULL;
417}
418
419
420/* Do a complete function call.  |guard| is a Ity_Bit expression
421   indicating whether or not the call happens.  If guard==NULL, the
422   call is unconditional.  |retloc| is set to indicate where the
423   return value is after the call.  The caller (of this fn) must
424   generate code to add |stackAdjustAfterCall| to the stack pointer
425   after the call is done. */
426
427static
428void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
429                    /*OUT*/RetLoc* retloc,
430                    ISelEnv* env,
431                    IRExpr* guard,
432                    IRCallee* cee, IRType retTy, IRExpr** args )
433{
434   AMD64CondCode cc;
435   HReg          argregs[6];
436   HReg          tmpregs[6];
437   AMD64Instr*   fastinstrs[6];
438   UInt          n_args, i;
439
440   /* Set default returns.  We'll update them later if needed. */
441   *stackAdjustAfterCall = 0;
442   *retloc               = mk_RetLoc_INVALID();
443
444   /* These are used for cross-checking that IR-level constraints on
445      the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
446   UInt nVECRETs = 0;
447   UInt nGSPTRs  = 0;
448
449   /* Marshal args for a call and do the call.
450
451      This function only deals with a tiny set of possibilities, which
452      cover all helpers in practice.  The restrictions are that only
453      arguments in registers are supported, hence only 6x64 integer
454      bits in total can be passed.  In fact the only supported arg
455      type is I64.
456
457      The return type can be I{64,32,16,8} or V{128,256}.  In the
458      latter two cases, it is expected that |args| will contain the
459      special node IRExpr_VECRET(), in which case this routine
460      generates code to allocate space on the stack for the vector
461      return value.  Since we are not passing any scalars on the
462      stack, it is enough to preallocate the return space before
463      marshalling any arguments, in this case.
464
465      |args| may also contain IRExpr_GSPTR(), in which case the
466      value in %rbp is passed as the corresponding argument.
467
468      Generating code which is both efficient and correct when
469      parameters are to be passed in registers is difficult, for the
470      reasons elaborated in detail in comments attached to
471      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
472      of the method described in those comments.
473
474      The problem is split into two cases: the fast scheme and the
475      slow scheme.  In the fast scheme, arguments are computed
476      directly into the target (real) registers.  This is only safe
477      when we can be sure that computation of each argument will not
478      trash any real registers set by computation of any other
479      argument.
480
481      In the slow scheme, all args are first computed into vregs, and
482      once they are all done, they are moved to the relevant real
483      regs.  This always gives correct code, but it also gives a bunch
484      of vreg-to-rreg moves which are usually redundant but are hard
485      for the register allocator to get rid of.
486
487      To decide which scheme to use, all argument expressions are
488      first examined.  If they are all so simple that it is clear they
489      will be evaluated without use of any fixed registers, use the
490      fast scheme, else use the slow scheme.  Note also that only
491      unconditional calls may use the fast scheme, since having to
492      compute a condition expression could itself trash real
493      registers.  Note that for simplicity, in the case where
494      IRExpr_VECRET() is present, we use the slow scheme.  This is
495      motivated by the desire to avoid any possible complexity
496      w.r.t. nested calls.
497
498      Note this requires being able to examine an expression and
499      determine whether or not evaluation of it might use a fixed
500      register.  That requires knowledge of how the rest of this insn
501      selector works.  Currently just the following 3 are regarded as
502      safe -- hopefully they cover the majority of arguments in
503      practice: IRExpr_Tmp, IRExpr_Const and IRExpr_Get.
504   */
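   /* For instance (purely illustrative): a call whose arguments are all
      RdTmps, 64-bit Gets, constants or IRExpr_GSPTR() qualifies for the
      fast scheme, since each can be computed into its target register
      with a single instruction; an argument such as Add64(t1,t2) forces
      the slow scheme, because it cannot be selected as a single
      instruction here. */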
505
506   /* Note that the cee->regparms field is meaningless on AMD64 host
507      (since there is only one calling convention) and so we always
508      ignore it. */
509   n_args = 0;
510   for (i = 0; args[i]; i++)
511      n_args++;
512
513   if (n_args > 6)
514      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
515
516   argregs[0] = hregAMD64_RDI();
517   argregs[1] = hregAMD64_RSI();
518   argregs[2] = hregAMD64_RDX();
519   argregs[3] = hregAMD64_RCX();
520   argregs[4] = hregAMD64_R8();
521   argregs[5] = hregAMD64_R9();
522
523   tmpregs[0] = tmpregs[1] = tmpregs[2] =
524   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
525
526   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
527   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
528
529   /* First decide which scheme (slow or fast) is to be used.  Assume
530      the fast scheme to begin with, and fall back to the slow one if
531      any contraindications (wow) appear. */
532
533   /* We'll need space on the stack for the return value.  Avoid
534      possible complications with nested calls by using the slow
535      scheme. */
536   if (retTy == Ity_V128 || retTy == Ity_V256)
537      goto slowscheme;
538
539   if (guard) {
540      if (guard->tag == Iex_Const
541          && guard->Iex.Const.con->tag == Ico_U1
542          && guard->Iex.Const.con->Ico.U1 == True) {
543         /* unconditional */
544      } else {
545         /* Not manifestly unconditional -- be conservative. */
546         goto slowscheme;
547      }
548   }
549
550   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
551      use the slow scheme.  Because this is tentative, we can't call
552      addInstr (that is, commit to) any instructions until we've
553      handled all the arguments.  So park the resulting instructions
554      in a buffer and emit that if we're successful. */
555
556   /* FAST SCHEME */
557   /* In this loop, we process args that can be computed into the
558      destination (real) register with a single instruction, without
559      using any fixed regs.  That also includes IRExpr_GSPTR(), but
560      not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
561      never see IRExpr_VECRET() at this point, since the return-type
562      check above should ensure all those cases use the slow scheme
563      instead. */
564   vassert(n_args >= 0 && n_args <= 6);
565   for (i = 0; i < n_args; i++) {
566      IRExpr* arg = args[i];
567      if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
568         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
569      }
570      fastinstrs[i]
571         = iselIntExpr_single_instruction( env, argregs[i], args[i] );
572      if (fastinstrs[i] == NULL)
573         goto slowscheme;
574   }
575
576   /* Looks like we're in luck.  Emit the accumulated instructions and
577      move on to doing the call itself. */
578   for (i = 0; i < n_args; i++)
579      addInstr(env, fastinstrs[i]);
580
581   /* Fast scheme only applies for unconditional calls.  Hence: */
582   cc = Acc_ALWAYS;
583
584   goto handle_call;
585
586
587   /* SLOW SCHEME; move via temporaries */
588  slowscheme:
589   {}
590#  if 0 /* debug only */
591   if (n_args > 0) {for (i = 0; args[i]; i++) {
592   ppIRExpr(args[i]); vex_printf(" "); }
593   vex_printf("\n");}
594#  endif
595
596   /* If we have a vector return type, allocate a place for it on the
597      stack and record its address. */
598   HReg r_vecRetAddr = INVALID_HREG;
599   if (retTy == Ity_V128) {
600      r_vecRetAddr = newVRegI(env);
601      sub_from_rsp(env, 16);
602      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
603   }
604   else if (retTy == Ity_V256) {
605      r_vecRetAddr = newVRegI(env);
606      sub_from_rsp(env, 32);
607      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
608   }
609
610   vassert(n_args >= 0 && n_args <= 6);
611   for (i = 0; i < n_args; i++) {
612      IRExpr* arg = args[i];
613      if (UNLIKELY(arg->tag == Iex_GSPTR)) {
614         tmpregs[i] = newVRegI(env);
615         addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
616         nGSPTRs++;
617      }
618      else if (UNLIKELY(arg->tag == Iex_VECRET)) {
619         /* We stashed the address of the return slot earlier, so just
620            retrieve it now. */
621         vassert(!hregIsInvalid(r_vecRetAddr));
622         tmpregs[i] = r_vecRetAddr;
623         nVECRETs++;
624      }
625      else {
626         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
627         tmpregs[i] = iselIntExpr_R(env, args[i]);
628      }
629   }
630
631   /* Now we can compute the condition.  We can't do it earlier
632      because the argument computations could trash the condition
633      codes.  Be a bit clever to handle the common case where the
634      guard is 1:Bit. */
635   cc = Acc_ALWAYS;
636   if (guard) {
637      if (guard->tag == Iex_Const
638          && guard->Iex.Const.con->tag == Ico_U1
639          && guard->Iex.Const.con->Ico.U1 == True) {
640         /* unconditional -- do nothing */
641      } else {
642         cc = iselCondCode( env, guard );
643      }
644   }
645
646   /* Move the args to their final destinations. */
647   for (i = 0; i < n_args; i++) {
648      /* None of these insns, including any spill code that might
649         be generated, may alter the condition codes. */
650      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
651   }
652
653
654   /* Do final checks, set the return values, and generate the call
655      instruction proper. */
656  handle_call:
657
658   if (retTy == Ity_V128 || retTy == Ity_V256) {
659      vassert(nVECRETs == 1);
660   } else {
661      vassert(nVECRETs == 0);
662   }
663
664   vassert(nGSPTRs == 0 || nGSPTRs == 1);
665
666   vassert(*stackAdjustAfterCall == 0);
667   vassert(is_RetLoc_INVALID(*retloc));
668   switch (retTy) {
669         case Ity_INVALID:
670            /* Function doesn't return a value. */
671            *retloc = mk_RetLoc_simple(RLPri_None);
672            break;
673         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
674            *retloc = mk_RetLoc_simple(RLPri_Int);
675            break;
676         case Ity_V128:
677            *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
678            *stackAdjustAfterCall = 16;
679            break;
680         case Ity_V256:
681            *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
682            *stackAdjustAfterCall = 32;
683            break;
684         default:
685            /* IR can denote other possible return types, but we don't
686               handle those here. */
687           vassert(0);
688   }
689
690   /* Finally, generate the call itself.  This needs the *retloc value
691      set in the switch above, which is why it's at the end. */
692   addInstr(env,
693            AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
694}
695
696
697/* Given a guest-state array descriptor, an index expression and a
698   bias, generate an AMD64AMode holding the relevant guest state
699   offset. */
700
701static
702AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
703                                  IRExpr* off, Int bias )
704{
705   HReg tmp, roff;
706   Int  elemSz = sizeofIRType(descr->elemTy);
707   Int  nElems = descr->nElems;
708
709   /* Throw out any cases not generated by an amd64 front end.  In
710      theory there might be a day where we need to handle them -- if
711      we ever run non-amd64-guest on amd64 host. */
712
713   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
714      vpanic("genGuestArrayOffset(amd64 host)");
715
716   /* Compute off into a reg, %off.  Then return:
717
718         movq %off, %tmp
719         addq $bias, %tmp  (if bias != 0)
720         andq $7, %tmp
721         ... base(%rbp, %tmp, shift) ...
722   */
723   tmp  = newVRegI(env);
724   roff = iselIntExpr_R(env, off);
725   addInstr(env, mk_iMOVsd_RR(roff, tmp));
726   if (bias != 0) {
727      /* Make sure the bias is sane, in the sense that there are
728         no significant bits above bit 30 in it. */
729      vassert(-10000 < bias && bias < 10000);
730      addInstr(env,
731               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
732   }
733   addInstr(env,
734            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
735   vassert(elemSz == 1 || elemSz == 8);
736   return
737      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
738                                    elemSz==8 ? 3 : 0);
739}
740
741
742/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
743static
744void set_SSE_rounding_default ( ISelEnv* env )
745{
746   /* pushq $DEFAULT_MXCSR
747      ldmxcsr 0(%rsp)
748      addq $8, %rsp
749   */
750   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
751   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
752   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
753   add_to_rsp(env, 8);
754}
755
756/* Mess with the FPU's rounding mode: set to the default rounding mode
757   (DEFAULT_FPUCW). */
758static
759void set_FPU_rounding_default ( ISelEnv* env )
760{
761   /* movq $DEFAULT_FPUCW, -8(%rsp)
762      fldcw -8(%rsp)
763   */
764   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
765   addInstr(env, AMD64Instr_Alu64M(
766                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
767   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
768}
769
770
771/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
772   expression denoting a value in the range 0 .. 3, indicating a round
773   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
774   have the same rounding.
775*/
776static
777void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
778{
779   /* Note: this sequence only makes sense because DEFAULT_MXCSR has
780      both rounding bits == 0.  If that wasn't the case, we couldn't
781      create a new rounding field simply by ORing the new value into
782      place. */
783
784   /* movq $3, %reg
785      andq [[mode]], %reg  -- shouldn't be needed; paranoia
786      shlq $13, %reg
787      orq $DEFAULT_MXCSR, %reg
788      pushq %reg
789      ldmxcsr 0(%rsp)
790      addq $8, %rsp
791   */
792   HReg        reg      = newVRegI(env);
793   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
794   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
795   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
796                                   iselIntExpr_RMI(env, mode), reg));
797   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
798   addInstr(env, AMD64Instr_Alu64R(
799                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
800   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
801   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
802   add_to_rsp(env, 8);
803}
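
/* Worked example (illustrative; MXCSR.RC occupies bits 14:13): for mode ==
   Irrm_ZERO (3), the sequence above computes (3 << 13) | DEFAULT_MXCSR
   == 0x6000 | 0x1F80 == 0x7F80, i.e. the default MXCSR with RC = 11b,
   round towards zero.  The direct shift works because the IRRoundingMode
   encoding (0..3) matches the hardware RC encoding. */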
804
805
806/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
807   expression denoting a value in the range 0 .. 3, indicating a round
808   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
809   the same rounding.
810*/
811static
812void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
813{
814   HReg rrm  = iselIntExpr_R(env, mode);
815   HReg rrm2 = newVRegI(env);
816   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
817
818   /* movq  %rrm, %rrm2
819      andq  $3, %rrm2   -- shouldn't be needed; paranoia
820      shlq  $10, %rrm2
821      orq   $DEFAULT_FPUCW, %rrm2
822      movq  %rrm2, -8(%rsp)
823      fldcw -8(%rsp)
824   */
825   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
826   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
827   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
828   addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
829                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
830   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
831                                   AMD64RI_Reg(rrm2), m8_rsp));
832   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
833}
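
/* Worked example (illustrative; x87 CW.RC occupies bits 11:10): for mode ==
   Irrm_PosINF (2), the sequence above computes (2 << 10) | DEFAULT_FPUCW
   == 0x0800 | 0x027F == 0x0A7F, i.e. the default control word with
   RC = 10b, round towards +infinity. */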
834
835
836/* Generate all-zeroes into a new vector register.
837*/
838static HReg generate_zeroes_V128 ( ISelEnv* env )
839{
840   HReg dst = newVRegV(env);
841   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
842   return dst;
843}
844
845/* Generate all-ones into a new vector register.
846*/
847static HReg generate_ones_V128 ( ISelEnv* env )
848{
849   HReg dst = newVRegV(env);
850   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
851   return dst;
852}
853
854
855/* Generate !src into a new vector register.  Amazing that there isn't
856   a less crappy way to do this.
857*/
858static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
859{
860   HReg dst = generate_ones_V128(env);
861   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
862   return dst;
863}
864
865
866/* Expand the given byte into a 64-bit word, by cloning each bit
867   8 times. */
868static ULong bitmask8_to_bytemask64 ( UShort w8 )
869{
870   vassert(w8 == (w8 & 0xFF));
871   ULong w64 = 0;
872   Int i;
873   for (i = 0; i < 8; i++) {
874      if (w8 & (1<<i))
875         w64 |= (0xFFULL << (8 * i));
876   }
877   return w64;
878}
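
/* For instance, bitmask8_to_bytemask64(0xA5) == 0xFF00FF0000FF00FFULL,
   since 0xA5 == 10100101b (bits 0, 2, 5 and 7 set). */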
879
880
881/*---------------------------------------------------------*/
882/*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
883/*---------------------------------------------------------*/
884
885/* Select insns for an integer-typed expression, and add them to the
886   code list.  Return a reg holding the result.  This reg will be a
887   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
888   want to modify it, ask for a new vreg, copy it in there, and modify
889   the copy.  The register allocator will do its best to map both
890   vregs to the same real register, so the copies will often disappear
891   later in the game.
892
893   This should handle expressions of 64, 32, 16 and 8-bit type.  All
894   results are returned in a 64-bit register.  For 32-, 16- and 8-bit
895   expressions, the upper 32/48/56 bits are arbitrary, so you should
896   mask or sign extend partial values if necessary.
897*/
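/* For example, after selecting an Ity_I8 expression the returned
   register's bits 63:8 may be junk; this is why cases below such as
   Iop_Shr8 explicitly AND with 0xFF before shifting. */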
898
899static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
900{
901   HReg r = iselIntExpr_R_wrk(env, e);
902   /* sanity checks ... */
903#  if 0
904   vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
905#  endif
906   vassert(hregClass(r) == HRcInt64);
907   vassert(hregIsVirtual(r));
908   return r;
909}
910
911/* DO NOT CALL THIS DIRECTLY ! */
912static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
913{
914   /* Used for unary/binary SIMD64 ops. */
915   HWord fn = 0;
916   Bool second_is_UInt;
917
918   MatchInfo mi;
919   DECLARE_PATTERN(p_1Uto8_64to1);
920   DECLARE_PATTERN(p_LDle8_then_8Uto64);
921   DECLARE_PATTERN(p_LDle16_then_16Uto64);
922
923   IRType ty = typeOfIRExpr(env->type_env,e);
924   switch (ty) {
925      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
926      default: vassert(0);
927   }
928
929   switch (e->tag) {
930
931   /* --------- TEMP --------- */
932   case Iex_RdTmp: {
933      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
934   }
935
936   /* --------- LOAD --------- */
937   case Iex_Load: {
938      HReg dst = newVRegI(env);
939      AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
940
941      /* We can't handle big-endian loads, nor load-linked. */
942      if (e->Iex.Load.end != Iend_LE)
943         goto irreducible;
944
945      if (ty == Ity_I64) {
946         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
947                                         AMD64RMI_Mem(amode), dst) );
948         return dst;
949      }
950      if (ty == Ity_I32) {
951         addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
952         return dst;
953      }
954      if (ty == Ity_I16) {
955         addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
956         return dst;
957      }
958      if (ty == Ity_I8) {
959         addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
960         return dst;
961      }
962      break;
963   }
964
965   /* --------- BINARY OP --------- */
966   case Iex_Binop: {
967      AMD64AluOp   aluOp;
968      AMD64ShiftOp shOp;
969
970      /* Pattern: Sub64(0,x) */
971      /*     and: Sub32(0,x) */
972      if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
973          || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
974         HReg dst = newVRegI(env);
975         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
976         addInstr(env, mk_iMOVsd_RR(reg,dst));
977         addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
978         return dst;
979      }
980
981      /* Is it an addition or logical style op? */
982      switch (e->Iex.Binop.op) {
983         case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
984            aluOp = Aalu_ADD; break;
985         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
986            aluOp = Aalu_SUB; break;
987         case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
988            aluOp = Aalu_AND; break;
989         case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
990            aluOp = Aalu_OR; break;
991         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
992            aluOp = Aalu_XOR; break;
993         case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
994            aluOp = Aalu_MUL; break;
995         default:
996            aluOp = Aalu_INVALID; break;
997      }
998      /* For commutative ops we assume any literal
999         values are on the second operand. */
1000      if (aluOp != Aalu_INVALID) {
1001         HReg dst      = newVRegI(env);
1002         HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
1003         AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1004         addInstr(env, mk_iMOVsd_RR(reg,dst));
1005         addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1006         return dst;
1007      }
1008
1009      /* Perhaps a shift op? */
1010      switch (e->Iex.Binop.op) {
1011         case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1012            shOp = Ash_SHL; break;
1013         case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1014            shOp = Ash_SHR; break;
1015         case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1016            shOp = Ash_SAR; break;
1017         default:
1018            shOp = Ash_INVALID; break;
1019      }
1020      if (shOp != Ash_INVALID) {
1021         HReg dst = newVRegI(env);
1022
1023         /* regL = the value to be shifted */
1024         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1025         addInstr(env, mk_iMOVsd_RR(regL,dst));
1026
1027         /* Do any necessary widening for 32/16/8 bit operands */
1028         switch (e->Iex.Binop.op) {
1029            case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1030               break;
1031            case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1032               break;
1033            case Iop_Shr8:
1034               addInstr(env, AMD64Instr_Alu64R(
1035                                Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1036               break;
1037            case Iop_Shr16:
1038               addInstr(env, AMD64Instr_Alu64R(
1039                                Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1040               break;
1041            case Iop_Shr32:
1042               addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1043               break;
1044            case Iop_Sar8:
1045               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1046               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1047               break;
1048            case Iop_Sar16:
1049               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1050               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1051               break;
1052            case Iop_Sar32:
1053               addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1054               break;
1055            default:
1056               ppIROp(e->Iex.Binop.op);
1057               vassert(0);
1058         }
1059
1060         /* Now consider the shift amount.  If it's a literal, we
1061            can do a much better job than the general case. */
1062         if (e->Iex.Binop.arg2->tag == Iex_Const) {
1063            /* assert that the IR is well-typed */
1064            Int nshift;
1065            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1066            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1067            vassert(nshift >= 0);
1068            if (nshift > 0)
1069               /* Can't allow nshift==0 since that means %cl */
1070               addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1071         } else {
1072            /* General case; we have to force the amount into %cl. */
1073            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1074            addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1075            addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1076         }
1077         return dst;
1078      }
1079
1080      /* Deal with 64-bit SIMD binary ops */
1081      second_is_UInt = False;
1082      switch (e->Iex.Binop.op) {
1083         case Iop_Add8x8:
1084            fn = (HWord)h_generic_calc_Add8x8; break;
1085         case Iop_Add16x4:
1086            fn = (HWord)h_generic_calc_Add16x4; break;
1087         case Iop_Add32x2:
1088            fn = (HWord)h_generic_calc_Add32x2; break;
1089
1090         case Iop_Avg8Ux8:
1091            fn = (HWord)h_generic_calc_Avg8Ux8; break;
1092         case Iop_Avg16Ux4:
1093            fn = (HWord)h_generic_calc_Avg16Ux4; break;
1094
1095         case Iop_CmpEQ8x8:
1096            fn = (HWord)h_generic_calc_CmpEQ8x8; break;
1097         case Iop_CmpEQ16x4:
1098            fn = (HWord)h_generic_calc_CmpEQ16x4; break;
1099         case Iop_CmpEQ32x2:
1100            fn = (HWord)h_generic_calc_CmpEQ32x2; break;
1101
1102         case Iop_CmpGT8Sx8:
1103            fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
1104         case Iop_CmpGT16Sx4:
1105            fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
1106         case Iop_CmpGT32Sx2:
1107            fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
1108
1109         case Iop_InterleaveHI8x8:
1110            fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
1111         case Iop_InterleaveLO8x8:
1112            fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
1113         case Iop_InterleaveHI16x4:
1114            fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
1115         case Iop_InterleaveLO16x4:
1116            fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
1117         case Iop_InterleaveHI32x2:
1118            fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
1119         case Iop_InterleaveLO32x2:
1120            fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
1121         case Iop_CatOddLanes16x4:
1122            fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1123         case Iop_CatEvenLanes16x4:
1124            fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1125         case Iop_Perm8x8:
1126            fn = (HWord)h_generic_calc_Perm8x8; break;
1127
1128         case Iop_Max8Ux8:
1129            fn = (HWord)h_generic_calc_Max8Ux8; break;
1130         case Iop_Max16Sx4:
1131            fn = (HWord)h_generic_calc_Max16Sx4; break;
1132         case Iop_Min8Ux8:
1133            fn = (HWord)h_generic_calc_Min8Ux8; break;
1134         case Iop_Min16Sx4:
1135            fn = (HWord)h_generic_calc_Min16Sx4; break;
1136
1137         case Iop_Mul16x4:
1138            fn = (HWord)h_generic_calc_Mul16x4; break;
1139         case Iop_Mul32x2:
1140            fn = (HWord)h_generic_calc_Mul32x2; break;
1141         case Iop_MulHi16Sx4:
1142            fn = (HWord)h_generic_calc_MulHi16Sx4; break;
1143         case Iop_MulHi16Ux4:
1144            fn = (HWord)h_generic_calc_MulHi16Ux4; break;
1145
1146         case Iop_QAdd8Sx8:
1147            fn = (HWord)h_generic_calc_QAdd8Sx8; break;
1148         case Iop_QAdd16Sx4:
1149            fn = (HWord)h_generic_calc_QAdd16Sx4; break;
1150         case Iop_QAdd8Ux8:
1151            fn = (HWord)h_generic_calc_QAdd8Ux8; break;
1152         case Iop_QAdd16Ux4:
1153            fn = (HWord)h_generic_calc_QAdd16Ux4; break;
1154
1155         case Iop_QNarrowBin32Sto16Sx4:
1156            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1157         case Iop_QNarrowBin16Sto8Sx8:
1158            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1159         case Iop_QNarrowBin16Sto8Ux8:
1160            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1161         case Iop_NarrowBin16to8x8:
1162            fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1163         case Iop_NarrowBin32to16x4:
1164            fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1165
1166         case Iop_QSub8Sx8:
1167            fn = (HWord)h_generic_calc_QSub8Sx8; break;
1168         case Iop_QSub16Sx4:
1169            fn = (HWord)h_generic_calc_QSub16Sx4; break;
1170         case Iop_QSub8Ux8:
1171            fn = (HWord)h_generic_calc_QSub8Ux8; break;
1172         case Iop_QSub16Ux4:
1173            fn = (HWord)h_generic_calc_QSub16Ux4; break;
1174
1175         case Iop_Sub8x8:
1176            fn = (HWord)h_generic_calc_Sub8x8; break;
1177         case Iop_Sub16x4:
1178            fn = (HWord)h_generic_calc_Sub16x4; break;
1179         case Iop_Sub32x2:
1180            fn = (HWord)h_generic_calc_Sub32x2; break;
1181
1182         case Iop_ShlN32x2:
1183            fn = (HWord)h_generic_calc_ShlN32x2;
1184            second_is_UInt = True;
1185            break;
1186         case Iop_ShlN16x4:
1187            fn = (HWord)h_generic_calc_ShlN16x4;
1188            second_is_UInt = True;
1189            break;
1190         case Iop_ShlN8x8:
1191            fn = (HWord)h_generic_calc_ShlN8x8;
1192            second_is_UInt = True;
1193            break;
1194         case Iop_ShrN32x2:
1195            fn = (HWord)h_generic_calc_ShrN32x2;
1196            second_is_UInt = True;
1197            break;
1198         case Iop_ShrN16x4:
1199            fn = (HWord)h_generic_calc_ShrN16x4;
1200            second_is_UInt = True;
1201            break;
1202         case Iop_SarN32x2:
1203            fn = (HWord)h_generic_calc_SarN32x2;
1204            second_is_UInt = True;
1205            break;
1206         case Iop_SarN16x4:
1207            fn = (HWord)h_generic_calc_SarN16x4;
1208            second_is_UInt = True;
1209            break;
1210         case Iop_SarN8x8:
1211            fn = (HWord)h_generic_calc_SarN8x8;
1212            second_is_UInt = True;
1213            break;
1214
1215         default:
1216            fn = (HWord)0; break;
1217      }
1218      if (fn != (HWord)0) {
1219         /* Note: the following assumes all helpers are of signature
1220               ULong fn ( ULong, ULong ), and they are
1221            not marked as regparm functions.
1222         */
1223         HReg dst  = newVRegI(env);
1224         HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1225         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1226         if (second_is_UInt)
1227            addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1228         addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1229         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1230         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1231                                        mk_RetLoc_simple(RLPri_Int) ));
1232         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1233         return dst;
1234      }
1235
1236      /* Handle misc other ops. */
1237
1238      if (e->Iex.Binop.op == Iop_Max32U) {
1239         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1240         HReg dst  = newVRegI(env);
1241         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1242         addInstr(env, mk_iMOVsd_RR(src1, dst));
1243         addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1244         addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1245         return dst;
1246      }
1247
1248      if (e->Iex.Binop.op == Iop_DivModS64to32
1249          || e->Iex.Binop.op == Iop_DivModU64to32) {
1250         /* 64 x 32 -> (32(rem),32(div)) division */
1251         /* Get the 64-bit operand into edx:eax, and the other into
1252            any old R/M. */
1253         HReg      rax     = hregAMD64_RAX();
1254         HReg      rdx     = hregAMD64_RDX();
1255         HReg      dst     = newVRegI(env);
1256         Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1257         AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1258         /* Compute the left operand into a reg, and then
1259            put the top half in edx and the bottom in eax. */
1260         HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1261         addInstr(env, mk_iMOVsd_RR(left64, rdx));
1262         addInstr(env, mk_iMOVsd_RR(left64, rax));
1263         addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1264         addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1265         addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1266         addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1267         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1268         addInstr(env, mk_iMOVsd_RR(rax, dst));
1269         addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1270         return dst;
1271      }
1272
1273      if (e->Iex.Binop.op == Iop_32HLto64) {
1274         HReg hi32  = newVRegI(env);
1275         HReg lo32  = newVRegI(env);
1276         HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1277         HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1278         addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1279         addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1280         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1281         addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1282         addInstr(env, AMD64Instr_Alu64R(
1283                          Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1284         return hi32;
1285      }
1286
1287      if (e->Iex.Binop.op == Iop_16HLto32) {
1288         HReg hi16  = newVRegI(env);
1289         HReg lo16  = newVRegI(env);
1290         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1291         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1292         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1293         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1294         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1295         addInstr(env, AMD64Instr_Alu64R(
1296                          Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1297         addInstr(env, AMD64Instr_Alu64R(
1298                          Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1299         return hi16;
1300      }
1301
1302      if (e->Iex.Binop.op == Iop_8HLto16) {
1303         HReg hi8  = newVRegI(env);
1304         HReg lo8  = newVRegI(env);
1305         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1306         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1307         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1308         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1309         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1310         addInstr(env, AMD64Instr_Alu64R(
1311                          Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1312         addInstr(env, AMD64Instr_Alu64R(
1313                          Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1314         return hi8;
1315      }
1316
1317      if (e->Iex.Binop.op == Iop_MullS32
1318          || e->Iex.Binop.op == Iop_MullS16
1319          || e->Iex.Binop.op == Iop_MullS8
1320          || e->Iex.Binop.op == Iop_MullU32
1321          || e->Iex.Binop.op == Iop_MullU16
1322          || e->Iex.Binop.op == Iop_MullU8) {
1323         HReg a32   = newVRegI(env);
1324         HReg b32   = newVRegI(env);
1325         HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1326         HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1327         Int          shift  = 0;
1328         AMD64ShiftOp shr_op = Ash_SHR;
1329         switch (e->Iex.Binop.op) {
1330            case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1331            case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1332            case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
1333            case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1334            case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1335            case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
1336            default: vassert(0);
1337         }
1338
1339         addInstr(env, mk_iMOVsd_RR(a32s, a32));
1340         addInstr(env, mk_iMOVsd_RR(b32s, b32));
1341         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1342         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1343         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
1344         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
1345         addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1346         return b32;
1347      }
1348
1349      if (e->Iex.Binop.op == Iop_CmpF64) {
1350         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1351         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1352         HReg dst = newVRegI(env);
1353         addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1354         /* Mask out irrelevant parts of the result so as to conform
1355            to the CmpF64 definition. */
1356         addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1357         return dst;
1358      }
1359
1360      if (e->Iex.Binop.op == Iop_F64toI32S
1361          || e->Iex.Binop.op == Iop_F64toI64S) {
1362         Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1363         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1364         HReg dst = newVRegI(env);
1365         set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1366         addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1367         set_SSE_rounding_default(env);
1368         return dst;
1369      }
1370
1371      break;
1372   }
1373
1374   /* --------- UNARY OP --------- */
1375   case Iex_Unop: {
1376
1377      /* 1Uto8(64to1(expr64)) */
1378      {
1379         DEFINE_PATTERN( p_1Uto8_64to1,
1380                         unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1381         if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1382            const IRExpr* expr64 = mi.bindee[0];
1383            HReg    dst    = newVRegI(env);
1384            HReg    src    = iselIntExpr_R(env, expr64);
1385            addInstr(env, mk_iMOVsd_RR(src,dst) );
1386            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1387                                            AMD64RMI_Imm(1), dst));
1388            return dst;
1389         }
1390      }
1391
1392      /* 8Uto64(LDle(expr64)) */
1393      {
1394         DEFINE_PATTERN(p_LDle8_then_8Uto64,
1395                        unop(Iop_8Uto64,
1396                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1397         if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1398            HReg dst = newVRegI(env);
1399            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1400            addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1401            return dst;
1402         }
1403      }
1404
1405      /* 16Uto64(LDle(expr64)) */
1406      {
1407         DEFINE_PATTERN(p_LDle16_then_16Uto64,
1408                        unop(Iop_16Uto64,
1409                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1410         if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1411            HReg dst = newVRegI(env);
1412            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1413            addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1414            return dst;
1415         }
1416      }
1417
1418      /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1419         Use 32 bit arithmetic and let the default zero-extend rule
1420         do the 32Uto64 for free. */
1421      if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1422         IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1423         IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1424         IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1425         AMD64AluOp aluOp = Aalu_INVALID;
1426         switch (opi) {
1427            case Iop_Add32: aluOp = Aalu_ADD; break;
1428            case Iop_Sub32: aluOp = Aalu_SUB; break;
1429            case Iop_And32: aluOp = Aalu_AND; break;
1430            case Iop_Or32:  aluOp = Aalu_OR;  break;
1431            case Iop_Xor32: aluOp = Aalu_XOR; break;
1432            default: break;
1433         }
1434         if (aluOp != Aalu_INVALID) {
1435            /* For commutative ops we assume any literal values are on
1436               the second operand. */
1437            HReg dst      = newVRegI(env);
1438            HReg reg      = iselIntExpr_R(env, argL);
1439            AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1440            addInstr(env, mk_iMOVsd_RR(reg,dst));
1441            addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1442            return dst;
1443         }
1444         /* just fall through to normal handling for Iop_32Uto64 */
1445      }
1446
1447      /* Fallback cases */
1448      switch (e->Iex.Unop.op) {
1449         case Iop_32Uto64:
1450         case Iop_32Sto64: {
1451            HReg dst = newVRegI(env);
1452            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1453            addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1454                                            src, dst) );
1455            return dst;
1456         }
1457         case Iop_128HIto64: {
1458            HReg rHi, rLo;
1459            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1460            return rHi; /* and abandon rLo */
1461         }
1462         case Iop_128to64: {
1463            HReg rHi, rLo;
1464            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1465            return rLo; /* and abandon rHi */
1466         }
1467         case Iop_8Uto16:
1468         case Iop_8Uto32:
1469         case Iop_8Uto64:
1470         case Iop_16Uto64:
1471         case Iop_16Uto32: {
1472            HReg dst     = newVRegI(env);
1473            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1474            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1475                                   || e->Iex.Unop.op==Iop_16Uto64 );
1476            UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
1477            addInstr(env, mk_iMOVsd_RR(src,dst) );
1478            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1479                                            AMD64RMI_Imm(mask), dst));
1480            return dst;
1481         }
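         /* Signed widening is done by shifting left and then
            arithmetic-shifting right: e.g. 8Sto64 does shlq $56
            followed by sarq $56, which replicates bit 7 of the source
            into bits 63:8 of dst. */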
1482         case Iop_8Sto16:
1483         case Iop_8Sto64:
1484         case Iop_8Sto32:
1485         case Iop_16Sto32:
1486         case Iop_16Sto64: {
1487            HReg dst     = newVRegI(env);
1488            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1489            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1490                                   || e->Iex.Unop.op==Iop_16Sto64 );
1491            UInt amt     = srcIs16 ? 48 : 56;
1492            addInstr(env, mk_iMOVsd_RR(src,dst) );
1493            addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1494            addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1495            return dst;
1496         }
1497         case Iop_Not8:
1498         case Iop_Not16:
1499         case Iop_Not32:
1500         case Iop_Not64: {
1501            HReg dst = newVRegI(env);
1502            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1503            addInstr(env, mk_iMOVsd_RR(src,dst) );
1504            addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1505            return dst;
1506         }
1507         case Iop_16HIto8:
1508         case Iop_32HIto16:
1509         case Iop_64HIto32: {
1510            HReg dst  = newVRegI(env);
1511            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1512            Int shift = 0;
1513            switch (e->Iex.Unop.op) {
1514               case Iop_16HIto8:  shift = 8;  break;
1515               case Iop_32HIto16: shift = 16; break;
1516               case Iop_64HIto32: shift = 32; break;
1517               default: vassert(0);
1518            }
1519            addInstr(env, mk_iMOVsd_RR(src,dst) );
1520            addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1521            return dst;
1522         }
1523         case Iop_1Uto64:
1524         case Iop_1Uto32:
1525         case Iop_1Uto8: {
1526            HReg dst           = newVRegI(env);
1527            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1528            addInstr(env, AMD64Instr_Set64(cond,dst));
1529            return dst;
1530         }
1531         case Iop_1Sto8:
1532         case Iop_1Sto16:
1533         case Iop_1Sto32:
1534         case Iop_1Sto64: {
1535            /* could do better than this, but for now ... */
1536            HReg dst           = newVRegI(env);
1537            AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1538            addInstr(env, AMD64Instr_Set64(cond,dst));
1539            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1540            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1541            return dst;
1542         }
1543         case Iop_Ctz64: {
1544            /* Count trailing zeroes, implemented by amd64 'bsfq' */
1545            HReg dst = newVRegI(env);
1546            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1547            addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1548            return dst;
1549         }
1550         case Iop_Clz64: {
1551            /* Count leading zeroes.  Do 'bsrq' to establish the index
1552               of the highest set bit, and subtract that value from
1553               63. */
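            /* E.g. Clz64(1): bsrq yields bit index 0, and 63 - 0 = 63.
               Note that bsrq leaves its destination undefined for a zero
               source; that should be OK, since the IR Clz64/Ctz64 ops
               are themselves only defined for nonzero arguments. */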
1554            HReg tmp = newVRegI(env);
1555            HReg dst = newVRegI(env);
1556            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1557            addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1558            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1559                                            AMD64RMI_Imm(63), dst));
1560            addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1561                                            AMD64RMI_Reg(tmp), dst));
1562            return dst;
1563         }
1564
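         /* CmpwNEZ64(x) must yield all-ones if x != 0 and all-zeroes
            otherwise.  The sequence below computes (-x | x), whose top
            bit is set exactly when x != 0, and then smears that bit
            across the register with an arithmetic shift right by 63. */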
1565         case Iop_CmpwNEZ64: {
1566            HReg dst = newVRegI(env);
1567            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1568            addInstr(env, mk_iMOVsd_RR(src,dst));
1569            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1570            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1571                                            AMD64RMI_Reg(src), dst));
1572            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1573            return dst;
1574         }
1575
1576         case Iop_CmpwNEZ32: {
1577            HReg src = newVRegI(env);
1578            HReg dst = newVRegI(env);
1579            HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1580            addInstr(env, mk_iMOVsd_RR(pre,src));
1581            addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1582            addInstr(env, mk_iMOVsd_RR(src,dst));
1583            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1584            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1585                                            AMD64RMI_Reg(src), dst));
1586            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1587            return dst;
1588         }
1589
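         /* Iop_LeftN(x) is defined as x | -x.  The NEG/OR pair below
            computes that on the full 64-bit register, which should be
            fine given that only the low N bits of the result are
            regarded as significant. */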
1590         case Iop_Left8:
1591         case Iop_Left16:
1592         case Iop_Left32:
1593         case Iop_Left64: {
1594            HReg dst = newVRegI(env);
1595            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1596            addInstr(env, mk_iMOVsd_RR(src, dst));
1597            addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1598            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1599            return dst;
1600         }
1601
1602         case Iop_V128to32: {
1603            HReg        dst     = newVRegI(env);
1604            HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
1605            AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1606            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1607            addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1608            return dst;
1609         }
1610
1611         /* V128{HI}to64 */
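         /* Dump the vector at rsp-16; the low 64 bits then sit at
            rsp-16 and the high 64 bits at rsp-8, so 'off' selects the
            requested half. */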
1612         case Iop_V128HIto64:
1613         case Iop_V128to64: {
1614            HReg dst = newVRegI(env);
1615            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
1616            HReg rsp = hregAMD64_RSP();
1617            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1618            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1619            AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1620            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1621                                             16, vec, m16_rsp));
1622            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1623                                             AMD64RMI_Mem(off_rsp), dst ));
1624            return dst;
1625         }
1626
1627         case Iop_V256to64_0: case Iop_V256to64_1:
1628         case Iop_V256to64_2: case Iop_V256to64_3: {
1629            HReg vHi, vLo, vec;
1630            iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1631            /* Do the first part of the selection by deciding which of
1632               the two 128-bit registers to look at, then do the second
1633               part using the same scheme as for V128{HI}to64 above. */
1634            Int off = 0;
1635            switch (e->Iex.Unop.op) {
1636               case Iop_V256to64_0: vec = vLo; off = -16; break;
1637               case Iop_V256to64_1: vec = vLo; off =  -8; break;
1638               case Iop_V256to64_2: vec = vHi; off = -16; break;
1639               case Iop_V256to64_3: vec = vHi; off =  -8; break;
1640               default: vassert(0);
1641            }
1642            HReg        dst     = newVRegI(env);
1643            HReg        rsp     = hregAMD64_RSP();
1644            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1645            AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1646            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1647                                             16, vec, m16_rsp));
1648            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1649                                             AMD64RMI_Mem(off_rsp), dst ));
1650            return dst;
1651         }
1652
1653         /* ReinterpF64asI64(e) */
1654         /* Given an IEEE754 double, produce an I64 with the same bit
1655            pattern. */
1656         case Iop_ReinterpF64asI64: {
1657            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1658            HReg        dst    = newVRegI(env);
1659            HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
1660            /* paranoia */
1661            set_SSE_rounding_default(env);
1662            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1663            addInstr(env, AMD64Instr_Alu64R(
1664                             Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1665            return dst;
1666         }
1667
1668         /* ReinterpF32asI32(e) */
1669         /* Given an IEEE754 single, produce an I64 with the same bit
1670            pattern in the lower half. */
1671         case Iop_ReinterpF32asI32: {
1672            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1673            HReg        dst    = newVRegI(env);
1674            HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
1675            /* paranoia */
1676            set_SSE_rounding_default(env);
1677            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1678            addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1679            return dst;
1680         }
1681
1682         case Iop_16to8:
1683         case Iop_32to8:
1684         case Iop_64to8:
1685         case Iop_32to16:
1686         case Iop_64to16:
1687         case Iop_64to32:
1688            /* These are no-ops. */
1689            return iselIntExpr_R(env, e->Iex.Unop.arg);
1690
1691         case Iop_GetMSBs8x8: {
1692            /* Note: the following assumes the helper is of
1693               signature
1694                  UInt fn ( ULong ), and is not a regparm fn.
1695            */
1696            HReg dst = newVRegI(env);
1697            HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1698            fn = (HWord)h_generic_calc_GetMSBs8x8;
1699            addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1700            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1701                                           1, mk_RetLoc_simple(RLPri_Int) ));
1702            /* MovxLQ is not exactly the right thing here.  We just
1703               need to get the bottom 8 bits of RAX into dst, and zero
1704               out everything else.  Assuming that the helper returns
1705               a UInt with the top 24 bits zeroed out, it'll do,
1706               though. */
1707            addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1708            return dst;
1709         }
1710
1711         case Iop_GetMSBs8x16: {
1712            /* Note: the following assumes the helper is of signature
1713                  UInt fn ( ULong w64hi, ULong w64Lo ),
1714               and is not a regparm fn. */
1715            HReg dst = newVRegI(env);
1716            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1717            HReg rsp = hregAMD64_RSP();
1718            fn = (HWord)h_generic_calc_GetMSBs8x16;
1719            AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
1720            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1721            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1722                                             16, vec, m16_rsp));
1723            /* hi 64 bits into RDI -- the first arg */
1724            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1725                                             AMD64RMI_Mem(m8_rsp),
1726                                             hregAMD64_RDI() )); /* 1st arg */
1727            /* lo 64 bits into RSI -- the 2nd arg */
1728            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1729                                             AMD64RMI_Mem(m16_rsp),
1730                                             hregAMD64_RSI() )); /* 2nd arg */
1731            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1732                                           2, mk_RetLoc_simple(RLPri_Int) ));
1733            /* MovxLQ is not exactly the right thing here.  We just
1734               need to get the bottom 16 bits of RAX into dst, and zero
1735               out everything else.  Assuming that the helper returns
1736               a UInt with the top 16 bits zeroed out, it'll do,
1737               though. */
1738            addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1739            return dst;
1740         }
1741
1742         default:
1743            break;
1744      }
1745
1746      /* Deal with unary 64-bit SIMD ops. */
1747      switch (e->Iex.Unop.op) {
1748         case Iop_CmpNEZ32x2:
1749            fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1750         case Iop_CmpNEZ16x4:
1751            fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1752         case Iop_CmpNEZ8x8:
1753            fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1754         default:
1755            fn = (HWord)0; break;
1756      }
1757      if (fn != (HWord)0) {
1758         /* Note: the following assumes all helpers are of
1759            signature
1760               ULong fn ( ULong ), and they are
1761            not marked as regparm functions.
1762         */
1763         HReg dst = newVRegI(env);
1764         HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1765         addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1766         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1767                                        mk_RetLoc_simple(RLPri_Int) ));
1768         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1769         return dst;
1770      }
1771
1772      break;
1773   }
1774
1775   /* --------- GET --------- */
1776   case Iex_Get: {
1777      if (ty == Ity_I64) {
1778         HReg dst = newVRegI(env);
1779         addInstr(env, AMD64Instr_Alu64R(
1780                          Aalu_MOV,
1781                          AMD64RMI_Mem(
1782                             AMD64AMode_IR(e->Iex.Get.offset,
1783                                           hregAMD64_RBP())),
1784                          dst));
1785         return dst;
1786      }
1787      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1788         HReg dst = newVRegI(env);
1789         addInstr(env, AMD64Instr_LoadEX(
1790                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1791                          False,
1792                          AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1793                          dst));
1794         return dst;
1795      }
1796      break;
1797   }
1798
1799   case Iex_GetI: {
1800      AMD64AMode* am
1801         = genGuestArrayOffset(
1802              env, e->Iex.GetI.descr,
1803                   e->Iex.GetI.ix, e->Iex.GetI.bias );
1804      HReg dst = newVRegI(env);
1805      if (ty == Ity_I8) {
1806         addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1807         return dst;
1808      }
1809      if (ty == Ity_I64) {
1810         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1811         return dst;
1812      }
1813      break;
1814   }
1815
1816   /* --------- CCALL --------- */
1817   case Iex_CCall: {
1818      HReg    dst = newVRegI(env);
1819      vassert(ty == e->Iex.CCall.retty);
1820
1821      /* be very restrictive for now.  Only 64-bit ints allowed for
1822         args, and 64 or 32 bits for return type. */
1823      if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1824         goto irreducible;
1825
1826      /* Marshal args, do the call. */
1827      UInt   addToSp = 0;
1828      RetLoc rloc    = mk_RetLoc_INVALID();
1829      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1830                    e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1831      vassert(is_sane_RetLoc(rloc));
1832      vassert(rloc.pri == RLPri_Int);
1833      vassert(addToSp == 0);
1834
1835      /* Move to dst, and zero out the top 32 bits if the result type is
1836         Ity_I32.  Probably overkill, but still .. */
1837      if (e->Iex.CCall.retty == Ity_I64)
1838         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1839      else
1840         addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1841
1842      return dst;
1843   }
1844
1845   /* --------- LITERAL --------- */
1846   /* 64/32/16/8-bit literals */
1847   case Iex_Const:
1848      if (ty == Ity_I64) {
1849         HReg r = newVRegI(env);
1850         addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1851         return r;
1852      } else {
1853         AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1854         HReg      r   = newVRegI(env);
1855         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1856         return r;
1857      }
1858
1859   /* --------- MULTIPLEX --------- */
1860   case Iex_ITE: { // VFD
1861      if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1862          && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1863         HReg     r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1864         HReg     r0  = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1865         HReg     dst = newVRegI(env);
1866         addInstr(env, mk_iMOVsd_RR(r1,dst));
1867         AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1868         addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1869         return dst;
1870      }
1871      break;
1872   }
1873
1874   /* --------- TERNARY OP --------- */
1875   case Iex_Triop: {
1876      IRTriop *triop = e->Iex.Triop.details;
1877      /* C3210 flags following FPU partial remainder (fprem), both
1878         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1879      if (triop->op == Iop_PRemC3210F64
1880          || triop->op == Iop_PRem1C3210F64) {
1881         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1882         HReg        arg1   = iselDblExpr(env, triop->arg2);
1883         HReg        arg2   = iselDblExpr(env, triop->arg3);
1884         HReg        dst    = newVRegI(env);
1885         addInstr(env, AMD64Instr_A87Free(2));
1886
1887         /* one arg -> top of x87 stack */
1888         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1889         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1890
1891         /* other arg -> top of x87 stack */
1892         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1893         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1894
1895         switch (triop->op) {
1896            case Iop_PRemC3210F64:
1897               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1898               break;
1899            case Iop_PRem1C3210F64:
1900               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1901               break;
1902            default:
1903               vassert(0);
1904         }
1905         /* Ignore the result, and instead make off with the FPU's
1906            C3210 flags (in the status word). */
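         /* 0x4700 keeps exactly the C3, C2, C1 and C0 condition bits,
            which sit at bits 14, 10, 9 and 8 of the x87 status word. */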
1907         addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1908         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1909         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1910         return dst;
1911      }
1912      break;
1913   }
1914
1915   default:
1916      break;
1917   } /* switch (e->tag) */
1918
1919   /* We get here if no pattern matched. */
1920  irreducible:
1921   ppIRExpr(e);
1922   vpanic("iselIntExpr_R(amd64): cannot reduce tree");
1923}
1924
1925
1926/*---------------------------------------------------------*/
1927/*--- ISEL: Integer expression auxiliaries              ---*/
1928/*---------------------------------------------------------*/
1929
1930/* --------------------- AMODEs --------------------- */
1931
1932/* Return an AMode which computes the value of the specified
1933   expression, possibly also adding insns to the code list as a
1934   result.  The expression may only be a 64-bit one.
1935*/
1936
1937static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
1938{
1939   AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
1940   vassert(sane_AMode(am));
1941   return am;
1942}
1943
1944/* DO NOT CALL THIS DIRECTLY ! */
1945static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
1946{
1947   MatchInfo mi;
1948   DECLARE_PATTERN(p_complex);
1949   IRType ty = typeOfIRExpr(env->type_env,e);
1950   vassert(ty == Ity_I64);
1951
1952   /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
1953   /*              bind0        bind1  bind2   bind3   */
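   /* This corresponds directly to the native base + index*scale + disp32
      addressing form.  For example (with illustrative register names),
      imm8 == 3 and simm32 == 0x28 would yield the single amode
      0x28(%r_expr1,%r_expr2,8), provided the imm8 < 4 and fitsIn32Bits
      checks below succeed. */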
1954   DEFINE_PATTERN(p_complex,
1955      binop( Iop_Add64,
1956             binop( Iop_Add64,
1957                    bind(0),
1958                    binop(Iop_Shl64, bind(1), bind(2))
1959                  ),
1960             bind(3)
1961           )
1962   );
1963   if (matchIRExpr(&mi, p_complex, e)) {
1964      const IRExpr* expr1  = mi.bindee[0];
1965      const IRExpr* expr2  = mi.bindee[1];
1966      const IRExpr* imm8   = mi.bindee[2];
1967      const IRExpr* simm32 = mi.bindee[3];
1968      if (imm8->tag == Iex_Const
1969          && imm8->Iex.Const.con->tag == Ico_U8
1970          && imm8->Iex.Const.con->Ico.U8 < 4
1971          /* imm8 is OK, now check simm32 */
1972          && simm32->tag == Iex_Const
1973          && simm32->Iex.Const.con->tag == Ico_U64
1974          && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
1975         UInt shift = imm8->Iex.Const.con->Ico.U8;
1976         UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
1977         HReg r1 = iselIntExpr_R(env, expr1);
1978         HReg r2 = iselIntExpr_R(env, expr2);
1979         vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
1980         return AMD64AMode_IRRS(offset, r1, r2, shift);
1981      }
1982   }
1983
1984   /* Add64(expr1, Shl64(expr2, imm)) */
1985   if (e->tag == Iex_Binop
1986       && e->Iex.Binop.op == Iop_Add64
1987       && e->Iex.Binop.arg2->tag == Iex_Binop
1988       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
1989       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1990       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1991      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1992      if (shift == 1 || shift == 2 || shift == 3) {
1993         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1994         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1995         return AMD64AMode_IRRS(0, r1, r2, shift);
1996      }
1997   }
1998
1999   /* Add64(expr,i) */
2000   if (e->tag == Iex_Binop
2001       && e->Iex.Binop.op == Iop_Add64
2002       && e->Iex.Binop.arg2->tag == Iex_Const
2003       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2004       && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2005      HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2006      return AMD64AMode_IR(
2007                toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2008                r1
2009             );
2010   }
2011
2012   /* Doesn't match anything in particular.  Generate it into
2013      a register and use that. */
2014   {
2015      HReg r1 = iselIntExpr_R(env, e);
2016      return AMD64AMode_IR(0, r1);
2017   }
2018}
2019
2020
2021/* --------------------- RMIs --------------------- */
2022
2023/* Similarly, calculate an expression into an AMD64RMI operand.  As with
2024   iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits.  */
2025
2026static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2027{
2028   AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2029   /* sanity checks ... */
2030   switch (rmi->tag) {
2031      case Armi_Imm:
2032         return rmi;
2033      case Armi_Reg:
2034         vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2035         vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2036         return rmi;
2037      case Armi_Mem:
2038         vassert(sane_AMode(rmi->Armi.Mem.am));
2039         return rmi;
2040      default:
2041         vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2042   }
2043}
2044
2045/* DO NOT CALL THIS DIRECTLY ! */
2046static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2047{
2048   IRType ty = typeOfIRExpr(env->type_env,e);
2049   vassert(ty == Ity_I64 || ty == Ity_I32
2050           || ty == Ity_I16 || ty == Ity_I8);
2051
2052   /* special case: immediate 64/32/16/8 */
2053   if (e->tag == Iex_Const) {
2054      switch (e->Iex.Const.con->tag) {
2055         case Ico_U64:
2056            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2057               return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2058            }
2059            break;
2060         case Ico_U32:
2061            return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32);
2062         case Ico_U16:
2063            return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2064         case Ico_U8:
2065            return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2066         default:
2067            vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2068      }
2069   }
2070
2071   /* special case: 64-bit GET */
2072   if (e->tag == Iex_Get && ty == Ity_I64) {
2073      return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2074                                        hregAMD64_RBP()));
2075   }
2076
2077   /* special case: 64-bit load from memory */
2078   if (e->tag == Iex_Load && ty == Ity_I64
2079       && e->Iex.Load.end == Iend_LE) {
2080      AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2081      return AMD64RMI_Mem(am);
2082   }
2083
2084   /* default case: calculate into a register and return that */
2085   {
2086      HReg r = iselIntExpr_R ( env, e );
2087      return AMD64RMI_Reg(r);
2088   }
2089}
2090
2091
2092/* --------------------- RIs --------------------- */
2093
2094/* Calculate an expression into an AMD64RI operand.  As with
2095   iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2096   bits. */
2097
2098static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2099{
2100   AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2101   /* sanity checks ... */
2102   switch (ri->tag) {
2103      case Ari_Imm:
2104         return ri;
2105      case Ari_Reg:
2106         vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2107         vassert(hregIsVirtual(ri->Ari.Reg.reg));
2108         return ri;
2109      default:
2110         vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2111   }
2112}
2113
2114/* DO NOT CALL THIS DIRECTLY ! */
2115static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2116{
2117   IRType ty = typeOfIRExpr(env->type_env,e);
2118   vassert(ty == Ity_I64 || ty == Ity_I32
2119           || ty == Ity_I16 || ty == Ity_I8);
2120
2121   /* special case: immediate */
2122   if (e->tag == Iex_Const) {
2123      switch (e->Iex.Const.con->tag) {
2124         case Ico_U64:
2125            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2126               return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2127            }
2128            break;
2129         case Ico_U32:
2130            return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2131         case Ico_U16:
2132            return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2133         case Ico_U8:
2134            return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2135         default:
2136            vpanic("iselIntExpr_RI.Iex_Const(amd64)");
2137      }
2138   }
2139
2140   /* default case: calculate into a register and return that */
2141   {
2142      HReg r = iselIntExpr_R ( env, e );
2143      return AMD64RI_Reg(r);
2144   }
2145}
2146
2147
2148/* --------------------- RMs --------------------- */
2149
2150/* Similarly, calculate an expression into an AMD64RM operand.  As
2151   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2152   bits.  */
2153
2154static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2155{
2156   AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2157   /* sanity checks ... */
2158   switch (rm->tag) {
2159      case Arm_Reg:
2160         vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2161         vassert(hregIsVirtual(rm->Arm.Reg.reg));
2162         return rm;
2163      case Arm_Mem:
2164         vassert(sane_AMode(rm->Arm.Mem.am));
2165         return rm;
2166      default:
2167         vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2168   }
2169}
2170
2171/* DO NOT CALL THIS DIRECTLY ! */
2172static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2173{
2174   IRType ty = typeOfIRExpr(env->type_env,e);
2175   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2176
2177   /* special case: 64-bit GET */
2178   if (e->tag == Iex_Get && ty == Ity_I64) {
2179      return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2180                                       hregAMD64_RBP()));
2181   }
2182
2183   /* special case: load from memory -- not handled here; such expressions fall through to the default case below. */
2184
2185   /* default case: calculate into a register and return that */
2186   {
2187      HReg r = iselIntExpr_R ( env, e );
2188      return AMD64RM_Reg(r);
2189   }
2190}
2191
2192
2193/* --------------------- CONDCODE --------------------- */
2194
2195/* Generate code to evaluate a bit-typed expression, returning the
2196   condition code which corresponds to the expression notionally
2197   having evaluated to 1. */
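/* For example, the CmpEQ64(x,y) case below emits a 64-bit compare of the
   two operands (Alu64R with Aalu_CMP) and returns Acc_Z; the caller then
   folds that condition code into whatever conditional instruction it is
   constructing. */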
2198
2199static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2200{
2201   /* Uh, there's nothing we can sanity check here, unfortunately. */
2202   return iselCondCode_wrk(env,e);
2203}
2204
2205/* DO NOT CALL THIS DIRECTLY ! */
2206static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2207{
2208   MatchInfo mi;
2209
2210   vassert(e);
2211   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2212
2213   /* var */
2214   if (e->tag == Iex_RdTmp) {
2215      HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2216      HReg dst = newVRegI(env);
2217      addInstr(env, mk_iMOVsd_RR(r64,dst));
2218      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2219      return Acc_NZ;
2220   }
2221
2222   /* Constant 1:Bit */
2223   if (e->tag == Iex_Const) {
2224      HReg r;
2225      vassert(e->Iex.Const.con->tag == Ico_U1);
2226      vassert(e->Iex.Const.con->Ico.U1 == True
2227              || e->Iex.Const.con->Ico.U1 == False);
2228      r = newVRegI(env);
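      /* mov $0 to r (presumably so that r is written before the xor
         reads it), then xor r,r: the value is unchanged but ZF is set,
         so Acc_Z reads as "always true" and Acc_NZ as "always false". */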
2229      addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2230      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2231      return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2232   }
2233
2234   /* Not1(...) */
2235   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2236      /* Generate code for the arg, and negate the test condition */
2237      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2238   }
2239
2240   /* --- patterns rooted at: 64to1 --- */
2241
2242   /* 64to1 */
2243   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2244      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2245      addInstr(env, AMD64Instr_Test64(1,reg));
2246      return Acc_NZ;
2247   }
2248
2249   /* --- patterns rooted at: 32to1 --- */
2250
2251   /* 32to1 */
2252   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2253      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2254      addInstr(env, AMD64Instr_Test64(1,reg));
2255      return Acc_NZ;
2256   }
2257
2258   /* --- patterns rooted at: CmpNEZ8 --- */
2259
2260   /* CmpNEZ8(x) */
2261   if (e->tag == Iex_Unop
2262       && e->Iex.Unop.op == Iop_CmpNEZ8) {
2263      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2264      addInstr(env, AMD64Instr_Test64(0xFF,r));
2265      return Acc_NZ;
2266   }
2267
2268   /* --- patterns rooted at: CmpNEZ16 --- */
2269
2270   /* CmpNEZ16(x) */
2271   if (e->tag == Iex_Unop
2272       && e->Iex.Unop.op == Iop_CmpNEZ16) {
2273      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2274      addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2275      return Acc_NZ;
2276   }
2277
2278   /* --- patterns rooted at: CmpNEZ32 --- */
2279
2280   /* CmpNEZ32(x) */
2281   if (e->tag == Iex_Unop
2282       && e->Iex.Unop.op == Iop_CmpNEZ32) {
2283      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2284      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2285      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2286      return Acc_NZ;
2287   }
2288
2289   /* --- patterns rooted at: CmpNEZ64 --- */
2290
2291   /* CmpNEZ64(Or64(x,y)) */
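   /* OR the two values together and test the flags: the OR clears ZF
      iff at least one of x, y is nonzero, which is exactly
      CmpNEZ64(Or64(x,y)). */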
2292   {
2293      DECLARE_PATTERN(p_CmpNEZ64_Or64);
2294      DEFINE_PATTERN(p_CmpNEZ64_Or64,
2295                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
2296      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
2297         HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
2298         AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
2299         HReg      tmp  = newVRegI(env);
2300         addInstr(env, mk_iMOVsd_RR(r0, tmp));
2301         addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
2302         return Acc_NZ;
2303      }
2304   }
2305
2306   /* CmpNEZ64(x) */
2307   if (e->tag == Iex_Unop
2308       && e->Iex.Unop.op == Iop_CmpNEZ64) {
2309      HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
2310      AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2311      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2312      return Acc_NZ;
2313   }
2314
2315   /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2316
2317   /* CmpEQ8 / CmpNE8 */
2318   if (e->tag == Iex_Binop
2319       && (e->Iex.Binop.op == Iop_CmpEQ8
2320           || e->Iex.Binop.op == Iop_CmpNE8
2321           || e->Iex.Binop.op == Iop_CasCmpEQ8
2322           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2323      if (isZeroU8(e->Iex.Binop.arg2)) {
2324         HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2325         addInstr(env, AMD64Instr_Test64(0xFF,r1));
2326         switch (e->Iex.Binop.op) {
2327            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2328            case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2329            default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2330         }
2331      } else {
2332         HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2333         AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2334         HReg      r    = newVRegI(env);
2335         addInstr(env, mk_iMOVsd_RR(r1,r));
2336         addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2337         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2338         switch (e->Iex.Binop.op) {
2339            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2340            case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2341            default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2342         }
2343      }
2344   }
2345
2346   /* CmpEQ16 / CmpNE16 */
2347   if (e->tag == Iex_Binop
2348       && (e->Iex.Binop.op == Iop_CmpEQ16
2349           || e->Iex.Binop.op == Iop_CmpNE16
2350           || e->Iex.Binop.op == Iop_CasCmpEQ16
2351           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2352      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2353      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2354      HReg      r    = newVRegI(env);
2355      addInstr(env, mk_iMOVsd_RR(r1,r));
2356      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2357      addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2358      switch (e->Iex.Binop.op) {
2359         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2360         case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2361         default: vpanic("iselCondCode(amd64): CmpXX16");
2362      }
2363   }
2364
2365   /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2366      Saves a "movq %rax, %tmp" compared to the default route. */
2367   if (e->tag == Iex_Binop
2368       && e->Iex.Binop.op == Iop_CmpNE64
2369       && e->Iex.Binop.arg1->tag == Iex_CCall
2370       && e->Iex.Binop.arg2->tag == Iex_Const) {
2371      IRExpr* cal = e->Iex.Binop.arg1;
2372      IRExpr* con = e->Iex.Binop.arg2;
2373      HReg    tmp = newVRegI(env);
2374      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2375      vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2376      vassert(con->Iex.Const.con->tag == Ico_U64);
2377      /* Marshal args, do the call. */
2378      UInt   addToSp = 0;
2379      RetLoc rloc    = mk_RetLoc_INVALID();
2380      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2381                    cal->Iex.CCall.cee,
2382                    cal->Iex.CCall.retty, cal->Iex.CCall.args );
2383      vassert(is_sane_RetLoc(rloc));
2384      vassert(rloc.pri == RLPri_Int);
2385      vassert(addToSp == 0);
2386      /* */
2387      addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2388      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2389                                      AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2390      return Acc_NZ;
2391   }
2392
2393   /* Cmp*64*(x,y) */
2394   if (e->tag == Iex_Binop
2395       && (e->Iex.Binop.op == Iop_CmpEQ64
2396           || e->Iex.Binop.op == Iop_CmpNE64
2397           || e->Iex.Binop.op == Iop_CmpLT64S
2398           || e->Iex.Binop.op == Iop_CmpLT64U
2399           || e->Iex.Binop.op == Iop_CmpLE64S
2400           || e->Iex.Binop.op == Iop_CmpLE64U
2401           || e->Iex.Binop.op == Iop_CasCmpEQ64
2402           || e->Iex.Binop.op == Iop_CasCmpNE64
2403           || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2404      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2405      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2406      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2407      switch (e->Iex.Binop.op) {
2408         case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2409         case Iop_CmpNE64:
2410         case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2411         case Iop_CmpLT64S: return Acc_L;
2412         case Iop_CmpLT64U: return Acc_B;
2413         case Iop_CmpLE64S: return Acc_LE;
2414         case Iop_CmpLE64U: return Acc_BE;
2415         default: vpanic("iselCondCode(amd64): CmpXX64");
2416      }
2417   }
2418
2419   /* Cmp*32*(x,y) */
2420   if (e->tag == Iex_Binop
2421       && (e->Iex.Binop.op == Iop_CmpEQ32
2422           || e->Iex.Binop.op == Iop_CmpNE32
2423           || e->Iex.Binop.op == Iop_CmpLT32S
2424           || e->Iex.Binop.op == Iop_CmpLT32U
2425           || e->Iex.Binop.op == Iop_CmpLE32S
2426           || e->Iex.Binop.op == Iop_CmpLE32U
2427           || e->Iex.Binop.op == Iop_CasCmpEQ32
2428           || e->Iex.Binop.op == Iop_CasCmpNE32
2429           || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2430      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2431      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2432      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2433      switch (e->Iex.Binop.op) {
2434         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2435         case Iop_CmpNE32:
2436         case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2437         case Iop_CmpLT32S: return Acc_L;
2438         case Iop_CmpLT32U: return Acc_B;
2439         case Iop_CmpLE32S: return Acc_LE;
2440         case Iop_CmpLE32U: return Acc_BE;
2441         default: vpanic("iselCondCode(amd64): CmpXX32");
2442      }
2443   }
2444
2445   ppIRExpr(e);
2446   vpanic("iselCondCode(amd64)");
2447}
2448
2449
2450/*---------------------------------------------------------*/
2451/*--- ISEL: Integer expressions (128 bit)               ---*/
2452/*---------------------------------------------------------*/
2453
2454/* Compute a 128-bit value into a register pair, which is returned as
2455   the first two parameters.  As with iselIntExpr_R, both regs will
2456   be virtual, and they must not be changed by subsequent code
2457   emitted by the caller.  */
2458
2459static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2460                             ISelEnv* env, const IRExpr* e )
2461{
2462   iselInt128Expr_wrk(rHi, rLo, env, e);
2463#  if 0
2464   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2465#  endif
2466   vassert(hregClass(*rHi) == HRcInt64);
2467   vassert(hregIsVirtual(*rHi));
2468   vassert(hregClass(*rLo) == HRcInt64);
2469   vassert(hregIsVirtual(*rLo));
2470}
2471
2472/* DO NOT CALL THIS DIRECTLY ! */
2473static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2474                                 ISelEnv* env, const IRExpr* e )
2475{
2476   vassert(e);
2477   vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2478
2479   /* read 128-bit IRTemp */
2480   if (e->tag == Iex_RdTmp) {
2481      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2482      return;
2483   }
2484
2485   /* --------- BINARY ops --------- */
2486   if (e->tag == Iex_Binop) {
2487      switch (e->Iex.Binop.op) {
2488         /* 64 x 64 -> 128 multiply */
2489         case Iop_MullU64:
2490         case Iop_MullS64: {
2491            /* Get one operand into %rax, and the other into an R/M.
2492               We have to make an educated guess about which operand
2493               goes where. */
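            /* (mulq/imulq implicitly use %rax as one operand and write
               the 128-bit product to %rdx:%rax, hence the fixed-register
               moves below.) */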
2494            HReg     tLo    = newVRegI(env);
2495            HReg     tHi    = newVRegI(env);
2496            Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
2497            AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2498            HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2499            addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2500            addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2501            /* Result is now in RDX:RAX.  Tell the caller. */
2502            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2503            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2504            *rHi = tHi;
2505            *rLo = tLo;
2506            return;
2507         }
2508
2509         /* 128 x 64 -> (64(rem),64(div)) division */
2510         case Iop_DivModU128to64:
2511         case Iop_DivModS128to64: {
2512            /* Get the 128-bit operand into rdx:rax, and the other into
2513               any old R/M. */
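            /* (divq/idivq divide %rdx:%rax by the R/M operand, leaving
               the quotient in %rax and the remainder in %rdx -- which is
               exactly the (rem,div) pair this IR op produces.) */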
2514            HReg sHi, sLo;
2515            HReg     tLo     = newVRegI(env);
2516            HReg     tHi     = newVRegI(env);
2517            Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2518            AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2519            iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2520            addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2521            addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2522            addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2523            addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2524            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2525            *rHi = tHi;
2526            *rLo = tLo;
2527            return;
2528         }
2529
2530         /* 64HLto128(e1,e2) */
2531         case Iop_64HLto128:
2532            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2533            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2534            return;
2535
2536         default:
2537            break;
2538      }
2539   } /* if (e->tag == Iex_Binop) */
2540
2541   ppIRExpr(e);
2542   vpanic("iselInt128Expr");
2543}
2544
2545
2546/*---------------------------------------------------------*/
2547/*--- ISEL: Floating point expressions (32 bit)         ---*/
2548/*---------------------------------------------------------*/
2549
2550/* Nothing interesting here; really just wrappers for
2551   64-bit stuff. */
2552
2553static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2554{
2555   HReg r = iselFltExpr_wrk( env, e );
2556#  if 0
2557   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2558#  endif
2559   vassert(hregClass(r) == HRcVec128);
2560   vassert(hregIsVirtual(r));
2561   return r;
2562}
2563
2564/* DO NOT CALL THIS DIRECTLY */
2565static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2566{
2567   IRType ty = typeOfIRExpr(env->type_env,e);
2568   vassert(ty == Ity_F32);
2569
2570   if (e->tag == Iex_RdTmp) {
2571      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2572   }
2573
2574   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2575      AMD64AMode* am;
2576      HReg res = newVRegV(env);
2577      vassert(e->Iex.Load.ty == Ity_F32);
2578      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2579      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2580      return res;
2581   }
2582
2583   if (e->tag == Iex_Binop
2584       && e->Iex.Binop.op == Iop_F64toF32) {
2585      /* Although the result is still held in a standard SSE register,
2586         we need to round it to reflect the loss of accuracy/range
2587         entailed in casting it to a 32-bit float. */
2588      HReg dst = newVRegV(env);
2589      HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2590      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2591      addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2592      set_SSE_rounding_default( env );
2593      return dst;
2594   }
2595
2596   if (e->tag == Iex_Get) {
2597      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2598                                       hregAMD64_RBP() );
2599      HReg res = newVRegV(env);
2600      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2601      return res;
2602   }
2603
2604   if (e->tag == Iex_Unop
2605       && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2606       /* Given an I32, produce an IEEE754 float with the same bit
2607          pattern. */
2608       HReg        dst    = newVRegV(env);
2609       HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
2610       AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2611       addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2612       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2613       return dst;
2614   }
2615
2616   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2617      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2618      HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
2619      HReg        dst    = newVRegV(env);
2620
2621      /* arg now holds the value to be rounded.  The first thing to do
2622         is set the FPU's rounding mode accordingly. */
2623
2624      /* Set host x87 rounding mode */
2625      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2626
2627      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2628      addInstr(env, AMD64Instr_A87Free(1));
2629      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2630      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2631      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2632      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2633
2634      /* Restore default x87 rounding. */
2635      set_FPU_rounding_default( env );
2636
2637      return dst;
2638   }
2639
2640   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2641      /* Sigh ... very rough code.  Could do much better. */
2642      /* Get the 128-bit literal 00---0 10---0 into a register
2643         and xor it with the value to be negated. */
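      /* Only bit 31 of that literal is set, so the xor flips just the
         sign bit of the F32 in lane 0 and leaves the other lanes equal
         to the source. */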
2644      HReg r1  = newVRegI(env);
2645      HReg dst = newVRegV(env);
2646      HReg tmp = newVRegV(env);
2647      HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2648      AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2649      addInstr(env, mk_vMOVsd_RR(src,tmp));
2650      addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2651      addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2652      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2653      addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2654      addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2655      add_to_rsp(env, 16);
2656      return dst;
2657   }
2658
2659   if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2660      IRQop *qop = e->Iex.Qop.details;
2661      HReg dst  = newVRegV(env);
2662      HReg argX = iselFltExpr(env, qop->arg2);
2663      HReg argY = iselFltExpr(env, qop->arg3);
2664      HReg argZ = iselFltExpr(env, qop->arg4);
2665      /* XXXROUNDINGFIXME */
2666      /* set roundingmode here */
2667      /* subq $16, %rsp         -- make a space*/
2668      sub_from_rsp(env, 16);
2669      /* Prepare 4 arg regs:
2670         leaq 0(%rsp), %rdi
2671         leaq 4(%rsp), %rsi
2672         leaq 8(%rsp), %rdx
2673         leaq 12(%rsp), %rcx
2674      */
2675      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2676                                     hregAMD64_RDI()));
2677      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2678                                     hregAMD64_RSI()));
2679      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2680                                     hregAMD64_RDX()));
2681      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2682                                     hregAMD64_RCX()));
2683      /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2684         movss  %argX, 0(%rsi)
2685         movss  %argY, 0(%rdx)
2686         movss  %argZ, 0(%rcx)
2687         */
2688      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2689                                       AMD64AMode_IR(0, hregAMD64_RSI())));
2690      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2691                                       AMD64AMode_IR(0, hregAMD64_RDX())));
2692      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2693                                       AMD64AMode_IR(0, hregAMD64_RCX())));
2694      /* call the helper */
2695      addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2696                                     (ULong)(HWord)h_generic_calc_MAddF32,
2697                                     4, mk_RetLoc_simple(RLPri_None) ));
2698      /* fetch the result back from 0(%rsp), where the helper will
2699         have written it. */
2700      addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2701                                       AMD64AMode_IR(0, hregAMD64_RSP())));
2702      /* and finally, clear the space */
2703      add_to_rsp(env, 16);
2704      return dst;
2705   }
2706
2707   ppIRExpr(e);
2708   vpanic("iselFltExpr_wrk");
2709}
2710
2711
2712/*---------------------------------------------------------*/
2713/*--- ISEL: Floating point expressions (64 bit)         ---*/
2714/*---------------------------------------------------------*/
2715
2716/* Compute a 64-bit floating point value into the lower half of an xmm
2717   register, the identity of which is returned.  As with
2718   iselIntExpr_R, the returned reg will be virtual, and it must not be
2719   changed by subsequent code emitted by the caller.
2720*/
2721
2722/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2723
2724    Type                  S (1 bit)   E (11 bits)   F (52 bits)
2725    ----                  ---------   -----------   -----------
2726    signalling NaN        u           2047 (max)    .0uuuuu---u
2727                                                    (with at least
2728                                                     one 1 bit)
2729    quiet NaN             u           2047 (max)    .1uuuuu---u
2730
2731    negative infinity     1           2047 (max)    .000000---0
2732
2733    positive infinity     0           2047 (max)    .000000---0
2734
2735    negative zero         1           0             .000000---0
2736
2737    positive zero         0           0             .000000---0
2738*/
2739
2740static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2741{
2742   HReg r = iselDblExpr_wrk( env, e );
2743#  if 0
2744   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2745#  endif
2746   vassert(hregClass(r) == HRcVec128);
2747   vassert(hregIsVirtual(r));
2748   return r;
2749}
2750
2751/* DO NOT CALL THIS DIRECTLY */
2752static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2753{
2754   IRType ty = typeOfIRExpr(env->type_env,e);
2755   vassert(e);
2756   vassert(ty == Ity_F64);
2757
2758   if (e->tag == Iex_RdTmp) {
2759      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2760   }
2761
2762   if (e->tag == Iex_Const) {
2763      union { ULong u64; Double f64; } u;
2764      HReg res = newVRegV(env);
2765      HReg tmp = newVRegI(env);
2766      vassert(sizeof(u) == 8);
2767      vassert(sizeof(u.u64) == 8);
2768      vassert(sizeof(u.f64) == 8);
2769
2770      if (e->Iex.Const.con->tag == Ico_F64) {
2771         u.f64 = e->Iex.Const.con->Ico.F64;
2772      }
2773      else if (e->Iex.Const.con->tag == Ico_F64i) {
2774         u.u64 = e->Iex.Const.con->Ico.F64i;
2775      }
2776      else
2777         vpanic("iselDblExpr(amd64): const");
2778
2779      addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2780      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2781      addInstr(env, AMD64Instr_SseLdSt(
2782                       True/*load*/, 8, res,
2783                       AMD64AMode_IR(0, hregAMD64_RSP())
2784              ));
2785      add_to_rsp(env, 8);
2786      return res;
2787   }
2788
2789   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2790      AMD64AMode* am;
2791      HReg res = newVRegV(env);
2792      vassert(e->Iex.Load.ty == Ity_F64);
2793      am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2794      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2795      return res;
2796   }
2797
2798   if (e->tag == Iex_Get) {
2799      AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2800                                      hregAMD64_RBP() );
2801      HReg res = newVRegV(env);
2802      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2803      return res;
2804   }
2805
2806   if (e->tag == Iex_GetI) {
2807      AMD64AMode* am
2808         = genGuestArrayOffset(
2809              env, e->Iex.GetI.descr,
2810                   e->Iex.GetI.ix, e->Iex.GetI.bias );
2811      HReg res = newVRegV(env);
2812      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2813      return res;
2814   }
2815
2816   if (e->tag == Iex_Triop) {
2817      IRTriop *triop = e->Iex.Triop.details;
2818      AMD64SseOp op = Asse_INVALID;
2819      switch (triop->op) {
2820         case Iop_AddF64: op = Asse_ADDF; break;
2821         case Iop_SubF64: op = Asse_SUBF; break;
2822         case Iop_MulF64: op = Asse_MULF; break;
2823         case Iop_DivF64: op = Asse_DIVF; break;
2824         default: break;
2825      }
2826      if (op != Asse_INVALID) {
2827         HReg dst  = newVRegV(env);
2828         HReg argL = iselDblExpr(env, triop->arg2);
2829         HReg argR = iselDblExpr(env, triop->arg3);
2830         addInstr(env, mk_vMOVsd_RR(argL, dst));
2831         /* XXXROUNDINGFIXME */
2832         /* set roundingmode here */
2833         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2834         return dst;
2835      }
2836   }
2837
2838   if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2839      IRQop *qop = e->Iex.Qop.details;
2840      HReg dst  = newVRegV(env);
2841      HReg argX = iselDblExpr(env, qop->arg2);
2842      HReg argY = iselDblExpr(env, qop->arg3);
2843      HReg argZ = iselDblExpr(env, qop->arg4);
2844      /* XXXROUNDINGFIXME */
2845      /* set roundingmode here */
2846      /* subq $32, %rsp         -- make a space*/
2847      sub_from_rsp(env, 32);
2848      /* Prepare 4 arg regs:
2849         leaq 0(%rsp), %rdi
2850         leaq 8(%rsp), %rsi
2851         leaq 16(%rsp), %rdx
2852         leaq 24(%rsp), %rcx
2853      */
2854      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2855                                     hregAMD64_RDI()));
2856      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2857                                     hregAMD64_RSI()));
2858      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2859                                     hregAMD64_RDX()));
2860      addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2861                                     hregAMD64_RCX()));
2862      /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2863         movsd  %argX, 0(%rsi)
2864         movsd  %argY, 0(%rdx)
2865         movsd  %argZ, 0(%rcx)
2866         */
2867      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2868                                       AMD64AMode_IR(0, hregAMD64_RSI())));
2869      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2870                                       AMD64AMode_IR(0, hregAMD64_RDX())));
2871      addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2872                                       AMD64AMode_IR(0, hregAMD64_RCX())));
2873      /* call the helper */
2874      addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2875                                     (ULong)(HWord)h_generic_calc_MAddF64,
2876                                     4, mk_RetLoc_simple(RLPri_None) ));
      /* fetch the result from 0(%rsp), where the helper wrote it */
2879      addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2880                                       AMD64AMode_IR(0, hregAMD64_RSP())));
2881      /* and finally, clear the space */
2882      add_to_rsp(env, 32);
2883      return dst;
2884   }
2885
2886   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2887      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2888      HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
2889      HReg        dst    = newVRegV(env);
2890
      /* arg now holds the value to be rounded.  The first thing to do
         is set the FPU's rounding mode accordingly. */
2893
2894      /* Set host x87 rounding mode */
2895      set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2896
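      /* Round via the x87: spill arg to -8(%rsp), push it onto the
         x87 stack, apply Afp_ROUND (frndint), pop it back to memory
         and reload it into an XMM register. */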
2897      addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2898      addInstr(env, AMD64Instr_A87Free(1));
2899      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2900      addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2901      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2902      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2903
2904      /* Restore default x87 rounding. */
2905      set_FPU_rounding_default( env );
2906
2907      return dst;
2908   }
2909
   if (e->tag == Iex_Triop
       && (e->Iex.Triop.details->op == Iop_ScaleF64
           || e->Iex.Triop.details->op == Iop_AtanF64
           || e->Iex.Triop.details->op == Iop_Yl2xF64
           || e->Iex.Triop.details->op == Iop_Yl2xp1F64
           || e->Iex.Triop.details->op == Iop_PRemF64
           || e->Iex.Triop.details->op == Iop_PRem1F64)
      ) {
      IRTriop *triop = e->Iex.Triop.details;
2919      AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2920      HReg        arg1   = iselDblExpr(env, triop->arg2);
2921      HReg        arg2   = iselDblExpr(env, triop->arg3);
2922      HReg        dst    = newVRegV(env);
2923      Bool     arg2first = toBool(triop->op == Iop_ScaleF64
2924                                  || triop->op == Iop_PRemF64
2925                                  || triop->op == Iop_PRem1F64);
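      /* The two args are pushed in whichever order leaves them in the
         ST(0)/ST(1) positions expected by the x87 instruction used
         below. */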
2926      addInstr(env, AMD64Instr_A87Free(2));
2927
2928      /* one arg -> top of x87 stack */
2929      addInstr(env, AMD64Instr_SseLdSt(
2930                       False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
2931      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2932
2933      /* other arg -> top of x87 stack */
2934      addInstr(env, AMD64Instr_SseLdSt(
2935                       False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
2936      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2937
2938      /* do it */
2939      /* XXXROUNDINGFIXME */
2940      /* set roundingmode here */
2941      switch (triop->op) {
2942         case Iop_ScaleF64:
2943            addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
2944            break;
2945         case Iop_AtanF64:
2946            addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
2947            break;
2948         case Iop_Yl2xF64:
2949            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
2950            break;
2951         case Iop_Yl2xp1F64:
2952            addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
2953            break;
2954         case Iop_PRemF64:
2955            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
2956            break;
2957         case Iop_PRem1F64:
2958            addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
2959            break;
2960         default:
2961            vassert(0);
2962      }
2963
2964      /* save result */
2965      addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2966      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2967      return dst;
2968   }
2969
2970   if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
2971      HReg dst = newVRegV(env);
2972      HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
2973      set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2974      addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
2975      set_SSE_rounding_default( env );
2976      return dst;
2977   }
2978
2979   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
2980      HReg dst = newVRegV(env);
2981      HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2982      set_SSE_rounding_default( env );
2983      addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
2984      return dst;
2985   }
2986
2987   if (e->tag == Iex_Unop
2988       && (e->Iex.Unop.op == Iop_NegF64
2989           || e->Iex.Unop.op == Iop_AbsF64)) {
      /* Sigh ... very rough code.  Could do much better. */
      /* Get the 128-bit literal 00---0 10---0 into a register
         and xor (NegF64) or and-not (AbsF64) it with the argument. */
2993      HReg r1  = newVRegI(env);
2994      HReg dst = newVRegV(env);
2995      HReg tmp = newVRegV(env);
2996      HReg src = iselDblExpr(env, e->Iex.Unop.arg);
2997      AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2998      addInstr(env, mk_vMOVsd_RR(src,tmp));
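      /* Build the 128-bit constant 0:(1<<63) at (%rsp) and load it
         into dst; xor-ing with it flips the sign bit of the low lane
         (NegF64), and-not-ing with it clears the sign bit (AbsF64). */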
2999      addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3000      addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3001      addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3002      addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3003
3004      if (e->Iex.Unop.op == Iop_NegF64)
3005         addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3006      else
3007         addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3008
3009      add_to_rsp(env, 16);
3010      return dst;
3011   }
3012
3013   if (e->tag == Iex_Binop) {
3014      A87FpOp fpop = Afp_INVALID;
3015      switch (e->Iex.Binop.op) {
3016         case Iop_SqrtF64: fpop = Afp_SQRT; break;
3017         case Iop_SinF64:  fpop = Afp_SIN;  break;
3018         case Iop_CosF64:  fpop = Afp_COS;  break;
3019         case Iop_TanF64:  fpop = Afp_TAN;  break;
3020         case Iop_2xm1F64: fpop = Afp_2XM1; break;
3021         default: break;
3022      }
3023      if (fpop != Afp_INVALID) {
3024         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3025         HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
3026         HReg        dst    = newVRegV(env);
3027         Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
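         /* fptan pushes 1.0 onto the x87 stack after computing the
            tangent, so it needs two free slots rather than one. */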
3028         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3029         addInstr(env, AMD64Instr_A87Free(nNeeded));
3030         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3031         /* XXXROUNDINGFIXME */
3032         /* set roundingmode here */
         /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
            codes.  That doesn't matter here, since this instruction
            selector never generates such an instruction between a
            flag-setting instruction and a flag-using one. */
3038         addInstr(env, AMD64Instr_A87FpOp(fpop));
3039         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3040         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3041         return dst;
3042      }
3043   }
3044
3045   if (e->tag == Iex_Unop) {
3046      switch (e->Iex.Unop.op) {
3047//..          case Iop_I32toF64: {
3048//..             HReg dst = newVRegF(env);
3049//..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3050//..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3051//..             set_FPU_rounding_default(env);
3052//..             addInstr(env, X86Instr_FpLdStI(
3053//..                              True/*load*/, 4, dst,
3054//..                              X86AMode_IR(0, hregX86_ESP())));
3055//..             add_to_esp(env, 4);
3056//..             return dst;
3057//..          }
3058         case Iop_ReinterpI64asF64: {
3059            /* Given an I64, produce an IEEE754 double with the same
3060               bit pattern. */
3061            AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3062            HReg        dst    = newVRegV(env);
3063            AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
3064            /* paranoia */
3065            set_SSE_rounding_default(env);
3066            addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3067            addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3068            return dst;
3069         }
3070         case Iop_F32toF64: {
3071            HReg f32;
3072            HReg f64 = newVRegV(env);
3073            /* this shouldn't be necessary, but be paranoid ... */
3074            set_SSE_rounding_default(env);
3075            f32 = iselFltExpr(env, e->Iex.Unop.arg);
3076            addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3077            return f64;
3078         }
3079         default:
3080            break;
3081      }
3082   }
3083
3084   /* --------- MULTIPLEX --------- */
3085   if (e->tag == Iex_ITE) { // VFD
3086      HReg r1, r0, dst;
3087      vassert(ty == Ity_F64);
3088      vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3089      r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
3090      r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
3091      dst = newVRegV(env);
3092      addInstr(env, mk_vMOVsd_RR(r1,dst));
3093      AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
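      /* cc ^ 1 is the negation of the condition: dst starts out as
         'iftrue' and is overwritten with 'iffalse' when the condition
         is false. */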
3094      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3095      return dst;
3096   }
3097
3098   ppIRExpr(e);
3099   vpanic("iselDblExpr_wrk");
3100}
3101
3102
3103/*---------------------------------------------------------*/
3104/*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3105/*---------------------------------------------------------*/
3106
3107static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3108{
3109   HReg r = iselVecExpr_wrk( env, e );
3110#  if 0
3111   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3112#  endif
3113   vassert(hregClass(r) == HRcVec128);
3114   vassert(hregIsVirtual(r));
3115   return r;
3116}
3117
3118
3119/* DO NOT CALL THIS DIRECTLY */
3120static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3121{
3122   HWord      fn = 0; /* address of helper fn, if required */
3123   Bool       arg1isEReg = False;
3124   AMD64SseOp op = Asse_INVALID;
3125   IRType     ty = typeOfIRExpr(env->type_env,e);
3126   vassert(e);
3127   vassert(ty == Ity_V128);
3128
3129   if (e->tag == Iex_RdTmp) {
3130      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3131   }
3132
3133   if (e->tag == Iex_Get) {
3134      HReg dst = newVRegV(env);
3135      addInstr(env, AMD64Instr_SseLdSt(
3136                       True/*load*/,
3137                       16,
3138                       dst,
3139                       AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3140                    )
3141              );
3142      return dst;
3143   }
3144
3145   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3146      HReg        dst = newVRegV(env);
3147      AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3148      addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3149      return dst;
3150   }
3151
3152   if (e->tag == Iex_Const) {
3153      HReg dst = newVRegV(env);
3154      vassert(e->Iex.Const.con->tag == Ico_V128);
3155      switch (e->Iex.Const.con->Ico.V128) {
3156         case 0x0000:
3157            dst = generate_zeroes_V128(env);
3158            break;
3159         case 0xFFFF:
3160            dst = generate_ones_V128(env);
3161            break;
3162         default: {
3163            AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
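            /* An Ico_V128 constant is a 16-bit mask, one bit per byte
               of the vector; bitmask8_to_bytemask64 expands each 8-bit
               half into a 64-bit pattern of 0x00/0xFF bytes. */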
3164            /* do push_uimm64 twice, first time for the high-order half. */
3165            push_uimm64(env, bitmask8_to_bytemask64(
3166                                (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3167                       ));
3168            push_uimm64(env, bitmask8_to_bytemask64(
3169                                (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3170                       ));
3171            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3172            add_to_rsp(env, 16);
3173            break;
3174         }
3175      }
3176      return dst;
3177   }
3178
3179   if (e->tag == Iex_Unop) {
3180   switch (e->Iex.Unop.op) {
3181
3182      case Iop_NotV128: {
3183         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3184         return do_sse_NotV128(env, arg);
3185      }
3186
3187      case Iop_CmpNEZ64x2: {
3188         /* We can use SSE2 instructions for this. */
3189         /* Ideally, we want to do a 64Ix2 comparison against zero of
3190            the operand.  Problem is no such insn exists.  Solution
3191            therefore is to do a 32Ix4 comparison instead, and bitwise-
3192            negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3193            let the not'd result of this initial comparison be a:b:c:d.
3194            What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3195            pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3196            giving the required result.
3197
3198            The required selection sequence is 2,3,0,1, which
3199            according to Intel's documentation means the pshufd
3200            literal value is 0xB1, that is,
3201            (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3202         */
3203         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3204         HReg tmp  = generate_zeroes_V128(env);
3205         HReg dst  = newVRegV(env);
3206         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3207         tmp = do_sse_NotV128(env, tmp);
3208         addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3209         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3210         return dst;
3211      }
3212
3213      case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3214      case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3215      case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3216      do_CmpNEZ_vector:
3217      {
3218         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3219         HReg tmp  = newVRegV(env);
3220         HReg zero = generate_zeroes_V128(env);
3221         HReg dst;
3222         addInstr(env, mk_vMOVsd_RR(arg, tmp));
3223         addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3224         dst = do_sse_NotV128(env, tmp);
3225         return dst;
3226      }
3227
3228      case Iop_RecipEst32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
3229      case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3230      do_32Fx4_unary:
3231      {
3232         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3233         HReg dst = newVRegV(env);
3234         addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3235         return dst;
3236      }
3237
3238      case Iop_RecipEst32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
3239      case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3240      case Iop_Sqrt32F0x4:     op = Asse_SQRTF;  goto do_32F0x4_unary;
3241      do_32F0x4_unary:
3242      {
3243         /* A bit subtle.  We have to copy the arg to the result
3244            register first, because actually doing the SSE scalar insn
3245            leaves the upper 3/4 of the destination register
3246            unchanged.  Whereas the required semantics of these
3247            primops is that the upper 3/4 is simply copied in from the
3248            argument. */
3249         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3250         HReg dst = newVRegV(env);
3251         addInstr(env, mk_vMOVsd_RR(arg, dst));
3252         addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3253         return dst;
3254      }
3255
3256      case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
3257      do_64F0x2_unary:
3258      {
3259         /* A bit subtle.  We have to copy the arg to the result
3260            register first, because actually doing the SSE scalar insn
3261            leaves the upper half of the destination register
3262            unchanged.  Whereas the required semantics of these
3263            primops is that the upper half is simply copied in from the
3264            argument. */
3265         HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3266         HReg dst = newVRegV(env);
3267         addInstr(env, mk_vMOVsd_RR(arg, dst));
3268         addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3269         return dst;
3270      }
3271
3272      case Iop_32UtoV128: {
3273         HReg        dst     = newVRegV(env);
3274         AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3275         AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
3276         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
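         /* SseLdzLO does a 4-byte load into the low lane and zeroes
            the rest of the destination register. */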
3277         addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3278         return dst;
3279      }
3280
3281      case Iop_64UtoV128: {
3282         HReg        dst  = newVRegV(env);
3283         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3284         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3285         addInstr(env, AMD64Instr_Push(rmi));
3286         addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3287         add_to_rsp(env, 8);
3288         return dst;
3289      }
3290
3291      case Iop_V256toV128_0:
3292      case Iop_V256toV128_1: {
3293         HReg vHi, vLo;
3294         iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3295         return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3296      }
3297
3298      default:
3299         break;
3300   } /* switch (e->Iex.Unop.op) */
3301   } /* if (e->tag == Iex_Unop) */
3302
3303   if (e->tag == Iex_Binop) {
3304   switch (e->Iex.Binop.op) {
3305
3306      case Iop_Sqrt64Fx2:
3307      case Iop_Sqrt32Fx4: {
3308         /* :: (rmode, vec) -> vec */
3309         HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3310         HReg dst = newVRegV(env);
3311         /* XXXROUNDINGFIXME */
3312         /* set roundingmode here */
3313         addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3314                           ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3315                       (Asse_SQRTF, arg, dst));
3316         return dst;
3317      }
3318
3319      /* FIXME: could we generate MOVQ here? */
3320      case Iop_SetV128lo64: {
3321         HReg dst  = newVRegV(env);
3322         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3323         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3324         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3325         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3326         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3327         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3328         return dst;
3329      }
3330
3331      /* FIXME: could we generate MOVD here? */
3332      case Iop_SetV128lo32: {
3333         HReg dst  = newVRegV(env);
3334         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3335         HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3336         AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3337         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3338         addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3339         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3340         return dst;
3341      }
3342
3343      case Iop_64HLtoV128: {
3344         HReg        rsp     = hregAMD64_RSP();
3345         AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
3346         AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
3347         AMD64RI*    qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
3348         AMD64RI*    qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
3349         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
3350         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
3351         HReg        dst = newVRegV(env);
3352         /* One store-forwarding stall coming up, oh well :-( */
3353         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
3354         return dst;
3355      }
3356
3357      case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3358      case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3359      case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3360      case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3361      case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
3362      case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
3363      do_32Fx4:
3364      {
3365         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3366         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3367         HReg dst = newVRegV(env);
3368         addInstr(env, mk_vMOVsd_RR(argL, dst));
3369         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3370         return dst;
3371      }
3372
3373      case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3374      case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3375      case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3376      case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3377      case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
3378      case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
3379      do_64Fx2:
3380      {
3381         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3382         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3383         HReg dst = newVRegV(env);
3384         addInstr(env, mk_vMOVsd_RR(argL, dst));
3385         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3386         return dst;
3387      }
3388
3389      case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3390      case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3391      case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3392      case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3393      case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
3394      case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
3395      case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
3396      case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
3397      case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
3398      case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
3399      do_32F0x4: {
3400         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3401         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3402         HReg dst = newVRegV(env);
3403         addInstr(env, mk_vMOVsd_RR(argL, dst));
3404         addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3405         return dst;
3406      }
3407
3408      case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3409      case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3410      case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3411      case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3412      case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
3413      case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
3414      case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
3415      case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
3416      case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
3417      case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
3418      do_64F0x2: {
3419         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3420         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3421         HReg dst = newVRegV(env);
3422         addInstr(env, mk_vMOVsd_RR(argL, dst));
3423         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3424         return dst;
3425      }
3426
3427      case Iop_QNarrowBin32Sto16Sx8:
3428         op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3429      case Iop_QNarrowBin16Sto8Sx16:
3430         op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3431      case Iop_QNarrowBin16Sto8Ux16:
3432         op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3433
3434      case Iop_InterleaveHI8x16:
3435         op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3436      case Iop_InterleaveHI16x8:
3437         op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3438      case Iop_InterleaveHI32x4:
3439         op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3440      case Iop_InterleaveHI64x2:
3441         op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3442
3443      case Iop_InterleaveLO8x16:
3444         op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3445      case Iop_InterleaveLO16x8:
3446         op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3447      case Iop_InterleaveLO32x4:
3448         op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3449      case Iop_InterleaveLO64x2:
3450         op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3451
3452      case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
3453      case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
3454      case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
3455      case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
3456      case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
3457      case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
3458      case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
3459      case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
3460      case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
3461      case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
3462      case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
3463      case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
3464      case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
3465      case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
3466      case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
3467      case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
3468      case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
3469      case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3470      case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3471      case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
3472      case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
3473      case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
3474      case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
3475      case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3476      case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3477      case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
3478      case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
3479      case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
3480      case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
3481      case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
3482      case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
3483      case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
3484      case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
3485      case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
3486      do_SseReRg: {
3487         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3488         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3489         HReg dst = newVRegV(env);
3490         if (arg1isEReg) {
3491            addInstr(env, mk_vMOVsd_RR(arg2, dst));
3492            addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3493         } else {
3494            addInstr(env, mk_vMOVsd_RR(arg1, dst));
3495            addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3496         }
3497         return dst;
3498      }
3499
3500      case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
3501      case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
3502      case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
3503      case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
3504      case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
3505      case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
3506      case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
3507      case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
3508      do_SseShift: {
3509         HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
3510         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3511         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3512         HReg        ereg = newVRegV(env);
3513         HReg        dst  = newVRegV(env);
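         /* The SSE shift-by-register forms take the shift count from
            the low 64 bits of an XMM register, so build 0:count on the
            stack and load it into ereg. */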
3514         addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3515         addInstr(env, AMD64Instr_Push(rmi));
3516         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3517         addInstr(env, mk_vMOVsd_RR(greg, dst));
3518         addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3519         add_to_rsp(env, 16);
3520         return dst;
3521      }
3522
3523      case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
3524                           goto do_SseAssistedBinary;
3525      case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
3526                           goto do_SseAssistedBinary;
3527      case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
3528                           goto do_SseAssistedBinary;
3529      case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
3530                           goto do_SseAssistedBinary;
3531      case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
3532                           goto do_SseAssistedBinary;
3533      case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
3534                           goto do_SseAssistedBinary;
3535      case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
3536                           goto do_SseAssistedBinary;
3537      case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
3538                           goto do_SseAssistedBinary;
3539      case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
3540                           goto do_SseAssistedBinary;
3541      case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
3542                           goto do_SseAssistedBinary;
3543      case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3544                           goto do_SseAssistedBinary;
3545      case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
3546                           goto do_SseAssistedBinary;
3547      case Iop_QNarrowBin32Sto16Ux8:
3548                           fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3549                           goto do_SseAssistedBinary;
3550      case Iop_NarrowBin16to8x16:
3551                           fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3552                           goto do_SseAssistedBinary;
3553      case Iop_NarrowBin32to16x8:
3554                           fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3555                           goto do_SseAssistedBinary;
3556      do_SseAssistedBinary: {
3557         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3558            well. */
3559         vassert(fn != 0);
3560         HReg dst = newVRegV(env);
3561         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3562         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3563         HReg argp = newVRegI(env);
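         /* Scratch layout, 16-aligned at %r_argp: result at 0(argp),
            argL at 16(argp), argR at 32(argp); the helper is passed
            pointers to those three slots. */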
         /* subq $112, %rsp         -- make a space */
3565         sub_from_rsp(env, 112);
3566         /* leaq 48(%rsp), %r_argp  -- point into it */
3567         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3568                                        argp));
3569         /* andq $-16, %r_argp      -- 16-align the pointer */
3570         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3571                                         AMD64RMI_Imm( ~(UInt)15 ),
3572                                         argp));
3573         /* Prepare 3 arg regs:
3574            leaq 0(%r_argp), %rdi
3575            leaq 16(%r_argp), %rsi
3576            leaq 32(%r_argp), %rdx
3577         */
3578         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3579                                        hregAMD64_RDI()));
3580         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3581                                        hregAMD64_RSI()));
3582         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3583                                        hregAMD64_RDX()));
3584         /* Store the two args, at (%rsi) and (%rdx):
3585            movupd  %argL, 0(%rsi)
3586            movupd  %argR, 0(%rdx)
3587         */
3588         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3589                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3590         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3591                                          AMD64AMode_IR(0, hregAMD64_RDX())));
3592         /* call the helper */
3593         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3594                                        3, mk_RetLoc_simple(RLPri_None) ));
3595         /* fetch the result from memory, using %r_argp, which the
3596            register allocator will keep alive across the call. */
3597         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3598                                          AMD64AMode_IR(0, argp)));
3599         /* and finally, clear the space */
3600         add_to_rsp(env, 112);
3601         return dst;
3602      }
3603
3604      case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3605                         goto do_SseAssistedVectorAndScalar;
3606      case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3607                         goto do_SseAssistedVectorAndScalar;
3608      do_SseAssistedVectorAndScalar: {
3609         /* RRRufff!  RRRufff code is what we're generating here.  Oh
3610            well. */
3611         vassert(fn != 0);
3612         HReg dst = newVRegV(env);
3613         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3614         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3615         HReg argp = newVRegI(env);
         /* subq $112, %rsp         -- make a space */
3617         sub_from_rsp(env, 112);
3618         /* leaq 48(%rsp), %r_argp  -- point into it */
3619         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3620                                        argp));
3621         /* andq $-16, %r_argp      -- 16-align the pointer */
3622         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3623                                         AMD64RMI_Imm( ~(UInt)15 ),
3624                                         argp));
3625         /* Prepare 2 vector arg regs:
3626            leaq 0(%r_argp), %rdi
3627            leaq 16(%r_argp), %rsi
3628         */
3629         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3630                                        hregAMD64_RDI()));
3631         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3632                                        hregAMD64_RSI()));
3633         /* Store the vector arg, at (%rsi):
3634            movupd  %argL, 0(%rsi)
3635         */
3636         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3637                                          AMD64AMode_IR(0, hregAMD64_RSI())));
3638         /* And get the scalar value into rdx */
3639         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3640
3641         /* call the helper */
3642         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3643                                        3, mk_RetLoc_simple(RLPri_None) ));
3644         /* fetch the result from memory, using %r_argp, which the
3645            register allocator will keep alive across the call. */
3646         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3647                                          AMD64AMode_IR(0, argp)));
3648         /* and finally, clear the space */
3649         add_to_rsp(env, 112);
3650         return dst;
3651      }
3652
3653      default:
3654         break;
3655   } /* switch (e->Iex.Binop.op) */
3656   } /* if (e->tag == Iex_Binop) */
3657
3658   if (e->tag == Iex_Triop) {
3659   IRTriop *triop = e->Iex.Triop.details;
3660   switch (triop->op) {
3661
3662      case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3663      case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3664      case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3665      case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3666      do_64Fx2_w_rm:
3667      {
3668         HReg argL = iselVecExpr(env, triop->arg2);
3669         HReg argR = iselVecExpr(env, triop->arg3);
3670         HReg dst = newVRegV(env);
3671         addInstr(env, mk_vMOVsd_RR(argL, dst));
3672         /* XXXROUNDINGFIXME */
3673         /* set roundingmode here */
3674         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3675         return dst;
3676      }
3677
3678      case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3679      case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3680      case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3681      case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3682      do_32Fx4_w_rm:
3683      {
3684         HReg argL = iselVecExpr(env, triop->arg2);
3685         HReg argR = iselVecExpr(env, triop->arg3);
3686         HReg dst = newVRegV(env);
3687         addInstr(env, mk_vMOVsd_RR(argL, dst));
3688         /* XXXROUNDINGFIXME */
3689         /* set roundingmode here */
3690         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3691         return dst;
3692      }
3693
3694      default:
3695         break;
3696   } /* switch (triop->op) */
3697   } /* if (e->tag == Iex_Triop) */
3698
3699   if (e->tag == Iex_ITE) { // VFD
3700      HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
3701      HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
3702      HReg dst = newVRegV(env);
3703      addInstr(env, mk_vMOVsd_RR(r1,dst));
3704      AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3705      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3706      return dst;
3707   }
3708
3709   //vec_fail:
3710   vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3711              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3712   ppIRExpr(e);
3713   vpanic("iselVecExpr_wrk");
3714}
3715
3716
3717/*---------------------------------------------------------*/
3718/*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
3719/*---------------------------------------------------------*/
3720
3721static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3722                           ISelEnv* env, const IRExpr* e )
3723{
3724   iselDVecExpr_wrk( rHi, rLo, env, e );
3725#  if 0
3726   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3727#  endif
3728   vassert(hregClass(*rHi) == HRcVec128);
3729   vassert(hregClass(*rLo) == HRcVec128);
3730   vassert(hregIsVirtual(*rHi));
3731   vassert(hregIsVirtual(*rLo));
3732}
3733
3734
3735/* DO NOT CALL THIS DIRECTLY */
3736static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3737                               ISelEnv* env, const IRExpr* e )
3738{
3739   HWord fn = 0; /* address of helper fn, if required */
3740   vassert(e);
3741   IRType ty = typeOfIRExpr(env->type_env,e);
3742   vassert(ty == Ity_V256);
3743
3744   AMD64SseOp op = Asse_INVALID;
3745
3746   /* read 256-bit IRTemp */
3747   if (e->tag == Iex_RdTmp) {
3748      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3749      return;
3750   }
3751
3752   if (e->tag == Iex_Get) {
3753      HReg        vHi  = newVRegV(env);
3754      HReg        vLo  = newVRegV(env);
3755      HReg        rbp  = hregAMD64_RBP();
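      /* A V256 value is handled as two V128 halves: the low half at
         the stated offset, the high half 16 bytes above it. */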
3756      AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
3757      AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3758      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3759      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3760      *rHi = vHi;
3761      *rLo = vLo;
3762      return;
3763   }
3764
3765   if (e->tag == Iex_Load) {
3766      HReg        vHi  = newVRegV(env);
3767      HReg        vLo  = newVRegV(env);
3768      HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
3769      AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
3770      AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3771      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3772      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3773      *rHi = vHi;
3774      *rLo = vLo;
3775      return;
3776   }
3777
3778   if (e->tag == Iex_Const) {
3779      vassert(e->Iex.Const.con->tag == Ico_V256);
3780      switch (e->Iex.Const.con->Ico.V256) {
3781         case 0x00000000: {
3782            HReg vHi = generate_zeroes_V128(env);
3783            HReg vLo = newVRegV(env);
3784            addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3785            *rHi = vHi;
3786            *rLo = vLo;
3787            return;
3788         }
3789         default:
            break; /* give up; handle other V256 constants if and when needed. */
3791      }
3792   }
3793
3794   if (e->tag == Iex_Unop) {
3795   switch (e->Iex.Unop.op) {
3796
3797      case Iop_NotV256: {
3798         HReg argHi, argLo;
3799         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3800         *rHi = do_sse_NotV128(env, argHi);
3801         *rLo = do_sse_NotV128(env, argLo);
3802         return;
3803      }
3804
3805      case Iop_RecipEst32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
3806      case Iop_Sqrt32Fx8:     op = Asse_SQRTF;  goto do_32Fx8_unary;
3807      case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3808      do_32Fx8_unary:
3809      {
3810         HReg argHi, argLo;
3811         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3812         HReg dstHi = newVRegV(env);
3813         HReg dstLo = newVRegV(env);
3814         addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
3815         addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
3816         *rHi = dstHi;
3817         *rLo = dstLo;
3818         return;
3819      }
3820
3821      case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
3822      do_64Fx4_unary:
3823      {
3824         HReg argHi, argLo;
3825         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3826         HReg dstHi = newVRegV(env);
3827         HReg dstLo = newVRegV(env);
3828         addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
3829         addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
3830         *rHi = dstHi;
3831         *rLo = dstLo;
3832         return;
3833      }
3834
3835      case Iop_CmpNEZ64x4: {
3836         /* We can use SSE2 instructions for this. */
3837         /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
3838            (obviously).  See comment on Iop_CmpNEZ64x2 for
3839            explanation of what's going on here. */
3840         HReg argHi, argLo;
3841         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3842         HReg tmpHi  = generate_zeroes_V128(env);
3843         HReg tmpLo  = newVRegV(env);
3844         addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
3845         HReg dstHi  = newVRegV(env);
3846         HReg dstLo  = newVRegV(env);
3847         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
3848         addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
3849         tmpHi = do_sse_NotV128(env, tmpHi);
3850         tmpLo = do_sse_NotV128(env, tmpLo);
3851         addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
3852         addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
3853         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
3854         addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
3855         *rHi = dstHi;
3856         *rLo = dstLo;
3857         return;
3858      }
3859
3860      case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3861      case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3862      case Iop_CmpNEZ8x32: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3863      do_CmpNEZ_vector:
3864      {
3865         HReg argHi, argLo;
3866         iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3867         HReg tmpHi = newVRegV(env);
3868         HReg tmpLo = newVRegV(env);
3869         HReg zero  = generate_zeroes_V128(env);
3870         HReg dstHi, dstLo;
3871         addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
3872         addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
3873         addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
3874         addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
3875         dstHi = do_sse_NotV128(env, tmpHi);
3876         dstLo = do_sse_NotV128(env, tmpLo);
3877         *rHi = dstHi;
3878         *rLo = dstLo;
3879         return;
3880      }
3881
3882      default:
3883         break;
3884   } /* switch (e->Iex.Unop.op) */
3885   } /* if (e->tag == Iex_Unop) */
3886
3887   if (e->tag == Iex_Binop) {
3888   switch (e->Iex.Binop.op) {
3889
3890      case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
3891      case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
3892      do_64Fx4:
3893      {
3894         HReg argLhi, argLlo, argRhi, argRlo;
3895         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3896         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3897         HReg dstHi = newVRegV(env);
3898         HReg dstLo = newVRegV(env);
3899         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3900         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3901         addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
3902         addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
3903         *rHi = dstHi;
3904         *rLo = dstLo;
3905         return;
3906      }
3907
3908      case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
3909      case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
3910      do_32Fx8:
3911      {
3912         HReg argLhi, argLlo, argRhi, argRlo;
3913         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3914         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3915         HReg dstHi = newVRegV(env);
3916         HReg dstLo = newVRegV(env);
3917         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3918         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3919         addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
3920         addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
3921         *rHi = dstHi;
3922         *rLo = dstLo;
3923         return;
3924      }
3925
3926      case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
3927      case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
3928      case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
3929      case Iop_Add8x32:    op = Asse_ADD8;     goto do_SseReRg;
3930      case Iop_Add16x16:   op = Asse_ADD16;    goto do_SseReRg;
3931      case Iop_Add32x8:    op = Asse_ADD32;    goto do_SseReRg;
3932      case Iop_Add64x4:    op = Asse_ADD64;    goto do_SseReRg;
3933      case Iop_QAdd8Sx32:  op = Asse_QADD8S;   goto do_SseReRg;
3934      case Iop_QAdd16Sx16: op = Asse_QADD16S;  goto do_SseReRg;
3935      case Iop_QAdd8Ux32:  op = Asse_QADD8U;   goto do_SseReRg;
3936      case Iop_QAdd16Ux16: op = Asse_QADD16U;  goto do_SseReRg;
3937      case Iop_Avg8Ux32:   op = Asse_AVG8U;    goto do_SseReRg;
3938      case Iop_Avg16Ux16:  op = Asse_AVG16U;   goto do_SseReRg;
3939      case Iop_CmpEQ8x32:  op = Asse_CMPEQ8;   goto do_SseReRg;
3940      case Iop_CmpEQ16x16: op = Asse_CMPEQ16;  goto do_SseReRg;
3941      case Iop_CmpEQ32x8:  op = Asse_CMPEQ32;  goto do_SseReRg;
3942      case Iop_CmpGT8Sx32: op = Asse_CMPGT8S;  goto do_SseReRg;
3943      case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
3944      case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
3945      case Iop_Max16Sx16:  op = Asse_MAX16S;   goto do_SseReRg;
3946      case Iop_Max8Ux32:   op = Asse_MAX8U;    goto do_SseReRg;
3947      case Iop_Min16Sx16:  op = Asse_MIN16S;   goto do_SseReRg;
3948      case Iop_Min8Ux32:   op = Asse_MIN8U;    goto do_SseReRg;
3949      case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
3950      case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
3951      case Iop_Mul16x16:   op = Asse_MUL16;    goto do_SseReRg;
3952      case Iop_Sub8x32:    op = Asse_SUB8;     goto do_SseReRg;
3953      case Iop_Sub16x16:   op = Asse_SUB16;    goto do_SseReRg;
3954      case Iop_Sub32x8:    op = Asse_SUB32;    goto do_SseReRg;
3955      case Iop_Sub64x4:    op = Asse_SUB64;    goto do_SseReRg;
3956      case Iop_QSub8Sx32:  op = Asse_QSUB8S;   goto do_SseReRg;
3957      case Iop_QSub16Sx16: op = Asse_QSUB16S;  goto do_SseReRg;
3958      case Iop_QSub8Ux32:  op = Asse_QSUB8U;   goto do_SseReRg;
3959      case Iop_QSub16Ux16: op = Asse_QSUB16U;  goto do_SseReRg;
3960      do_SseReRg:
3961      {
3962         HReg argLhi, argLlo, argRhi, argRlo;
3963         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3964         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3965         HReg dstHi = newVRegV(env);
3966         HReg dstLo = newVRegV(env);
3967         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3968         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3969         addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
3970         addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
3971         *rHi = dstHi;
3972         *rLo = dstLo;
3973         return;
3974      }
3975
3976      case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
3977      case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
3978      case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
3979      case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
3980      case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
3981      case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
3982      case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
3983      case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
3984      do_SseShift: {
3985         HReg gregHi, gregLo;
3986         iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
3987         AMD64RMI*   rmi   = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3988         AMD64AMode* rsp0  = AMD64AMode_IR(0, hregAMD64_RSP());
3989         HReg        ereg  = newVRegV(env);
3990         HReg        dstHi = newVRegV(env);
3991         HReg        dstLo = newVRegV(env);
3992         addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3993         addInstr(env, AMD64Instr_Push(rmi));
3994         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3995         addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
3996         addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
3997         addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
3998         addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
3999         add_to_rsp(env, 16);
4000         *rHi = dstHi;
4001         *rLo = dstLo;
4002         return;
4003      }
4004
4005      case Iop_V128HLtoV256: {
4006         *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4007         *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4008         return;
4009      }
4010
4011      case Iop_Mul32x8:    fn = (HWord)h_generic_calc_Mul32x4;
4012                           goto do_SseAssistedBinary;
4013      case Iop_Max32Sx8:   fn = (HWord)h_generic_calc_Max32Sx4;
4014                           goto do_SseAssistedBinary;
4015      case Iop_Min32Sx8:   fn = (HWord)h_generic_calc_Min32Sx4;
4016                           goto do_SseAssistedBinary;
4017      case Iop_Max32Ux8:   fn = (HWord)h_generic_calc_Max32Ux4;
4018                           goto do_SseAssistedBinary;
4019      case Iop_Min32Ux8:   fn = (HWord)h_generic_calc_Min32Ux4;
4020                           goto do_SseAssistedBinary;
4021      case Iop_Max16Ux16:  fn = (HWord)h_generic_calc_Max16Ux8;
4022                           goto do_SseAssistedBinary;
4023      case Iop_Min16Ux16:  fn = (HWord)h_generic_calc_Min16Ux8;
4024                           goto do_SseAssistedBinary;
4025      case Iop_Max8Sx32:   fn = (HWord)h_generic_calc_Max8Sx16;
4026                           goto do_SseAssistedBinary;
4027      case Iop_Min8Sx32:   fn = (HWord)h_generic_calc_Min8Sx16;
4028                           goto do_SseAssistedBinary;
4029      case Iop_CmpEQ64x4:  fn = (HWord)h_generic_calc_CmpEQ64x2;
4030                           goto do_SseAssistedBinary;
4031      case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4032                           goto do_SseAssistedBinary;
4033      do_SseAssistedBinary: {
4034         /* RRRufff!  RRRufff code is what we're generating here.  Oh
4035            well. */
4036         vassert(fn != 0);
4037         HReg dstHi = newVRegV(env);
4038         HReg dstLo = newVRegV(env);
4039         HReg argLhi, argLlo, argRhi, argRlo;
4040         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4041         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4042         HReg argp = newVRegI(env);
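         /* The 256-bit op is done as two calls to the 128-bit helper:
            the hi halves use slots 0/16/32(argp), the lo halves use
            slots 48/64/80(argp). */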
         /* subq $160, %rsp         -- make a space */
4044         sub_from_rsp(env, 160);
4045         /* leaq 48(%rsp), %r_argp  -- point into it */
4046         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4047                                        argp));
4048         /* andq $-16, %r_argp      -- 16-align the pointer */
4049         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4050                                         AMD64RMI_Imm( ~(UInt)15 ),
4051                                         argp));
4052         /* Prepare 3 arg regs:
4053            leaq 0(%r_argp), %rdi
4054            leaq 16(%r_argp), %rsi
4055            leaq 32(%r_argp), %rdx
4056         */
4057         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4058                                        hregAMD64_RDI()));
4059         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4060                                        hregAMD64_RSI()));
4061         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4062                                        hregAMD64_RDX()));
4063         /* Store the two high args, at (%rsi) and (%rdx):
4064            movupd  %argLhi, 0(%rsi)
4065            movupd  %argRhi, 0(%rdx)
4066         */
4067         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4068                                          AMD64AMode_IR(0, hregAMD64_RSI())));
4069         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4070                                          AMD64AMode_IR(0, hregAMD64_RDX())));
4071         /* Store the two low args, at 48(%rsi) and 48(%rdx):
4072            movupd  %argLlo, 48(%rsi)
4073            movupd  %argRlo, 48(%rdx)
4074         */
4075         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4076                                          AMD64AMode_IR(48, hregAMD64_RSI())));
4077         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4078                                          AMD64AMode_IR(48, hregAMD64_RDX())));
4079         /* call the helper */
4080         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4081                                        mk_RetLoc_simple(RLPri_None) ));
4082         /* Prepare 3 arg regs:
4083            leaq 48(%r_argp), %rdi
4084            leaq 64(%r_argp), %rsi
4085            leaq 80(%r_argp), %rdx
4086         */
4087         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4088                                        hregAMD64_RDI()));
4089         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4090                                        hregAMD64_RSI()));
4091         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4092                                        hregAMD64_RDX()));
4093         /* call the helper */
4094         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4095                                        mk_RetLoc_simple(RLPri_None) ));
4096         /* fetch the result from memory, using %r_argp, which the
4097            register allocator will keep alive across the call. */
4098         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4099                                          AMD64AMode_IR(0, argp)));
4100         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4101                                          AMD64AMode_IR(48, argp)));
4102         /* and finally, clear the space */
4103         add_to_rsp(env, 160);
4104         *rHi = dstHi;
4105         *rLo = dstLo;
4106         return;
4107      }
4108
4109      case Iop_Perm32x8:   fn = (HWord)h_generic_calc_Perm32x8;
4110                           goto do_SseAssistedBinary256;
4111      do_SseAssistedBinary256: {
         /* Clunky code is what we're generating here: everything goes
            through memory and a generic C helper.  Oh well. */
4114         vassert(fn != 0);
4115         HReg dstHi = newVRegV(env);
4116         HReg dstLo = newVRegV(env);
4117         HReg argLhi, argLlo, argRhi, argRlo;
4118         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4119         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4120         HReg argp = newVRegI(env);
         /* subq $160, %rsp         -- make a space */
4122         sub_from_rsp(env, 160);
4123         /* leaq 48(%rsp), %r_argp  -- point into it */
4124         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4125                                        argp));
4126         /* andq $-16, %r_argp      -- 16-align the pointer */
4127         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4128                                         AMD64RMI_Imm( ~(UInt)15 ),
4129                                         argp));
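         /* Layout of the 16-aligned scratch area at %r_argp.  A single
            call handles the whole 256-bit operation, with (dst, argL,
            argR) pointers in %rdi, %rsi, %rdx; each V256 value is laid
            out as its low 128 bits followed by its high 128 bits:
               argp+0  .. +31   result
               argp+32 .. +63   argL
               argp+64 .. +95   argR
         */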
4130         /* Prepare 3 arg regs:
4131            leaq 0(%r_argp), %rdi
4132            leaq 32(%r_argp), %rsi
4133            leaq 64(%r_argp), %rdx
4134         */
4135         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4136                                        hregAMD64_RDI()));
4137         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4138                                        hregAMD64_RSI()));
4139         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4140                                        hregAMD64_RDX()));
4141         /* Store the two args, at (%rsi) and (%rdx):
4142            movupd  %argLlo, 0(%rsi)
4143            movupd  %argLhi, 16(%rsi)
4144            movupd  %argRlo, 0(%rdx)
4145            movupd  %argRhi, 16(%rdx)
4146         */
4147         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4148                                          AMD64AMode_IR(0, hregAMD64_RSI())));
4149         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4150                                          AMD64AMode_IR(16, hregAMD64_RSI())));
4151         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4152                                          AMD64AMode_IR(0, hregAMD64_RDX())));
4153         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4154                                          AMD64AMode_IR(16, hregAMD64_RDX())));
4155         /* call the helper */
4156         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4157                                        mk_RetLoc_simple(RLPri_None) ));
4158         /* fetch the result from memory, using %r_argp, which the
4159            register allocator will keep alive across the call. */
4160         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4161                                          AMD64AMode_IR(0, argp)));
4162         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4163                                          AMD64AMode_IR(16, argp)));
4164         /* and finally, clear the space */
4165         add_to_rsp(env, 160);
4166         *rHi = dstHi;
4167         *rLo = dstLo;
4168         return;
4169      }
4170
4171      default:
4172         break;
4173   } /* switch (e->Iex.Binop.op) */
4174   } /* if (e->tag == Iex_Binop) */
4175
4176   if (e->tag == Iex_Triop) {
4177   IRTriop *triop = e->Iex.Triop.details;
4178   switch (triop->op) {
4179
4180      case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4181      case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4182      case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4183      case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4184      do_64Fx4_w_rm:
4185      {
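         /* triop->arg1 is the rounding mode; it is currently ignored
            (see XXXROUNDINGFIXME below).  Each 128-bit half of the
            256-bit vectors is handled by its own Sse64Fx2 op. */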
4186         HReg argLhi, argLlo, argRhi, argRlo;
4187         iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4188         iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4189         HReg dstHi = newVRegV(env);
4190         HReg dstLo = newVRegV(env);
4191         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4192         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4193         /* XXXROUNDINGFIXME */
4194         /* set roundingmode here */
4195         addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4196         addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4197         *rHi = dstHi;
4198         *rLo = dstLo;
4199         return;
4200      }
4201
4202      case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4203      case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4204      case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4205      case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4206      do_32Fx8_w_rm:
4207      {
4208         HReg argLhi, argLlo, argRhi, argRlo;
4209         iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4210         iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4211         HReg dstHi = newVRegV(env);
4212         HReg dstLo = newVRegV(env);
4213         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4214         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4215         /* XXXROUNDINGFIXME */
4216         /* set roundingmode here */
4217         addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4218         addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4219         *rHi = dstHi;
4220         *rLo = dstLo;
4221         return;
4222      }
4223
4224      default:
4225         break;
4226   } /* switch (triop->op) */
4227   } /* if (e->tag == Iex_Triop) */
4228
4229
4230   if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4231      HReg        rsp     = hregAMD64_RSP();
4232      HReg        vHi     = newVRegV(env);
4233      HReg        vLo     = newVRegV(env);
4234      AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
4235      AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
4236      /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4237      /* Get all the args into regs, before messing with the stack. */
4238      AMD64RI* q3  = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
4239      AMD64RI* q2  = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
4240      AMD64RI* q1  = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
4241      AMD64RI* q0  = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
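      /* Assemble each 128-bit half with two 64-bit stores just below
         %rsp, then reload the 16-byte slot into a vector register. */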
4242      /* less significant lane (Q2) at the lower address (-16(rsp)) */
4243      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
4244      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
4245      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
4246      /* and then the lower half .. */
4247      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
4248      addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
4249      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
4250      *rHi = vHi;
4251      *rLo = vLo;
4252      return;
4253   }
4254
4255   if (e->tag == Iex_ITE) {
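      /* Compute both arms, copy the 'iftrue' pair into the destination,
         then conditionally overwrite it with the 'iffalse' pair when
         the guard is false.  (cc ^ 1 negates the condition, since the
         condition-code encoding pairs each condition with its
         complement in adjacent even/odd values.) */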
4256      HReg r1Hi, r1Lo, r0Hi, r0Lo;
4257      iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4258      iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4259      HReg dstHi = newVRegV(env);
4260      HReg dstLo = newVRegV(env);
4261      addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4262      addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4263      AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4264      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4265      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4266      *rHi = dstHi;
4267      *rLo = dstLo;
4268      return;
4269   }
4270
4271   //avx_fail:
4272   vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4273              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4274   ppIRExpr(e);
4275   vpanic("iselDVecExpr_wrk");
4276}
4277
4278
4279/*---------------------------------------------------------*/
4280/*--- ISEL: Statements                                  ---*/
4281/*---------------------------------------------------------*/
4282
4283static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4284{
4285   if (vex_traceflags & VEX_TRACE_VCODE) {
4286      vex_printf("\n-- ");
4287      ppIRStmt(stmt);
4288      vex_printf("\n");
4289   }
4290
4291   switch (stmt->tag) {
4292
4293   /* --------- LOADG (guarded load) --------- */
4294   case Ist_LoadG: {
4295      IRLoadG* lg = stmt->Ist.LoadG.details;
4296      if (lg->end != Iend_LE)
4297         goto stmt_fail;
4298
4299      UChar szB = 0; /* invalid */
4300      switch (lg->cvt) {
4301         case ILGop_Ident32:   szB = 4;  break;
4302         case ILGop_Ident64:   szB = 8;  break;
4303         case ILGop_IdentV128: szB = 16; break;
4304         default: break;
4305      }
4306      if (szB == 0)
4307         goto stmt_fail;
4308
4309      AMD64AMode* amAddr
4310         = iselIntExpr_AMode(env, lg->addr);
4311      HReg rAlt
4312         = szB == 16 ? iselVecExpr(env, lg->alt)
4313                     : iselIntExpr_R(env, lg->alt);
4314      HReg rDst
4315         = lookupIRTemp(env, lg->dst);
4316
4317      /* Get the alt value into the dst.  We'll do a conditional load
4318         which overwrites it -- or not -- with loaded data. */
4319      if (szB == 16) {
4320         addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4321      } else {
4322         addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4323      }
4324      AMD64CondCode cc = iselCondCode(env, lg->guard);
4325      if (szB == 16) {
4326         addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4327      } else {
4328         addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4329      }
4330      return;
4331   }
4332
4333   /* --------- STOREG (guarded store) --------- */
4334   case Ist_StoreG: {
4335      IRStoreG* sg = stmt->Ist.StoreG.details;
4336      if (sg->end != Iend_LE)
4337         goto stmt_fail;
4338
4339      UChar szB = 0; /* invalid */
4340      switch (typeOfIRExpr(env->type_env, sg->data)) {
4341         case Ity_I32:  szB = 4; break;
4342         case Ity_I64:  szB = 8; break;
4343         case Ity_V128: szB = 16; break;
4344         default: break;
4345      }
4346      if (szB == 0)
4347         goto stmt_fail;
4348
4349      AMD64AMode* amAddr
4350         = iselIntExpr_AMode(env, sg->addr);
4351      HReg rSrc
4352         = szB == 16 ? iselVecExpr(env, sg->data)
4353                     : iselIntExpr_R(env, sg->data);
4354      AMD64CondCode cc
4355         = iselCondCode(env, sg->guard);
4356      if (szB == 16) {
4357         addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4358      } else {
4359         addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4360      }
4361      return;
4362   }
4363
4364   /* --------- STORE --------- */
4365   case Ist_Store: {
4366      IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4367      IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4368      IREndness end   = stmt->Ist.Store.end;
4369
4370      if (tya != Ity_I64 || end != Iend_LE)
4371         goto stmt_fail;
4372
4373      if (tyd == Ity_I64) {
4374         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4375         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4376         addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4377         return;
4378      }
4379      if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4380         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4381         HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4382         addInstr(env, AMD64Instr_Store(
4383                          toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4384                          r,am));
4385         return;
4386      }
4387      if (tyd == Ity_F64) {
4388         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4389         HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4390         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4391         return;
4392      }
4393      if (tyd == Ity_F32) {
4394         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4395         HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4396         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4397         return;
4398      }
4399      if (tyd == Ity_V128) {
4400         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4401         HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4402         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4403         return;
4404      }
4405      if (tyd == Ity_V256) {
4406         HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
4407         AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
4408         AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4409         HReg vHi, vLo;
4410         iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
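         /* The low 128 bits go at the lower address. */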
4411         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4412         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4413         return;
4414      }
4415      break;
4416   }
4417
4418   /* --------- PUT --------- */
4419   case Ist_Put: {
4420      IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4421      if (ty == Ity_I64) {
4422         /* We're going to write to memory, so compute the RHS into an
4423            AMD64RI. */
4424         AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4425         addInstr(env,
4426                  AMD64Instr_Alu64M(
4427                     Aalu_MOV,
4428                     ri,
4429                     AMD64AMode_IR(stmt->Ist.Put.offset,
4430                                   hregAMD64_RBP())
4431                 ));
4432         return;
4433      }
4434      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4435         HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4436         addInstr(env, AMD64Instr_Store(
4437                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4438                          r,
4439                          AMD64AMode_IR(stmt->Ist.Put.offset,
4440                                        hregAMD64_RBP())));
4441         return;
4442      }
4443      if (ty == Ity_F32) {
4444         HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4445         AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4446         set_SSE_rounding_default(env); /* paranoia */
4447         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4448         return;
4449      }
4450      if (ty == Ity_F64) {
4451         HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4452         AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4453                                         hregAMD64_RBP() );
4454         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4455         return;
4456      }
4457      if (ty == Ity_V128) {
4458         HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
4459         AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
4460                                         hregAMD64_RBP());
4461         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4462         return;
4463      }
4464      if (ty == Ity_V256) {
4465         HReg vHi, vLo;
4466         iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4467         HReg        rbp  = hregAMD64_RBP();
4468         AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
4469         AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4470         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4471         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4472         return;
4473      }
4474      break;
4475   }
4476
4477   /* --------- Indexed PUT --------- */
4478   case Ist_PutI: {
4479      IRPutI *puti = stmt->Ist.PutI.details;
4480
4481      AMD64AMode* am
4482         = genGuestArrayOffset(
4483              env, puti->descr,
4484                   puti->ix, puti->bias );
4485
4486      IRType ty = typeOfIRExpr(env->type_env, puti->data);
4487      if (ty == Ity_F64) {
4488         HReg val = iselDblExpr(env, puti->data);
4489         addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4490         return;
4491      }
4492      if (ty == Ity_I8) {
4493         HReg r = iselIntExpr_R(env, puti->data);
4494         addInstr(env, AMD64Instr_Store( 1, r, am ));
4495         return;
4496      }
4497      if (ty == Ity_I64) {
4498         AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4499         addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4500         return;
4501      }
4502      break;
4503   }
4504
4505   /* --------- TMP --------- */
4506   case Ist_WrTmp: {
4507      IRTemp tmp = stmt->Ist.WrTmp.tmp;
4508      IRType ty = typeOfIRTemp(env->type_env, tmp);
4509
      /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
         compute it into an AMode and then use LEA.  This usually
         produces fewer instructions, often because (for
         memcheck-created IR) we get t = address-expression, with t
         used twice later on, and so doing this naturally turns the
         address expression back into an AMD64 amode. */
4516      if (ty == Ity_I64
4517          && stmt->Ist.WrTmp.data->tag == Iex_Binop
4518          && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4519         AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4520         HReg dst = lookupIRTemp(env, tmp);
4521         if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4522            /* Hmm, iselIntExpr_AMode wimped out and just computed the
4523               value into a register.  Just emit a normal reg-reg move
4524               so reg-alloc can coalesce it away in the usual way. */
4525            HReg src = am->Aam.IR.reg;
4526            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4527         } else {
4528            addInstr(env, AMD64Instr_Lea64(am,dst));
4529         }
4530         return;
4531      }
4532
4533      if (ty == Ity_I64 || ty == Ity_I32
4534          || ty == Ity_I16 || ty == Ity_I8) {
4535         AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4536         HReg dst = lookupIRTemp(env, tmp);
4537         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4538         return;
4539      }
4540      if (ty == Ity_I128) {
4541         HReg rHi, rLo, dstHi, dstLo;
4542         iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4543         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4544         addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4545         addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4546         return;
4547      }
4548      if (ty == Ity_I1) {
4549         AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4550         HReg dst = lookupIRTemp(env, tmp);
4551         addInstr(env, AMD64Instr_Set64(cond, dst));
4552         return;
4553      }
4554      if (ty == Ity_F64) {
4555         HReg dst = lookupIRTemp(env, tmp);
4556         HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4557         addInstr(env, mk_vMOVsd_RR(src, dst));
4558         return;
4559      }
4560      if (ty == Ity_F32) {
4561         HReg dst = lookupIRTemp(env, tmp);
4562         HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4563         addInstr(env, mk_vMOVsd_RR(src, dst));
4564         return;
4565      }
4566      if (ty == Ity_V128) {
4567         HReg dst = lookupIRTemp(env, tmp);
4568         HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4569         addInstr(env, mk_vMOVsd_RR(src, dst));
4570         return;
4571      }
4572      if (ty == Ity_V256) {
4573         HReg rHi, rLo, dstHi, dstLo;
4574         iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4575         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4576         addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4577         addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4578         return;
4579      }
4580      break;
4581   }
4582
4583   /* --------- Call to DIRTY helper --------- */
4584   case Ist_Dirty: {
4585      IRDirty* d = stmt->Ist.Dirty.details;
4586
4587      /* Figure out the return type, if any. */
4588      IRType retty = Ity_INVALID;
4589      if (d->tmp != IRTemp_INVALID)
4590         retty = typeOfIRTemp(env->type_env, d->tmp);
4591
4592      /* Throw out any return types we don't know about. */
4593      Bool retty_ok = False;
4594      switch (retty) {
4595         case Ity_INVALID: /* function doesn't return anything */
4596         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4597         case Ity_V128: case Ity_V256:
4598            retty_ok = True; break;
4599         default:
4600            break;
4601      }
4602      if (!retty_ok)
4603         break; /* will go to stmt_fail: */
4604
4605      /* Marshal args, do the call, and set the return value to
4606         0x555..555 if this is a conditional call that returns a value
4607         and the call is skipped. */
4608      UInt   addToSp = 0;
4609      RetLoc rloc    = mk_RetLoc_INVALID();
4610      doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4611      vassert(is_sane_RetLoc(rloc));
4612
4613      /* Now figure out what to do with the returned value, if any. */
4614      switch (retty) {
4615         case Ity_INVALID: {
4616            /* No return value.  Nothing to do. */
4617            vassert(d->tmp == IRTemp_INVALID);
4618            vassert(rloc.pri == RLPri_None);
4619            vassert(addToSp == 0);
4620            return;
4621         }
4622         case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4623            /* The returned value is in %rax.  Park it in the register
4624               associated with tmp. */
4625            vassert(rloc.pri == RLPri_Int);
4626            vassert(addToSp == 0);
4627            HReg dst = lookupIRTemp(env, d->tmp);
4628            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4629            return;
4630         }
4631         case Ity_V128: {
4632            /* The returned value is on the stack, and rloc.spOff
4633               tells us where.  Fish it off the stack and then move
4634               the stack pointer upwards to clear it, as directed by
4635               doHelperCall. */
4636            vassert(rloc.pri == RLPri_V128SpRel);
4637            vassert(addToSp >= 16);
4638            HReg        dst = lookupIRTemp(env, d->tmp);
4639            AMD64AMode* am  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4640            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4641            add_to_rsp(env, addToSp);
4642            return;
4643         }
4644         case Ity_V256: {
4645            /* See comments for Ity_V128. */
4646            vassert(rloc.pri == RLPri_V256SpRel);
4647            vassert(addToSp >= 32);
4648            HReg        dstLo, dstHi;
4649            lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4650            AMD64AMode* amLo  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4651            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4652            AMD64AMode* amHi  = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4653            addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4654            add_to_rsp(env, addToSp);
4655            return;
4656         }
4657         default:
4658            /*NOTREACHED*/
4659            vassert(0);
4660      }
4661      break;
4662   }
4663
4664   /* --------- MEM FENCE --------- */
4665   case Ist_MBE:
4666      switch (stmt->Ist.MBE.event) {
4667         case Imbe_Fence:
4668            addInstr(env, AMD64Instr_MFence());
4669            return;
4670         default:
4671            break;
4672      }
4673      break;
4674
4675   /* --------- ACAS --------- */
4676   case Ist_CAS:
4677      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4678         /* "normal" singleton CAS */
4679         UChar  sz;
4680         IRCAS* cas = stmt->Ist.CAS.details;
4681         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
         /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4683         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4684         HReg rData = iselIntExpr_R(env, cas->dataLo);
4685         HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4686         HReg rOld  = lookupIRTemp(env, cas->oldLo);
4687         vassert(cas->expdHi == NULL);
4688         vassert(cas->dataHi == NULL);
4689         addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4690         addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4691         addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4692         switch (ty) {
4693            case Ity_I64: sz = 8; break;
4694            case Ity_I32: sz = 4; break;
4695            case Ity_I16: sz = 2; break;
4696            case Ity_I8:  sz = 1; break;
4697            default: goto unhandled_cas;
4698         }
4699         addInstr(env, AMD64Instr_ACAS(am, sz));
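         /* cmpxchg sets Z iff the CAS succeeded.  On failure %rax holds
            the value actually observed in memory, so copy it into rOld;
            on success, rOld already holds the expected (== observed)
            value from the move above. */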
4700         addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4701         return;
4702      } else {
4703         /* double CAS */
4704         UChar  sz;
4705         IRCAS* cas = stmt->Ist.CAS.details;
4706         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4707         /* only 32-bit and 64-bit allowed in this case */
4708         /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4709         /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4710         AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4711         HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4712         HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4713         HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4714         HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4715         HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4716         HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4717         switch (ty) {
4718            case Ity_I64:
4719               if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4720                  goto unhandled_cas; /* we'd have to generate
4721                                         cmpxchg16b, but the host
4722                                         doesn't support that */
4723               sz = 8;
4724               break;
4725            case Ity_I32:
4726               sz = 4;
4727               break;
4728            default:
4729               goto unhandled_cas;
4730         }
4731         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4732         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4733         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4734         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4735         addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4736         addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4737         addInstr(env, AMD64Instr_DACAS(am, sz));
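         /* As in the singleton case: on failure %rdx:%rax holds the
            pair actually observed in memory; on success rOldHi:rOldLo
            already holds the expected values. */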
4738         addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
4739         addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
4740         return;
4741      }
4742      unhandled_cas:
4743      break;
4744
4745   /* --------- INSTR MARK --------- */
4746   /* Doesn't generate any executable code ... */
4747   case Ist_IMark:
4748       return;
4749
4750   /* --------- ABI HINT --------- */
   /* These have no meaning (no denotation in the IR) and so we ignore
      them ... if any actually made it this far. */
4753   case Ist_AbiHint:
4754       return;
4755
4756   /* --------- NO-OP --------- */
4757   case Ist_NoOp:
4758       return;
4759
4760   /* --------- EXIT --------- */
4761   case Ist_Exit: {
4762      if (stmt->Ist.Exit.dst->tag != Ico_U64)
4763         vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
4764
4765      AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
4766      AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
4767                                          hregAMD64_RBP());
4768
4769      /* Case: boring transfer to known address */
4770      if (stmt->Ist.Exit.jk == Ijk_Boring) {
4771         if (env->chainingAllowed) {
4772            /* .. almost always true .. */
4773            /* Skip the event check at the dst if this is a forwards
4774               edge. */
4775            Bool toFastEP
4776               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
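            /* (A backwards edge must take the slow, checked entry
               point, so a loop cannot run indefinitely without
               passing an event check.) */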
4777            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4778            addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
4779                                             amRIP, cc, toFastEP));
4780         } else {
4781            /* .. very occasionally .. */
4782            /* We can't use chaining, so ask for an assisted transfer,
4783               as that's the only alternative that is allowable. */
4784            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4785            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
4786         }
4787         return;
4788      }
4789
4790      /* Case: assisted transfer to arbitrary address */
4791      switch (stmt->Ist.Exit.jk) {
4792         /* Keep this list in sync with that in iselNext below */
4793         case Ijk_ClientReq:
4794         case Ijk_EmWarn:
4795         case Ijk_NoDecode:
4796         case Ijk_NoRedir:
4797         case Ijk_SigSEGV:
4798         case Ijk_SigTRAP:
4799         case Ijk_Sys_syscall:
4800         case Ijk_Sys_int210:
4801         case Ijk_InvalICache:
4802         case Ijk_Yield:
4803         {
4804            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4805            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
4806            return;
4807         }
4808         default:
4809            break;
4810      }
4811
4812      /* Do we ever expect to see any other kind? */
4813      goto stmt_fail;
4814   }
4815
4816   default: break;
4817   }
4818  stmt_fail:
4819   ppIRStmt(stmt);
4820   vpanic("iselStmt(amd64)");
4821}
4822
4823
4824/*---------------------------------------------------------*/
4825/*--- ISEL: Basic block terminators (Nexts)             ---*/
4826/*---------------------------------------------------------*/
4827
4828static void iselNext ( ISelEnv* env,
4829                       IRExpr* next, IRJumpKind jk, Int offsIP )
4830{
4831   if (vex_traceflags & VEX_TRACE_VCODE) {
4832      vex_printf( "\n-- PUT(%d) = ", offsIP);
4833      ppIRExpr( next );
4834      vex_printf( "; exit-");
4835      ppIRJumpKind(jk);
4836      vex_printf( "\n");
4837   }
4838
4839   /* Case: boring transfer to known address */
4840   if (next->tag == Iex_Const) {
4841      IRConst* cdst = next->Iex.Const.con;
4842      vassert(cdst->tag == Ico_U64);
4843      if (jk == Ijk_Boring || jk == Ijk_Call) {
4844         /* Boring transfer to known address */
4845         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4846         if (env->chainingAllowed) {
4847            /* .. almost always true .. */
4848            /* Skip the event check at the dst if this is a forwards
4849               edge. */
4850            Bool toFastEP
4851               = ((Addr64)cdst->Ico.U64) > env->max_ga;
4852            if (0) vex_printf("%s", toFastEP ? "X" : ".");
4853            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
4854                                             amRIP, Acc_ALWAYS,
4855                                             toFastEP));
4856         } else {
4857            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an assisted transfer,
               as that's the only alternative that is allowable. */
4861            HReg r = iselIntExpr_R(env, next);
4862            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
4863                                               Ijk_Boring));
4864         }
4865         return;
4866      }
4867   }
4868
4869   /* Case: call/return (==boring) transfer to any address */
4870   switch (jk) {
4871      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4872         HReg        r     = iselIntExpr_R(env, next);
4873         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4874         if (env->chainingAllowed) {
4875            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
4876         } else {
4877            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
4878                                               Ijk_Boring));
4879         }
4880         return;
4881      }
4882      default:
4883         break;
4884   }
4885
4886   /* Case: assisted transfer to arbitrary address */
4887   switch (jk) {
4888      /* Keep this list in sync with that for Ist_Exit above */
4889      case Ijk_ClientReq:
4890      case Ijk_EmWarn:
4891      case Ijk_NoDecode:
4892      case Ijk_NoRedir:
4893      case Ijk_SigSEGV:
4894      case Ijk_SigTRAP:
4895      case Ijk_Sys_syscall:
4896      case Ijk_Sys_int210:
4897      case Ijk_InvalICache:
4898      case Ijk_Yield: {
4899         HReg        r     = iselIntExpr_R(env, next);
4900         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
4901         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
4902         return;
4903      }
4904      default:
4905         break;
4906   }
4907
4908   vex_printf( "\n-- PUT(%d) = ", offsIP);
4909   ppIRExpr( next );
4910   vex_printf( "; exit-");
4911   ppIRJumpKind(jk);
4912   vex_printf( "\n");
4913   vassert(0); // are we expecting any other kind?
4914}
4915
4916
4917/*---------------------------------------------------------*/
4918/*--- Insn selector top-level                           ---*/
4919/*---------------------------------------------------------*/
4920
4921/* Translate an entire SB to amd64 code. */
4922
4923HInstrArray* iselSB_AMD64 ( const IRSB* bb,
4924                            VexArch      arch_host,
4925                            const VexArchInfo* archinfo_host,
4926                            const VexAbiInfo*  vbi/*UNUSED*/,
4927                            Int offs_Host_EvC_Counter,
4928                            Int offs_Host_EvC_FailAddr,
4929                            Bool chainingAllowed,
4930                            Bool addProfInc,
4931                            Addr max_ga )
4932{
4933   Int        i, j;
4934   HReg       hreg, hregHI;
4935   ISelEnv*   env;
4936   UInt       hwcaps_host = archinfo_host->hwcaps;
4937   AMD64AMode *amCounter, *amFailAddr;
4938
4939   /* sanity ... */
4940   vassert(arch_host == VexArchAMD64);
4941   vassert(0 == (hwcaps_host
4942                 & ~(VEX_HWCAPS_AMD64_SSE3
4943                     | VEX_HWCAPS_AMD64_CX16
4944                     | VEX_HWCAPS_AMD64_LZCNT
4945                     | VEX_HWCAPS_AMD64_AVX
4946                     | VEX_HWCAPS_AMD64_RDTSCP
4947                     | VEX_HWCAPS_AMD64_BMI
4948                     | VEX_HWCAPS_AMD64_AVX2)));
4949
4950   /* Check that the host's endianness is as expected. */
4951   vassert(archinfo_host->endness == VexEndnessLE);
4952
4953   /* Make up an initial environment to use. */
4954   env = LibVEX_Alloc_inline(sizeof(ISelEnv));
4955   env->vreg_ctr = 0;
4956
4957   /* Set up output code array. */
4958   env->code = newHInstrArray();
4959
4960   /* Copy BB's type env. */
4961   env->type_env = bb->tyenv;
4962
4963   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
4964      change as we go along. */
4965   env->n_vregmap = bb->tyenv->types_used;
4966   env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4967   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4968
4969   /* and finally ... */
4970   env->chainingAllowed = chainingAllowed;
4971   env->hwcaps          = hwcaps_host;
4972   env->max_ga          = max_ga;
4973
4974   /* For each IR temporary, allocate a suitably-kinded virtual
4975      register. */
4976   j = 0;
4977   for (i = 0; i < env->n_vregmap; i++) {
4978      hregHI = hreg = INVALID_HREG;
4979      switch (bb->tyenv->types[i]) {
4980         case Ity_I1:
4981         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
4982            hreg = mkHReg(True, HRcInt64, 0, j++);
4983            break;
4984         case Ity_I128:
4985            hreg   = mkHReg(True, HRcInt64, 0, j++);
4986            hregHI = mkHReg(True, HRcInt64, 0, j++);
4987            break;
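         /* F32 and F64 values live in the low lanes of SSE registers
            on this host, hence HRcVec128 for them too. */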
4988         case Ity_F32:
4989         case Ity_F64:
4990         case Ity_V128:
4991            hreg = mkHReg(True, HRcVec128, 0, j++);
4992            break;
4993         case Ity_V256:
4994            hreg   = mkHReg(True, HRcVec128, 0, j++);
4995            hregHI = mkHReg(True, HRcVec128, 0, j++);
4996            break;
4997         default:
4998            ppIRType(bb->tyenv->types[i]);
4999            vpanic("iselBB(amd64): IRTemp type");
5000      }
5001      env->vregmap[i]   = hreg;
5002      env->vregmapHI[i] = hregHI;
5003   }
5004   env->vreg_ctr = j;
5005
5006   /* The very first instruction must be an event check. */
5007   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
5008   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
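   /* The check decrements the counter at amCounter; when it goes
      negative, control transfers to the address held at amFailAddr. */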
5009   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
5010
5011   /* Possibly a block counter increment (for profiling).  At this
5012      point we don't know the address of the counter, so just pretend
5013      it is zero.  It will have to be patched later, but before this
5014      translation is used, by a call to LibVEX_patchProfCtr. */
5015   if (addProfInc) {
5016      addInstr(env, AMD64Instr_ProfInc());
5017   }
5018
5019   /* Ok, finally we can iterate over the statements. */
5020   for (i = 0; i < bb->stmts_used; i++)
5021      if (bb->stmts[i])
5022         iselStmt(env, bb->stmts[i]);
5023
5024   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5025
5026   /* record the number of vregs we used. */
5027   env->code->n_vregs = env->vreg_ctr;
5028   return env->code;
5029}
5030
5031
5032/*---------------------------------------------------------------*/
5033/*--- end                                   host_amd64_isel.c ---*/
5034/*---------------------------------------------------------------*/
5035