1
2/*---------------------------------------------------------------*/
3/*--- begin                                 host_amd64_defs.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2013 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex.h"
38#include "libvex_trc_values.h"
39
40#include "main_util.h"
41#include "host_generic_regs.h"
42#include "host_amd64_defs.h"
43
44
45/* --------- Registers. --------- */
46
47void ppHRegAMD64 ( HReg reg )
48{
49   Int r;
50   static const HChar* ireg64_names[16]
51     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
52         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
53   /* Be generic for all virtual regs. */
54   if (hregIsVirtual(reg)) {
55      ppHReg(reg);
56      return;
57   }
58   /* But specific for real regs. */
59   switch (hregClass(reg)) {
60      case HRcInt64:
61         r = hregNumber(reg);
62         vassert(r >= 0 && r < 16);
63         vex_printf("%s", ireg64_names[r]);
64         return;
65      case HRcFlt64:
66         r = hregNumber(reg);
67         vassert(r >= 0 && r < 6);
68         vex_printf("%%fake%d", r);
69         return;
70      case HRcVec128:
71         r = hregNumber(reg);
72         vassert(r >= 0 && r < 16);
73         vex_printf("%%xmm%d", r);
74         return;
75      default:
76         vpanic("ppHRegAMD64");
77   }
78}
79
80static void ppHRegAMD64_lo32 ( HReg reg )
81{
82   Int r;
83   static const HChar* ireg32_names[16]
84     = { "%eax",  "%ecx",  "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
85         "%r8d",  "%r9d",  "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
86   /* Be generic for all virtual regs. */
87   if (hregIsVirtual(reg)) {
88      ppHReg(reg);
89      vex_printf("d");
90      return;
91   }
92   /* But specific for real regs. */
93   switch (hregClass(reg)) {
94      case HRcInt64:
95         r = hregNumber(reg);
96         vassert(r >= 0 && r < 16);
97         vex_printf("%s", ireg32_names[r]);
98         return;
99      default:
100         vpanic("ppHRegAMD64_lo32: invalid regclass");
101   }
102}
103
104HReg hregAMD64_RAX ( void ) { return mkHReg( 0, HRcInt64, False); }
105HReg hregAMD64_RCX ( void ) { return mkHReg( 1, HRcInt64, False); }
106HReg hregAMD64_RDX ( void ) { return mkHReg( 2, HRcInt64, False); }
107HReg hregAMD64_RBX ( void ) { return mkHReg( 3, HRcInt64, False); }
108HReg hregAMD64_RSP ( void ) { return mkHReg( 4, HRcInt64, False); }
109HReg hregAMD64_RBP ( void ) { return mkHReg( 5, HRcInt64, False); }
110HReg hregAMD64_RSI ( void ) { return mkHReg( 6, HRcInt64, False); }
111HReg hregAMD64_RDI ( void ) { return mkHReg( 7, HRcInt64, False); }
112HReg hregAMD64_R8  ( void ) { return mkHReg( 8, HRcInt64, False); }
113HReg hregAMD64_R9  ( void ) { return mkHReg( 9, HRcInt64, False); }
114HReg hregAMD64_R10 ( void ) { return mkHReg(10, HRcInt64, False); }
115HReg hregAMD64_R11 ( void ) { return mkHReg(11, HRcInt64, False); }
116HReg hregAMD64_R12 ( void ) { return mkHReg(12, HRcInt64, False); }
117HReg hregAMD64_R13 ( void ) { return mkHReg(13, HRcInt64, False); }
118HReg hregAMD64_R14 ( void ) { return mkHReg(14, HRcInt64, False); }
119HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); }
120
121HReg hregAMD64_XMM0  ( void ) { return mkHReg( 0, HRcVec128, False); }
122HReg hregAMD64_XMM1  ( void ) { return mkHReg( 1, HRcVec128, False); }
123HReg hregAMD64_XMM3  ( void ) { return mkHReg( 3, HRcVec128, False); }
124HReg hregAMD64_XMM4  ( void ) { return mkHReg( 4, HRcVec128, False); }
125HReg hregAMD64_XMM5  ( void ) { return mkHReg( 5, HRcVec128, False); }
126HReg hregAMD64_XMM6  ( void ) { return mkHReg( 6, HRcVec128, False); }
127HReg hregAMD64_XMM7  ( void ) { return mkHReg( 7, HRcVec128, False); }
128HReg hregAMD64_XMM8  ( void ) { return mkHReg( 8, HRcVec128, False); }
129HReg hregAMD64_XMM9  ( void ) { return mkHReg( 9, HRcVec128, False); }
130HReg hregAMD64_XMM10 ( void ) { return mkHReg(10, HRcVec128, False); }
131HReg hregAMD64_XMM11 ( void ) { return mkHReg(11, HRcVec128, False); }
132HReg hregAMD64_XMM12 ( void ) { return mkHReg(12, HRcVec128, False); }
133
134
135void getAllocableRegs_AMD64 ( Int* nregs, HReg** arr )
136{
137#if 0
138   *nregs = 6;
139   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
140   (*arr)[ 0] = hregAMD64_RSI();
141   (*arr)[ 1] = hregAMD64_RDI();
142   (*arr)[ 2] = hregAMD64_RBX();
143
144   (*arr)[ 3] = hregAMD64_XMM7();
145   (*arr)[ 4] = hregAMD64_XMM8();
146   (*arr)[ 5] = hregAMD64_XMM9();
147#endif
148#if 1
149   *nregs = 20;
150   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
151   (*arr)[ 0] = hregAMD64_RSI();
152   (*arr)[ 1] = hregAMD64_RDI();
153   (*arr)[ 2] = hregAMD64_R8();
154   (*arr)[ 3] = hregAMD64_R9();
155   (*arr)[ 4] = hregAMD64_R12();
156   (*arr)[ 5] = hregAMD64_R13();
157   (*arr)[ 6] = hregAMD64_R14();
158   (*arr)[ 7] = hregAMD64_R15();
159   (*arr)[ 8] = hregAMD64_RBX();
160
161   (*arr)[ 9] = hregAMD64_XMM3();
162   (*arr)[10] = hregAMD64_XMM4();
163   (*arr)[11] = hregAMD64_XMM5();
164   (*arr)[12] = hregAMD64_XMM6();
165   (*arr)[13] = hregAMD64_XMM7();
166   (*arr)[14] = hregAMD64_XMM8();
167   (*arr)[15] = hregAMD64_XMM9();
168   (*arr)[16] = hregAMD64_XMM10();
169   (*arr)[17] = hregAMD64_XMM11();
170   (*arr)[18] = hregAMD64_XMM12();
171   (*arr)[19] = hregAMD64_R10();
172#endif
173}
174
175
176/* --------- Condition codes, Intel encoding. --------- */
177
178const HChar* showAMD64CondCode ( AMD64CondCode cond )
179{
180   switch (cond) {
181      case Acc_O:      return "o";
182      case Acc_NO:     return "no";
183      case Acc_B:      return "b";
184      case Acc_NB:     return "nb";
185      case Acc_Z:      return "z";
186      case Acc_NZ:     return "nz";
187      case Acc_BE:     return "be";
188      case Acc_NBE:    return "nbe";
189      case Acc_S:      return "s";
190      case Acc_NS:     return "ns";
191      case Acc_P:      return "p";
192      case Acc_NP:     return "np";
193      case Acc_L:      return "l";
194      case Acc_NL:     return "nl";
195      case Acc_LE:     return "le";
196      case Acc_NLE:    return "nle";
197      case Acc_ALWAYS: return "ALWAYS";
198      default: vpanic("ppAMD64CondCode");
199   }
200}
201
202
203/* --------- AMD64AMode: memory address expressions. --------- */
204
205AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
206   AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
207   am->tag        = Aam_IR;
208   am->Aam.IR.imm = imm32;
209   am->Aam.IR.reg = reg;
210   return am;
211}
212AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
213   AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
214   am->tag = Aam_IRRS;
215   am->Aam.IRRS.imm   = imm32;
216   am->Aam.IRRS.base  = base;
217   am->Aam.IRRS.index = indEx;
218   am->Aam.IRRS.shift = shift;
219   vassert(shift >= 0 && shift <= 3);
220   return am;
221}
222
223void ppAMD64AMode ( AMD64AMode* am ) {
224   switch (am->tag) {
225      case Aam_IR:
226         if (am->Aam.IR.imm == 0)
227            vex_printf("(");
228         else
229            vex_printf("0x%x(", am->Aam.IR.imm);
230         ppHRegAMD64(am->Aam.IR.reg);
231         vex_printf(")");
232         return;
233      case Aam_IRRS:
234         vex_printf("0x%x(", am->Aam.IRRS.imm);
235         ppHRegAMD64(am->Aam.IRRS.base);
236         vex_printf(",");
237         ppHRegAMD64(am->Aam.IRRS.index);
238         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
239         return;
240      default:
241         vpanic("ppAMD64AMode");
242   }
243}
244
245static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
246   switch (am->tag) {
247      case Aam_IR:
248         addHRegUse(u, HRmRead, am->Aam.IR.reg);
249         return;
250      case Aam_IRRS:
251         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
252         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
253         return;
254      default:
255         vpanic("addRegUsage_AMD64AMode");
256   }
257}
258
259static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
260   switch (am->tag) {
261      case Aam_IR:
262         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
263         return;
264      case Aam_IRRS:
265         am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
266         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
267         return;
268      default:
269         vpanic("mapRegs_AMD64AMode");
270   }
271}
272
273/* --------- Operand, which can be reg, immediate or memory. --------- */
274
275AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
276   AMD64RMI* op       = LibVEX_Alloc(sizeof(AMD64RMI));
277   op->tag            = Armi_Imm;
278   op->Armi.Imm.imm32 = imm32;
279   return op;
280}
281AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
282   AMD64RMI* op     = LibVEX_Alloc(sizeof(AMD64RMI));
283   op->tag          = Armi_Reg;
284   op->Armi.Reg.reg = reg;
285   return op;
286}
287AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
288   AMD64RMI* op    = LibVEX_Alloc(sizeof(AMD64RMI));
289   op->tag         = Armi_Mem;
290   op->Armi.Mem.am = am;
291   return op;
292}
293
294static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
295   switch (op->tag) {
296      case Armi_Imm:
297         vex_printf("$0x%x", op->Armi.Imm.imm32);
298         return;
299      case Armi_Reg:
300         if (lo32)
301            ppHRegAMD64_lo32(op->Armi.Reg.reg);
302         else
303            ppHRegAMD64(op->Armi.Reg.reg);
304         return;
305      case Armi_Mem:
306         ppAMD64AMode(op->Armi.Mem.am);
307         return;
308     default:
309         vpanic("ppAMD64RMI");
310   }
311}
312void ppAMD64RMI ( AMD64RMI* op ) {
313   ppAMD64RMI_wrk(op, False/*!lo32*/);
314}
315void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
316   ppAMD64RMI_wrk(op, True/*lo32*/);
317}
318
319/* An AMD64RMI can only be used in a "read" context (what would it mean
320   to write or modify a literal?) and so we enumerate its registers
321   accordingly. */
322static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
323   switch (op->tag) {
324      case Armi_Imm:
325         return;
326      case Armi_Reg:
327         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
328         return;
329      case Armi_Mem:
330         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
331         return;
332      default:
333         vpanic("addRegUsage_AMD64RMI");
334   }
335}
336
337static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
338   switch (op->tag) {
339      case Armi_Imm:
340         return;
341      case Armi_Reg:
342         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
343         return;
344      case Armi_Mem:
345         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
346         return;
347      default:
348         vpanic("mapRegs_AMD64RMI");
349   }
350}
351
352
353/* --------- Operand, which can be reg or immediate only. --------- */
354
355AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
356   AMD64RI* op       = LibVEX_Alloc(sizeof(AMD64RI));
357   op->tag           = Ari_Imm;
358   op->Ari.Imm.imm32 = imm32;
359   return op;
360}
361AMD64RI* AMD64RI_Reg ( HReg reg ) {
362   AMD64RI* op     = LibVEX_Alloc(sizeof(AMD64RI));
363   op->tag         = Ari_Reg;
364   op->Ari.Reg.reg = reg;
365   return op;
366}
367
368void ppAMD64RI ( AMD64RI* op ) {
369   switch (op->tag) {
370      case Ari_Imm:
371         vex_printf("$0x%x", op->Ari.Imm.imm32);
372         return;
373      case Ari_Reg:
374         ppHRegAMD64(op->Ari.Reg.reg);
375         return;
376     default:
377         vpanic("ppAMD64RI");
378   }
379}
380
381/* An AMD64RI can only be used in a "read" context (what would it mean
382   to write or modify a literal?) and so we enumerate its registers
383   accordingly. */
384static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
385   switch (op->tag) {
386      case Ari_Imm:
387         return;
388      case Ari_Reg:
389         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
390         return;
391      default:
392         vpanic("addRegUsage_AMD64RI");
393   }
394}
395
396static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
397   switch (op->tag) {
398      case Ari_Imm:
399         return;
400      case Ari_Reg:
401         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
402         return;
403      default:
404         vpanic("mapRegs_AMD64RI");
405   }
406}
407
408
409/* --------- Operand, which can be reg or memory only. --------- */
410
411AMD64RM* AMD64RM_Reg ( HReg reg ) {
412   AMD64RM* op       = LibVEX_Alloc(sizeof(AMD64RM));
413   op->tag         = Arm_Reg;
414   op->Arm.Reg.reg = reg;
415   return op;
416}
417AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
418   AMD64RM* op    = LibVEX_Alloc(sizeof(AMD64RM));
419   op->tag        = Arm_Mem;
420   op->Arm.Mem.am = am;
421   return op;
422}
423
424void ppAMD64RM ( AMD64RM* op ) {
425   switch (op->tag) {
426      case Arm_Mem:
427         ppAMD64AMode(op->Arm.Mem.am);
428         return;
429      case Arm_Reg:
430         ppHRegAMD64(op->Arm.Reg.reg);
431         return;
432     default:
433         vpanic("ppAMD64RM");
434   }
435}
436
437/* Because an AMD64RM can be both a source or destination operand, we
438   have to supply a mode -- pertaining to the operand as a whole --
439   indicating how it's being used. */
440static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
441   switch (op->tag) {
442      case Arm_Mem:
443         /* Memory is read, written or modified.  So we just want to
444            know the regs read by the amode. */
445         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
446         return;
447      case Arm_Reg:
448         /* reg is read, written or modified.  Add it in the
449            appropriate way. */
450         addHRegUse(u, mode, op->Arm.Reg.reg);
451         return;
452     default:
453         vpanic("addRegUsage_AMD64RM");
454   }
455}
456
457static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
458{
459   switch (op->tag) {
460      case Arm_Mem:
461         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
462         return;
463      case Arm_Reg:
464         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
465         return;
466     default:
467         vpanic("mapRegs_AMD64RM");
468   }
469}
470
471
472/* --------- Instructions. --------- */
473
474static const HChar* showAMD64ScalarSz ( Int sz ) {
475   switch (sz) {
476      case 2: return "w";
477      case 4: return "l";
478      case 8: return "q";
479      default: vpanic("showAMD64ScalarSz");
480   }
481}
482
483const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
484   switch (op) {
485      case Aun_NOT: return "not";
486      case Aun_NEG: return "neg";
487      default: vpanic("showAMD64UnaryOp");
488   }
489}
490
491const HChar* showAMD64AluOp ( AMD64AluOp op ) {
492   switch (op) {
493      case Aalu_MOV:  return "mov";
494      case Aalu_CMP:  return "cmp";
495      case Aalu_ADD:  return "add";
496      case Aalu_SUB:  return "sub";
497      case Aalu_ADC:  return "adc";
498      case Aalu_SBB:  return "sbb";
499      case Aalu_AND:  return "and";
500      case Aalu_OR:   return "or";
501      case Aalu_XOR:  return "xor";
502      case Aalu_MUL:  return "imul";
503      default: vpanic("showAMD64AluOp");
504   }
505}
506
507const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
508   switch (op) {
509      case Ash_SHL: return "shl";
510      case Ash_SHR: return "shr";
511      case Ash_SAR: return "sar";
512      default: vpanic("showAMD64ShiftOp");
513   }
514}
515
516const HChar* showA87FpOp ( A87FpOp op ) {
517   switch (op) {
518      case Afp_SCALE:  return "scale";
519      case Afp_ATAN:   return "atan";
520      case Afp_YL2X:   return "yl2x";
521      case Afp_YL2XP1: return "yl2xp1";
522      case Afp_PREM:   return "prem";
523      case Afp_PREM1:  return "prem1";
524      case Afp_SQRT:   return "sqrt";
525      case Afp_SIN:    return "sin";
526      case Afp_COS:    return "cos";
527      case Afp_TAN:    return "tan";
528      case Afp_ROUND:  return "round";
529      case Afp_2XM1:   return "2xm1";
530      default: vpanic("showA87FpOp");
531   }
532}
533
534const HChar* showAMD64SseOp ( AMD64SseOp op ) {
535   switch (op) {
536      case Asse_MOV:      return "movups";
537      case Asse_ADDF:     return "add";
538      case Asse_SUBF:     return "sub";
539      case Asse_MULF:     return "mul";
540      case Asse_DIVF:     return "div";
541      case Asse_MAXF:     return "max";
542      case Asse_MINF:     return "min";
543      case Asse_CMPEQF:   return "cmpFeq";
544      case Asse_CMPLTF:   return "cmpFlt";
545      case Asse_CMPLEF:   return "cmpFle";
546      case Asse_CMPUNF:   return "cmpFun";
547      case Asse_RCPF:     return "rcp";
548      case Asse_RSQRTF:   return "rsqrt";
549      case Asse_SQRTF:    return "sqrt";
550      case Asse_AND:      return "and";
551      case Asse_OR:       return "or";
552      case Asse_XOR:      return "xor";
553      case Asse_ANDN:     return "andn";
554      case Asse_ADD8:     return "paddb";
555      case Asse_ADD16:    return "paddw";
556      case Asse_ADD32:    return "paddd";
557      case Asse_ADD64:    return "paddq";
558      case Asse_QADD8U:   return "paddusb";
559      case Asse_QADD16U:  return "paddusw";
560      case Asse_QADD8S:   return "paddsb";
561      case Asse_QADD16S:  return "paddsw";
562      case Asse_SUB8:     return "psubb";
563      case Asse_SUB16:    return "psubw";
564      case Asse_SUB32:    return "psubd";
565      case Asse_SUB64:    return "psubq";
566      case Asse_QSUB8U:   return "psubusb";
567      case Asse_QSUB16U:  return "psubusw";
568      case Asse_QSUB8S:   return "psubsb";
569      case Asse_QSUB16S:  return "psubsw";
570      case Asse_MUL16:    return "pmullw";
571      case Asse_MULHI16U: return "pmulhuw";
572      case Asse_MULHI16S: return "pmulhw";
573      case Asse_AVG8U:    return "pavgb";
574      case Asse_AVG16U:   return "pavgw";
575      case Asse_MAX16S:   return "pmaxw";
576      case Asse_MAX8U:    return "pmaxub";
577      case Asse_MIN16S:   return "pminw";
578      case Asse_MIN8U:    return "pminub";
579      case Asse_CMPEQ8:   return "pcmpeqb";
580      case Asse_CMPEQ16:  return "pcmpeqw";
581      case Asse_CMPEQ32:  return "pcmpeqd";
582      case Asse_CMPGT8S:  return "pcmpgtb";
583      case Asse_CMPGT16S: return "pcmpgtw";
584      case Asse_CMPGT32S: return "pcmpgtd";
585      case Asse_SHL16:    return "psllw";
586      case Asse_SHL32:    return "pslld";
587      case Asse_SHL64:    return "psllq";
588      case Asse_SHR16:    return "psrlw";
589      case Asse_SHR32:    return "psrld";
590      case Asse_SHR64:    return "psrlq";
591      case Asse_SAR16:    return "psraw";
592      case Asse_SAR32:    return "psrad";
593      case Asse_PACKSSD:  return "packssdw";
594      case Asse_PACKSSW:  return "packsswb";
595      case Asse_PACKUSW:  return "packuswb";
596      case Asse_UNPCKHB:  return "punpckhb";
597      case Asse_UNPCKHW:  return "punpckhw";
598      case Asse_UNPCKHD:  return "punpckhd";
599      case Asse_UNPCKHQ:  return "punpckhq";
600      case Asse_UNPCKLB:  return "punpcklb";
601      case Asse_UNPCKLW:  return "punpcklw";
602      case Asse_UNPCKLD:  return "punpckld";
603      case Asse_UNPCKLQ:  return "punpcklq";
604      default: vpanic("showAMD64SseOp");
605   }
606}
607
608AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
609   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
610   i->tag             = Ain_Imm64;
611   i->Ain.Imm64.imm64 = imm64;
612   i->Ain.Imm64.dst   = dst;
613   return i;
614}
615AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
616   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
617   i->tag            = Ain_Alu64R;
618   i->Ain.Alu64R.op  = op;
619   i->Ain.Alu64R.src = src;
620   i->Ain.Alu64R.dst = dst;
621   return i;
622}
623AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
624   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
625   i->tag            = Ain_Alu64M;
626   i->Ain.Alu64M.op  = op;
627   i->Ain.Alu64M.src = src;
628   i->Ain.Alu64M.dst = dst;
629   vassert(op != Aalu_MUL);
630   return i;
631}
632AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
633   AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
634   i->tag          = Ain_Sh64;
635   i->Ain.Sh64.op  = op;
636   i->Ain.Sh64.src = src;
637   i->Ain.Sh64.dst = dst;
638   return i;
639}
640AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
641   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
642   i->tag              = Ain_Test64;
643   i->Ain.Test64.imm32 = imm32;
644   i->Ain.Test64.dst   = dst;
645   return i;
646}
647AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
648   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
649   i->tag             = Ain_Unary64;
650   i->Ain.Unary64.op  = op;
651   i->Ain.Unary64.dst = dst;
652   return i;
653}
654AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
655   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
656   i->tag             = Ain_Lea64;
657   i->Ain.Lea64.am    = am;
658   i->Ain.Lea64.dst   = dst;
659   return i;
660}
661AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
662   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
663   i->tag            = Ain_Alu32R;
664   i->Ain.Alu32R.op  = op;
665   i->Ain.Alu32R.src = src;
666   i->Ain.Alu32R.dst = dst;
667   switch (op) {
668      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
669      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
670      default: vassert(0);
671   }
672   return i;
673}
674AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
675   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
676   i->tag            = Ain_MulL;
677   i->Ain.MulL.syned = syned;
678   i->Ain.MulL.src   = src;
679   return i;
680}
681AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
682   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
683   i->tag            = Ain_Div;
684   i->Ain.Div.syned  = syned;
685   i->Ain.Div.sz     = sz;
686   i->Ain.Div.src    = src;
687   vassert(sz == 4 || sz == 8);
688   return i;
689}
690AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
691   AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
692   i->tag          = Ain_Push;
693   i->Ain.Push.src = src;
694   return i;
695}
696AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
697                              RetLoc rloc ) {
698   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
699   i->tag               = Ain_Call;
700   i->Ain.Call.cond     = cond;
701   i->Ain.Call.target   = target;
702   i->Ain.Call.regparms = regparms;
703   i->Ain.Call.rloc     = rloc;
704   vassert(regparms >= 0 && regparms <= 6);
705   vassert(is_sane_RetLoc(rloc));
706   return i;
707}
708
709AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
710                                 AMD64CondCode cond, Bool toFastEP ) {
711   AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
712   i->tag                  = Ain_XDirect;
713   i->Ain.XDirect.dstGA    = dstGA;
714   i->Ain.XDirect.amRIP    = amRIP;
715   i->Ain.XDirect.cond     = cond;
716   i->Ain.XDirect.toFastEP = toFastEP;
717   return i;
718}
719AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
720                                AMD64CondCode cond ) {
721   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
722   i->tag              = Ain_XIndir;
723   i->Ain.XIndir.dstGA = dstGA;
724   i->Ain.XIndir.amRIP = amRIP;
725   i->Ain.XIndir.cond  = cond;
726   return i;
727}
728AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
729                                   AMD64CondCode cond, IRJumpKind jk ) {
730   AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
731   i->tag                 = Ain_XAssisted;
732   i->Ain.XAssisted.dstGA = dstGA;
733   i->Ain.XAssisted.amRIP = amRIP;
734   i->Ain.XAssisted.cond  = cond;
735   i->Ain.XAssisted.jk    = jk;
736   return i;
737}
738
739AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, AMD64RM* src, HReg dst ) {
740   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
741   i->tag             = Ain_CMov64;
742   i->Ain.CMov64.cond = cond;
743   i->Ain.CMov64.src  = src;
744   i->Ain.CMov64.dst  = dst;
745   vassert(cond != Acc_ALWAYS);
746   return i;
747}
748AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
749   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
750   i->tag              = Ain_MovxLQ;
751   i->Ain.MovxLQ.syned = syned;
752   i->Ain.MovxLQ.src   = src;
753   i->Ain.MovxLQ.dst   = dst;
754   return i;
755}
756AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
757                                AMD64AMode* src, HReg dst ) {
758   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
759   i->tag                = Ain_LoadEX;
760   i->Ain.LoadEX.szSmall = szSmall;
761   i->Ain.LoadEX.syned   = syned;
762   i->Ain.LoadEX.src     = src;
763   i->Ain.LoadEX.dst     = dst;
764   vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
765   return i;
766}
767AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
768   AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
769   i->tag           = Ain_Store;
770   i->Ain.Store.sz  = sz;
771   i->Ain.Store.src = src;
772   i->Ain.Store.dst = dst;
773   vassert(sz == 1 || sz == 2 || sz == 4);
774   return i;
775}
776AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
777   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
778   i->tag            = Ain_Set64;
779   i->Ain.Set64.cond = cond;
780   i->Ain.Set64.dst  = dst;
781   return i;
782}
783AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
784   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
785   i->tag               = Ain_Bsfr64;
786   i->Ain.Bsfr64.isFwds = isFwds;
787   i->Ain.Bsfr64.src    = src;
788   i->Ain.Bsfr64.dst    = dst;
789   return i;
790}
791AMD64Instr* AMD64Instr_MFence ( void ) {
792   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
793   i->tag        = Ain_MFence;
794   return i;
795}
796AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
797   AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
798   i->tag           = Ain_ACAS;
799   i->Ain.ACAS.addr = addr;
800   i->Ain.ACAS.sz   = sz;
801   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
802   return i;
803}
804AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
805   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
806   i->tag            = Ain_DACAS;
807   i->Ain.DACAS.addr = addr;
808   i->Ain.DACAS.sz   = sz;
809   vassert(sz == 8 || sz == 4);
810   return i;
811}
812
813AMD64Instr* AMD64Instr_A87Free ( Int nregs )
814{
815   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
816   i->tag               = Ain_A87Free;
817   i->Ain.A87Free.nregs = nregs;
818   vassert(nregs >= 1 && nregs <= 7);
819   return i;
820}
821AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
822{
823   AMD64Instr* i            = LibVEX_Alloc(sizeof(AMD64Instr));
824   i->tag                   = Ain_A87PushPop;
825   i->Ain.A87PushPop.addr   = addr;
826   i->Ain.A87PushPop.isPush = isPush;
827   i->Ain.A87PushPop.szB    = szB;
828   vassert(szB == 8 || szB == 4);
829   return i;
830}
831AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
832{
833   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
834   i->tag            = Ain_A87FpOp;
835   i->Ain.A87FpOp.op = op;
836   return i;
837}
838AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
839{
840   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
841   i->tag              = Ain_A87LdCW;
842   i->Ain.A87LdCW.addr = addr;
843   return i;
844}
845AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
846{
847   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
848   i->tag              = Ain_A87StSW;
849   i->Ain.A87StSW.addr = addr;
850   return i;
851}
852AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
853   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
854   i->tag                = Ain_LdMXCSR;
855   i->Ain.LdMXCSR.addr   = addr;
856   return i;
857}
858AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
859   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
860   i->tag                = Ain_SseUComIS;
861   i->Ain.SseUComIS.sz   = toUChar(sz);
862   i->Ain.SseUComIS.srcL = srcL;
863   i->Ain.SseUComIS.srcR = srcR;
864   i->Ain.SseUComIS.dst  = dst;
865   vassert(sz == 4 || sz == 8);
866   return i;
867}
868AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
869   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
870   i->tag              = Ain_SseSI2SF;
871   i->Ain.SseSI2SF.szS = toUChar(szS);
872   i->Ain.SseSI2SF.szD = toUChar(szD);
873   i->Ain.SseSI2SF.src = src;
874   i->Ain.SseSI2SF.dst = dst;
875   vassert(szS == 4 || szS == 8);
876   vassert(szD == 4 || szD == 8);
877   return i;
878}
879AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
880   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
881   i->tag              = Ain_SseSF2SI;
882   i->Ain.SseSF2SI.szS = toUChar(szS);
883   i->Ain.SseSF2SI.szD = toUChar(szD);
884   i->Ain.SseSF2SI.src = src;
885   i->Ain.SseSF2SI.dst = dst;
886   vassert(szS == 4 || szS == 8);
887   vassert(szD == 4 || szD == 8);
888   return i;
889}
890AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
891{
892   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
893   i->tag                = Ain_SseSDSS;
894   i->Ain.SseSDSS.from64 = from64;
895   i->Ain.SseSDSS.src    = src;
896   i->Ain.SseSDSS.dst    = dst;
897   return i;
898}
899AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
900                                 HReg reg, AMD64AMode* addr ) {
901   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
902   i->tag                = Ain_SseLdSt;
903   i->Ain.SseLdSt.isLoad = isLoad;
904   i->Ain.SseLdSt.sz     = toUChar(sz);
905   i->Ain.SseLdSt.reg    = reg;
906   i->Ain.SseLdSt.addr   = addr;
907   vassert(sz == 4 || sz == 8 || sz == 16);
908   return i;
909}
910AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
911{
912   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
913   i->tag                = Ain_SseLdzLO;
914   i->Ain.SseLdzLO.sz    = sz;
915   i->Ain.SseLdzLO.reg   = reg;
916   i->Ain.SseLdzLO.addr  = addr;
917   vassert(sz == 4 || sz == 8);
918   return i;
919}
920AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
921   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
922   i->tag              = Ain_Sse32Fx4;
923   i->Ain.Sse32Fx4.op  = op;
924   i->Ain.Sse32Fx4.src = src;
925   i->Ain.Sse32Fx4.dst = dst;
926   vassert(op != Asse_MOV);
927   return i;
928}
929AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
930   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
931   i->tag              = Ain_Sse32FLo;
932   i->Ain.Sse32FLo.op  = op;
933   i->Ain.Sse32FLo.src = src;
934   i->Ain.Sse32FLo.dst = dst;
935   vassert(op != Asse_MOV);
936   return i;
937}
938AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
939   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
940   i->tag              = Ain_Sse64Fx2;
941   i->Ain.Sse64Fx2.op  = op;
942   i->Ain.Sse64Fx2.src = src;
943   i->Ain.Sse64Fx2.dst = dst;
944   vassert(op != Asse_MOV);
945   return i;
946}
947AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
948   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
949   i->tag              = Ain_Sse64FLo;
950   i->Ain.Sse64FLo.op  = op;
951   i->Ain.Sse64FLo.src = src;
952   i->Ain.Sse64FLo.dst = dst;
953   vassert(op != Asse_MOV);
954   return i;
955}
956AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
957   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
958   i->tag             = Ain_SseReRg;
959   i->Ain.SseReRg.op  = op;
960   i->Ain.SseReRg.src = re;
961   i->Ain.SseReRg.dst = rg;
962   return i;
963}
964AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
965   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
966   i->tag              = Ain_SseCMov;
967   i->Ain.SseCMov.cond = cond;
968   i->Ain.SseCMov.src  = src;
969   i->Ain.SseCMov.dst  = dst;
970   vassert(cond != Acc_ALWAYS);
971   return i;
972}
973AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
974   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
975   i->tag               = Ain_SseShuf;
976   i->Ain.SseShuf.order = order;
977   i->Ain.SseShuf.src   = src;
978   i->Ain.SseShuf.dst   = dst;
979   vassert(order >= 0 && order <= 0xFF);
980   return i;
981}
982//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
983//uu                                  HReg reg, AMD64AMode* addr ) {
984//uu    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
985//uu    i->tag                = Ain_AvxLdSt;
986//uu    i->Ain.AvxLdSt.isLoad = isLoad;
987//uu    i->Ain.AvxLdSt.reg    = reg;
988//uu    i->Ain.AvxLdSt.addr   = addr;
989//uu    return i;
990//uu }
991//uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
992//uu    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
993//uu    i->tag             = Ain_AvxReRg;
994//uu    i->Ain.AvxReRg.op  = op;
995//uu    i->Ain.AvxReRg.src = re;
996//uu    i->Ain.AvxReRg.dst = rg;
997//uu    return i;
998//uu }
999AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
1000                                 AMD64AMode* amFailAddr ) {
1001   AMD64Instr* i             = LibVEX_Alloc(sizeof(AMD64Instr));
1002   i->tag                    = Ain_EvCheck;
1003   i->Ain.EvCheck.amCounter  = amCounter;
1004   i->Ain.EvCheck.amFailAddr = amFailAddr;
1005   return i;
1006}
1007AMD64Instr* AMD64Instr_ProfInc ( void ) {
1008   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
1009   i->tag        = Ain_ProfInc;
1010   return i;
1011}
1012
1013void ppAMD64Instr ( AMD64Instr* i, Bool mode64 )
1014{
1015   vassert(mode64 == True);
1016   switch (i->tag) {
1017      case Ain_Imm64:
1018         vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
1019         ppHRegAMD64(i->Ain.Imm64.dst);
1020         return;
1021      case Ain_Alu64R:
1022         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
1023         ppAMD64RMI(i->Ain.Alu64R.src);
1024         vex_printf(",");
1025         ppHRegAMD64(i->Ain.Alu64R.dst);
1026         return;
1027      case Ain_Alu64M:
1028         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
1029         ppAMD64RI(i->Ain.Alu64M.src);
1030         vex_printf(",");
1031         ppAMD64AMode(i->Ain.Alu64M.dst);
1032         return;
1033      case Ain_Sh64:
1034         vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
1035         if (i->Ain.Sh64.src == 0)
1036            vex_printf("%%cl,");
1037         else
1038            vex_printf("$%d,", (Int)i->Ain.Sh64.src);
1039         ppHRegAMD64(i->Ain.Sh64.dst);
1040         return;
1041      case Ain_Test64:
1042         vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
1043         ppHRegAMD64(i->Ain.Test64.dst);
1044         return;
1045      case Ain_Unary64:
1046         vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
1047         ppHRegAMD64(i->Ain.Unary64.dst);
1048         return;
1049      case Ain_Lea64:
1050         vex_printf("leaq ");
1051         ppAMD64AMode(i->Ain.Lea64.am);
1052         vex_printf(",");
1053         ppHRegAMD64(i->Ain.Lea64.dst);
1054         return;
1055      case Ain_Alu32R:
1056         vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
1057         ppAMD64RMI_lo32(i->Ain.Alu32R.src);
1058         vex_printf(",");
1059         ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
1060         return;
1061      case Ain_MulL:
1062         vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
1063         ppAMD64RM(i->Ain.MulL.src);
1064         return;
1065      case Ain_Div:
1066         vex_printf("%cdiv%s ",
1067                    i->Ain.Div.syned ? 's' : 'u',
1068                    showAMD64ScalarSz(i->Ain.Div.sz));
1069         ppAMD64RM(i->Ain.Div.src);
1070         return;
1071      case Ain_Push:
1072         vex_printf("pushq ");
1073         ppAMD64RMI(i->Ain.Push.src);
1074         return;
1075      case Ain_Call:
1076         vex_printf("call%s[%d,",
1077                    i->Ain.Call.cond==Acc_ALWAYS
1078                       ? "" : showAMD64CondCode(i->Ain.Call.cond),
1079                    i->Ain.Call.regparms );
1080         ppRetLoc(i->Ain.Call.rloc);
1081         vex_printf("] 0x%llx", i->Ain.Call.target);
1082         break;
1083
1084      case Ain_XDirect:
1085         vex_printf("(xDirect) ");
1086         vex_printf("if (%%rflags.%s) { ",
1087                    showAMD64CondCode(i->Ain.XDirect.cond));
1088         vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
1089         vex_printf("movq %%r11,");
1090         ppAMD64AMode(i->Ain.XDirect.amRIP);
1091         vex_printf("; ");
1092         vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
1093                    i->Ain.XDirect.toFastEP ? "fast" : "slow");
1094         return;
1095      case Ain_XIndir:
1096         vex_printf("(xIndir) ");
1097         vex_printf("if (%%rflags.%s) { ",
1098                    showAMD64CondCode(i->Ain.XIndir.cond));
1099         vex_printf("movq ");
1100         ppHRegAMD64(i->Ain.XIndir.dstGA);
1101         vex_printf(",");
1102         ppAMD64AMode(i->Ain.XIndir.amRIP);
1103         vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
1104         return;
1105      case Ain_XAssisted:
1106         vex_printf("(xAssisted) ");
1107         vex_printf("if (%%rflags.%s) { ",
1108                    showAMD64CondCode(i->Ain.XAssisted.cond));
1109         vex_printf("movq ");
1110         ppHRegAMD64(i->Ain.XAssisted.dstGA);
1111         vex_printf(",");
1112         ppAMD64AMode(i->Ain.XAssisted.amRIP);
1113         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
1114                    (Int)i->Ain.XAssisted.jk);
1115         vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
1116         return;
1117
1118      case Ain_CMov64:
1119         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
1120         ppAMD64RM(i->Ain.CMov64.src);
1121         vex_printf(",");
1122         ppHRegAMD64(i->Ain.CMov64.dst);
1123         return;
1124      case Ain_MovxLQ:
1125         vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
1126         ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
1127         vex_printf(",");
1128         ppHRegAMD64(i->Ain.MovxLQ.dst);
1129         return;
1130      case Ain_LoadEX:
1131         if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
1132            vex_printf("movl ");
1133            ppAMD64AMode(i->Ain.LoadEX.src);
1134            vex_printf(",");
1135            ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
1136         } else {
1137            vex_printf("mov%c%cq ",
1138                       i->Ain.LoadEX.syned ? 's' : 'z',
1139                       i->Ain.LoadEX.szSmall==1
1140                          ? 'b'
1141                          : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
1142            ppAMD64AMode(i->Ain.LoadEX.src);
1143            vex_printf(",");
1144            ppHRegAMD64(i->Ain.LoadEX.dst);
1145         }
1146         return;
1147      case Ain_Store:
1148         vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
1149                              : (i->Ain.Store.sz==2 ? 'w' : 'l'));
1150         ppHRegAMD64(i->Ain.Store.src);
1151         vex_printf(",");
1152         ppAMD64AMode(i->Ain.Store.dst);
1153         return;
1154      case Ain_Set64:
1155         vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
1156         ppHRegAMD64(i->Ain.Set64.dst);
1157         return;
1158      case Ain_Bsfr64:
1159         vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
1160         ppHRegAMD64(i->Ain.Bsfr64.src);
1161         vex_printf(",");
1162         ppHRegAMD64(i->Ain.Bsfr64.dst);
1163         return;
1164      case Ain_MFence:
1165         vex_printf("mfence" );
1166         return;
1167      case Ain_ACAS:
1168         vex_printf("lock cmpxchg%c ",
1169                     i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
1170                     : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
1171         vex_printf("{%%rax->%%rbx},");
1172         ppAMD64AMode(i->Ain.ACAS.addr);
1173         return;
1174      case Ain_DACAS:
1175         vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
1176                    (Int)(2 * i->Ain.DACAS.sz));
1177         ppAMD64AMode(i->Ain.DACAS.addr);
1178         return;
1179      case Ain_A87Free:
1180         vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
1181         break;
1182      case Ain_A87PushPop:
1183         vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
1184                    i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
1185         ppAMD64AMode(i->Ain.A87PushPop.addr);
1186         break;
1187      case Ain_A87FpOp:
1188         vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
1189         break;
1190      case Ain_A87LdCW:
1191         vex_printf("fldcw ");
1192         ppAMD64AMode(i->Ain.A87LdCW.addr);
1193         break;
1194      case Ain_A87StSW:
1195         vex_printf("fstsw ");
1196         ppAMD64AMode(i->Ain.A87StSW.addr);
1197         break;
1198      case Ain_LdMXCSR:
1199         vex_printf("ldmxcsr ");
1200         ppAMD64AMode(i->Ain.LdMXCSR.addr);
1201         break;
1202      case Ain_SseUComIS:
1203         vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
1204         ppHRegAMD64(i->Ain.SseUComIS.srcL);
1205         vex_printf(",");
1206         ppHRegAMD64(i->Ain.SseUComIS.srcR);
1207         vex_printf(" ; pushfq ; popq ");
1208         ppHRegAMD64(i->Ain.SseUComIS.dst);
1209         break;
1210      case Ain_SseSI2SF:
1211         vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
1212         (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1213            (i->Ain.SseSI2SF.src);
1214         vex_printf(",");
1215         ppHRegAMD64(i->Ain.SseSI2SF.dst);
1216         break;
1217      case Ain_SseSF2SI:
1218         vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
1219         ppHRegAMD64(i->Ain.SseSF2SI.src);
1220         vex_printf(",");
1221         (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1222            (i->Ain.SseSF2SI.dst);
1223         break;
1224      case Ain_SseSDSS:
1225         vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
1226         ppHRegAMD64(i->Ain.SseSDSS.src);
1227         vex_printf(",");
1228         ppHRegAMD64(i->Ain.SseSDSS.dst);
1229         break;
1230      case Ain_SseLdSt:
1231         switch (i->Ain.SseLdSt.sz) {
1232            case 4:  vex_printf("movss "); break;
1233            case 8:  vex_printf("movsd "); break;
1234            case 16: vex_printf("movups "); break;
1235            default: vassert(0);
1236         }
1237         if (i->Ain.SseLdSt.isLoad) {
1238            ppAMD64AMode(i->Ain.SseLdSt.addr);
1239            vex_printf(",");
1240            ppHRegAMD64(i->Ain.SseLdSt.reg);
1241         } else {
1242            ppHRegAMD64(i->Ain.SseLdSt.reg);
1243            vex_printf(",");
1244            ppAMD64AMode(i->Ain.SseLdSt.addr);
1245         }
1246         return;
1247      case Ain_SseLdzLO:
1248         vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
1249         ppAMD64AMode(i->Ain.SseLdzLO.addr);
1250         vex_printf(",");
1251         ppHRegAMD64(i->Ain.SseLdzLO.reg);
1252         return;
1253      case Ain_Sse32Fx4:
1254         vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
1255         ppHRegAMD64(i->Ain.Sse32Fx4.src);
1256         vex_printf(",");
1257         ppHRegAMD64(i->Ain.Sse32Fx4.dst);
1258         return;
1259      case Ain_Sse32FLo:
1260         vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
1261         ppHRegAMD64(i->Ain.Sse32FLo.src);
1262         vex_printf(",");
1263         ppHRegAMD64(i->Ain.Sse32FLo.dst);
1264         return;
1265      case Ain_Sse64Fx2:
1266         vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
1267         ppHRegAMD64(i->Ain.Sse64Fx2.src);
1268         vex_printf(",");
1269         ppHRegAMD64(i->Ain.Sse64Fx2.dst);
1270         return;
1271      case Ain_Sse64FLo:
1272         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
1273         ppHRegAMD64(i->Ain.Sse64FLo.src);
1274         vex_printf(",");
1275         ppHRegAMD64(i->Ain.Sse64FLo.dst);
1276         return;
1277      case Ain_SseReRg:
1278         vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1279         ppHRegAMD64(i->Ain.SseReRg.src);
1280         vex_printf(",");
1281         ppHRegAMD64(i->Ain.SseReRg.dst);
1282         return;
1283      case Ain_SseCMov:
1284         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
1285         ppHRegAMD64(i->Ain.SseCMov.src);
1286         vex_printf(",");
1287         ppHRegAMD64(i->Ain.SseCMov.dst);
1288         return;
1289      case Ain_SseShuf:
1290         vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
1291         ppHRegAMD64(i->Ain.SseShuf.src);
1292         vex_printf(",");
1293         ppHRegAMD64(i->Ain.SseShuf.dst);
1294         return;
1295      //uu case Ain_AvxLdSt:
1296      //uu    vex_printf("vmovups ");
1297      //uu    if (i->Ain.AvxLdSt.isLoad) {
1298      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1299      //uu       vex_printf(",");
1300      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1301      //uu    } else {
1302      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1303      //uu       vex_printf(",");
1304      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1305      //uu    }
1306      //uu    return;
1307      //uu case Ain_AvxReRg:
1308      //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1309      //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
1310      //uu    vex_printf(",");
1311      //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
1312      //uu    return;
1313      case Ain_EvCheck:
1314         vex_printf("(evCheck) decl ");
1315         ppAMD64AMode(i->Ain.EvCheck.amCounter);
1316         vex_printf("; jns nofail; jmp *");
1317         ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
1318         vex_printf("; nofail:");
1319         return;
1320      case Ain_ProfInc:
1321         vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
1322         return;
1323      default:
1324         vpanic("ppAMD64Instr");
1325   }
1326}
1327
1328/* --------- Helpers for register allocation. --------- */
1329
1330void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 )
1331{
1332   Bool unary;
1333   vassert(mode64 == True);
1334   initHRegUsage(u);
1335   switch (i->tag) {
1336      case Ain_Imm64:
1337         addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
1338         return;
1339      case Ain_Alu64R:
1340         addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
1341         if (i->Ain.Alu64R.op == Aalu_MOV) {
1342            addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
1343            return;
1344         }
1345         if (i->Ain.Alu64R.op == Aalu_CMP) {
1346            addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
1347            return;
1348         }
1349         addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
1350         return;
1351      case Ain_Alu64M:
1352         addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
1353         addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
1354         return;
1355      case Ain_Sh64:
1356         addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
1357         if (i->Ain.Sh64.src == 0)
1358            addHRegUse(u, HRmRead, hregAMD64_RCX());
1359         return;
1360      case Ain_Test64:
1361         addHRegUse(u, HRmRead, i->Ain.Test64.dst);
1362         return;
1363      case Ain_Unary64:
1364         addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
1365         return;
1366      case Ain_Lea64:
1367         addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
1368         addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
1369         return;
1370      case Ain_Alu32R:
1371         vassert(i->Ain.Alu32R.op != Aalu_MOV);
1372         addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
1373         if (i->Ain.Alu32R.op == Aalu_CMP) {
1374            addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
1375            return;
1376         }
1377         addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
1378         return;
1379      case Ain_MulL:
1380         addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
1381         addHRegUse(u, HRmModify, hregAMD64_RAX());
1382         addHRegUse(u, HRmWrite, hregAMD64_RDX());
1383         return;
1384      case Ain_Div:
1385         addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
1386         addHRegUse(u, HRmModify, hregAMD64_RAX());
1387         addHRegUse(u, HRmModify, hregAMD64_RDX());
1388         return;
1389      case Ain_Push:
1390         addRegUsage_AMD64RMI(u, i->Ain.Push.src);
1391         addHRegUse(u, HRmModify, hregAMD64_RSP());
1392         return;
1393      case Ain_Call:
1394         /* This is a bit subtle. */
1395         /* First off, claim it trashes all the caller-saved regs
1396            which fall within the register allocator's jurisdiction.
1397            These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
1398            and all the xmm registers.
1399         */
1400         addHRegUse(u, HRmWrite, hregAMD64_RAX());
1401         addHRegUse(u, HRmWrite, hregAMD64_RCX());
1402         addHRegUse(u, HRmWrite, hregAMD64_RDX());
1403         addHRegUse(u, HRmWrite, hregAMD64_RSI());
1404         addHRegUse(u, HRmWrite, hregAMD64_RDI());
1405         addHRegUse(u, HRmWrite, hregAMD64_R8());
1406         addHRegUse(u, HRmWrite, hregAMD64_R9());
1407         addHRegUse(u, HRmWrite, hregAMD64_R10());
1408         addHRegUse(u, HRmWrite, hregAMD64_R11());
1409         addHRegUse(u, HRmWrite, hregAMD64_XMM0());
1410         addHRegUse(u, HRmWrite, hregAMD64_XMM1());
1411         addHRegUse(u, HRmWrite, hregAMD64_XMM3());
1412         addHRegUse(u, HRmWrite, hregAMD64_XMM4());
1413         addHRegUse(u, HRmWrite, hregAMD64_XMM5());
1414         addHRegUse(u, HRmWrite, hregAMD64_XMM6());
1415         addHRegUse(u, HRmWrite, hregAMD64_XMM7());
1416         addHRegUse(u, HRmWrite, hregAMD64_XMM8());
1417         addHRegUse(u, HRmWrite, hregAMD64_XMM9());
1418         addHRegUse(u, HRmWrite, hregAMD64_XMM10());
1419         addHRegUse(u, HRmWrite, hregAMD64_XMM11());
1420         addHRegUse(u, HRmWrite, hregAMD64_XMM12());
1421
1422         /* Now we have to state any parameter-carrying registers
1423            which might be read.  This depends on the regparmness. */
1424         switch (i->Ain.Call.regparms) {
1425            case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
1426            case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
1427            case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
1428            case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
1429            case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
1430            case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
1431            case 0: break;
1432            default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
1433         }
1434         /* Finally, there is the issue that the insn trashes a
1435            register because the literal target address has to be
1436            loaded into a register.  Fortunately, r11 is stated in the
1437            ABI as a scratch register, and so seems a suitable victim.  */
1438         addHRegUse(u, HRmWrite, hregAMD64_R11());
1439         /* Upshot of this is that the assembler really must use r11,
1440            and no other, as a destination temporary. */
1441         return;
1442      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
1443         conditionally exit the block.  Hence we only need to list (1)
1444         the registers that they read, and (2) the registers that they
1445         write in the case where the block is not exited.  (2) is
1446         empty, hence only (1) is relevant here. */
1447      case Ain_XDirect:
1448         /* Don't bother to mention the write to %r11, since it is not
1449            available to the allocator. */
1450         addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
1451         return;
1452      case Ain_XIndir:
1453         /* Ditto re %r11 */
1454         addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
1455         addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
1456         return;
1457      case Ain_XAssisted:
1458         /* Ditto re %r11 and %rbp (the baseblock ptr) */
1459         addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
1460         addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
1461         return;
1462      case Ain_CMov64:
1463         addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
1464         addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
1465         return;
1466      case Ain_MovxLQ:
1467         addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
1468         addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
1469         return;
1470      case Ain_LoadEX:
1471         addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
1472         addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
1473         return;
1474      case Ain_Store:
1475         addHRegUse(u, HRmRead, i->Ain.Store.src);
1476         addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
1477         return;
1478      case Ain_Set64:
1479         addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
1480         return;
1481      case Ain_Bsfr64:
1482         addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
1483         addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
1484         return;
1485      case Ain_MFence:
1486         return;
1487      case Ain_ACAS:
1488         addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
1489         addHRegUse(u, HRmRead, hregAMD64_RBX());
1490         addHRegUse(u, HRmModify, hregAMD64_RAX());
1491         return;
1492      case Ain_DACAS:
1493         addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
1494         addHRegUse(u, HRmRead, hregAMD64_RCX());
1495         addHRegUse(u, HRmRead, hregAMD64_RBX());
1496         addHRegUse(u, HRmModify, hregAMD64_RDX());
1497         addHRegUse(u, HRmModify, hregAMD64_RAX());
1498         return;
1499      case Ain_A87Free:
1500         return;
1501      case Ain_A87PushPop:
1502         addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
1503         return;
1504      case Ain_A87FpOp:
1505         return;
1506      case Ain_A87LdCW:
1507         addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
1508         return;
1509      case Ain_A87StSW:
1510         addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
1511         return;
1512      case Ain_LdMXCSR:
1513         addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
1514         return;
1515      case Ain_SseUComIS:
1516         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
1517         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
1518         addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
1519         return;
1520      case Ain_SseSI2SF:
1521         addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
1522         addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
1523         return;
1524      case Ain_SseSF2SI:
1525         addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
1526         addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
1527         return;
1528      case Ain_SseSDSS:
1529         addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
1530         addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
1531         return;
1532      case Ain_SseLdSt:
1533         addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
1534         addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
1535                       i->Ain.SseLdSt.reg);
1536         return;
1537      case Ain_SseLdzLO:
1538         addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1539         addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1540         return;
1541      case Ain_Sse32Fx4:
1542         vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1543         unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1544                         || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1545                         || i->Ain.Sse32Fx4.op == Asse_SQRTF );
1546         addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1547         addHRegUse(u, unary ? HRmWrite : HRmModify,
1548                       i->Ain.Sse32Fx4.dst);
1549         return;
1550      case Ain_Sse32FLo:
1551         vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1552         unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1553                         || i->Ain.Sse32FLo.op == Asse_RSQRTF
1554                         || i->Ain.Sse32FLo.op == Asse_SQRTF );
1555         addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1556         addHRegUse(u, unary ? HRmWrite : HRmModify,
1557                       i->Ain.Sse32FLo.dst);
1558         return;
1559      case Ain_Sse64Fx2:
1560         vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1561         unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1562                         || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1563                         || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1564         addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1565         addHRegUse(u, unary ? HRmWrite : HRmModify,
1566                       i->Ain.Sse64Fx2.dst);
1567         return;
1568      case Ain_Sse64FLo:
1569         vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1570         unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1571                         || i->Ain.Sse64FLo.op == Asse_RSQRTF
1572                         || i->Ain.Sse64FLo.op == Asse_SQRTF );
1573         addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1574         addHRegUse(u, unary ? HRmWrite : HRmModify,
1575                       i->Ain.Sse64FLo.dst);
1576         return;
1577      case Ain_SseReRg:
1578         if ( (i->Ain.SseReRg.op == Asse_XOR
1579               || i->Ain.SseReRg.op == Asse_CMPEQ32)
1580              && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
1581            /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1582               r,r' as a write of a value to r, and independent of any
1583               previous value in r */
1584            /* (as opposed to a rite of passage :-) */
1585            addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1586         } else {
1587            addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1588            addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1589                             ? HRmWrite : HRmModify,
1590                          i->Ain.SseReRg.dst);
1591         }
1592         return;
1593      case Ain_SseCMov:
1594         addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
1595         addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1596         return;
1597      case Ain_SseShuf:
1598         addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
1599         addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1600         return;
1601      //uu case Ain_AvxLdSt:
1602      //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1603      //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1604      //uu               i->Ain.AvxLdSt.reg);
1605      //uu return;
1606      //uu case Ain_AvxReRg:
1607      //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
1608      //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1609      //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1610      //uu       /* See comments on the case for Ain_SseReRg. */
1611      //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1612      //uu    } else {
1613      //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1614      //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1615      //uu                        ? HRmWrite : HRmModify,
1616      //uu                     i->Ain.AvxReRg.dst);
1617      //uu    }
1618      //uu    return;
1619      case Ain_EvCheck:
1620         /* We expect both amodes only to mention %rbp, so this is in
1621            fact pointless, since %rbp isn't allocatable, but anyway.. */
1622         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1623         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1624         return;
1625      case Ain_ProfInc:
1626         addHRegUse(u, HRmWrite, hregAMD64_R11());
1627         return;
1628      default:
1629         ppAMD64Instr(i, mode64);
1630         vpanic("getRegUsage_AMD64Instr");
1631   }
1632}
1633
1634/* local helper */
1635static inline void mapReg(HRegRemap* m, HReg* r)
1636{
1637   *r = lookupHRegRemap(m, *r);
1638}
1639
1640void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1641{
1642   vassert(mode64 == True);
1643   switch (i->tag) {
1644      case Ain_Imm64:
1645         mapReg(m, &i->Ain.Imm64.dst);
1646         return;
1647      case Ain_Alu64R:
1648         mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1649         mapReg(m, &i->Ain.Alu64R.dst);
1650         return;
1651      case Ain_Alu64M:
1652         mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1653         mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1654         return;
1655      case Ain_Sh64:
1656         mapReg(m, &i->Ain.Sh64.dst);
1657         return;
1658      case Ain_Test64:
1659         mapReg(m, &i->Ain.Test64.dst);
1660         return;
1661      case Ain_Unary64:
1662         mapReg(m, &i->Ain.Unary64.dst);
1663         return;
1664      case Ain_Lea64:
1665         mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1666         mapReg(m, &i->Ain.Lea64.dst);
1667         return;
1668      case Ain_Alu32R:
1669         mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1670         mapReg(m, &i->Ain.Alu32R.dst);
1671         return;
1672      case Ain_MulL:
1673         mapRegs_AMD64RM(m, i->Ain.MulL.src);
1674         return;
1675      case Ain_Div:
1676         mapRegs_AMD64RM(m, i->Ain.Div.src);
1677         return;
1678      case Ain_Push:
1679         mapRegs_AMD64RMI(m, i->Ain.Push.src);
1680         return;
1681      case Ain_Call:
1682         return;
1683      case Ain_XDirect:
1684         mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1685         return;
1686      case Ain_XIndir:
1687         mapReg(m, &i->Ain.XIndir.dstGA);
1688         mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1689         return;
1690      case Ain_XAssisted:
1691         mapReg(m, &i->Ain.XAssisted.dstGA);
1692         mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1693         return;
1694      case Ain_CMov64:
1695         mapRegs_AMD64RM(m, i->Ain.CMov64.src);
1696         mapReg(m, &i->Ain.CMov64.dst);
1697         return;
1698      case Ain_MovxLQ:
1699         mapReg(m, &i->Ain.MovxLQ.src);
1700         mapReg(m, &i->Ain.MovxLQ.dst);
1701         return;
1702      case Ain_LoadEX:
1703         mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1704         mapReg(m, &i->Ain.LoadEX.dst);
1705         return;
1706      case Ain_Store:
1707         mapReg(m, &i->Ain.Store.src);
1708         mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1709         return;
1710      case Ain_Set64:
1711         mapReg(m, &i->Ain.Set64.dst);
1712         return;
1713      case Ain_Bsfr64:
1714         mapReg(m, &i->Ain.Bsfr64.src);
1715         mapReg(m, &i->Ain.Bsfr64.dst);
1716         return;
1717      case Ain_MFence:
1718         return;
1719      case Ain_ACAS:
1720         mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1721         return;
1722      case Ain_DACAS:
1723         mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1724         return;
1725      case Ain_A87Free:
1726         return;
1727      case Ain_A87PushPop:
1728         mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1729         return;
1730      case Ain_A87FpOp:
1731         return;
1732      case Ain_A87LdCW:
1733         mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1734         return;
1735      case Ain_A87StSW:
1736         mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1737         return;
1738      case Ain_LdMXCSR:
1739         mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1740         return;
1741      case Ain_SseUComIS:
1742         mapReg(m, &i->Ain.SseUComIS.srcL);
1743         mapReg(m, &i->Ain.SseUComIS.srcR);
1744         mapReg(m, &i->Ain.SseUComIS.dst);
1745         return;
1746      case Ain_SseSI2SF:
1747         mapReg(m, &i->Ain.SseSI2SF.src);
1748         mapReg(m, &i->Ain.SseSI2SF.dst);
1749         return;
1750      case Ain_SseSF2SI:
1751         mapReg(m, &i->Ain.SseSF2SI.src);
1752         mapReg(m, &i->Ain.SseSF2SI.dst);
1753         return;
1754      case Ain_SseSDSS:
1755         mapReg(m, &i->Ain.SseSDSS.src);
1756         mapReg(m, &i->Ain.SseSDSS.dst);
1757         return;
1758      case Ain_SseLdSt:
1759         mapReg(m, &i->Ain.SseLdSt.reg);
1760         mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1761         break;
1762      case Ain_SseLdzLO:
1763         mapReg(m, &i->Ain.SseLdzLO.reg);
1764         mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
1765         break;
1766      case Ain_Sse32Fx4:
1767         mapReg(m, &i->Ain.Sse32Fx4.src);
1768         mapReg(m, &i->Ain.Sse32Fx4.dst);
1769         return;
1770      case Ain_Sse32FLo:
1771         mapReg(m, &i->Ain.Sse32FLo.src);
1772         mapReg(m, &i->Ain.Sse32FLo.dst);
1773         return;
1774      case Ain_Sse64Fx2:
1775         mapReg(m, &i->Ain.Sse64Fx2.src);
1776         mapReg(m, &i->Ain.Sse64Fx2.dst);
1777         return;
1778      case Ain_Sse64FLo:
1779         mapReg(m, &i->Ain.Sse64FLo.src);
1780         mapReg(m, &i->Ain.Sse64FLo.dst);
1781         return;
1782      case Ain_SseReRg:
1783         mapReg(m, &i->Ain.SseReRg.src);
1784         mapReg(m, &i->Ain.SseReRg.dst);
1785         return;
1786      case Ain_SseCMov:
1787         mapReg(m, &i->Ain.SseCMov.src);
1788         mapReg(m, &i->Ain.SseCMov.dst);
1789         return;
1790      case Ain_SseShuf:
1791         mapReg(m, &i->Ain.SseShuf.src);
1792         mapReg(m, &i->Ain.SseShuf.dst);
1793         return;
1794      //uu case Ain_AvxLdSt:
1795      //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
1796      //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1797      //uu    break;
1798      //uu case Ain_AvxReRg:
1799      //uu    mapReg(m, &i->Ain.AvxReRg.src);
1800      //uu    mapReg(m, &i->Ain.AvxReRg.dst);
1801      //uu    return;
1802      case Ain_EvCheck:
1803         /* We expect both amodes only to mention %rbp, so this is in
1804            fact pointless, since %rbp isn't allocatable, but anyway.. */
1805         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
1806         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
1807         return;
1808      case Ain_ProfInc:
1809         /* hardwires r11 -- nothing to modify. */
1810         return;
1811      default:
1812         ppAMD64Instr(i, mode64);
1813         vpanic("mapRegs_AMD64Instr");
1814   }
1815}
1816
1817/* Figure out if i represents a reg-reg move, and if so assign the
1818   source and destination to *src and *dst.  If in doubt say No.  Used
1819   by the register allocator to do move coalescing.
1820*/
1821Bool isMove_AMD64Instr ( AMD64Instr* i, HReg* src, HReg* dst )
1822{
1823   switch (i->tag) {
1824      case Ain_Alu64R:
1825         /* Moves between integer regs */
1826         if (i->Ain.Alu64R.op != Aalu_MOV)
1827            return False;
1828         if (i->Ain.Alu64R.src->tag != Armi_Reg)
1829            return False;
1830         *src = i->Ain.Alu64R.src->Armi.Reg.reg;
1831         *dst = i->Ain.Alu64R.dst;
1832         return True;
1833      case Ain_SseReRg:
1834         /* Moves between SSE regs */
1835         if (i->Ain.SseReRg.op != Asse_MOV)
1836            return False;
1837         *src = i->Ain.SseReRg.src;
1838         *dst = i->Ain.SseReRg.dst;
1839         return True;
1840      //uu case Ain_AvxReRg:
1841      //uu    /* Moves between AVX regs */
1842      //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
1843      //uu       return False;
1844      //uu    *src = i->Ain.AvxReRg.src;
1845      //uu    *dst = i->Ain.AvxReRg.dst;
1846      //uu    return True;
1847      default:
1848         return False;
1849   }
1850   /*NOTREACHED*/
1851}
1852
1853
1854/* Generate amd64 spill/reload instructions under the direction of the
1855   register allocator.  Note it's critical these don't write the
1856   condition codes. */
1857
1858void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1859                      HReg rreg, Int offsetB, Bool mode64 )
1860{
1861   AMD64AMode* am;
1862   vassert(offsetB >= 0);
1863   vassert(!hregIsVirtual(rreg));
1864   vassert(mode64 == True);
1865   *i1 = *i2 = NULL;
1866   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1867   switch (hregClass(rreg)) {
1868      case HRcInt64:
1869         *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
1870         return;
1871      case HRcVec128:
1872         *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
1873         return;
1874      default:
1875         ppHRegClass(hregClass(rreg));
1876         vpanic("genSpill_AMD64: unimplemented regclass");
1877   }
1878}
1879
1880void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1881                       HReg rreg, Int offsetB, Bool mode64 )
1882{
1883   AMD64AMode* am;
1884   vassert(offsetB >= 0);
1885   vassert(!hregIsVirtual(rreg));
1886   vassert(mode64 == True);
1887   *i1 = *i2 = NULL;
1888   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1889   switch (hregClass(rreg)) {
1890      case HRcInt64:
1891         *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
1892         return;
1893      case HRcVec128:
1894         *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
1895         return;
1896      default:
1897         ppHRegClass(hregClass(rreg));
1898         vpanic("genReload_AMD64: unimplemented regclass");
1899   }
1900}
1901
1902
1903/* --------- The amd64 assembler (bleh.) --------- */
1904
1905/* Produce the low three bits of an integer register number. */
1906static UChar iregBits210 ( HReg r )
1907{
1908   UInt n;
1909   vassert(hregClass(r) == HRcInt64);
1910   vassert(!hregIsVirtual(r));
1911   n = hregNumber(r);
1912   vassert(n <= 15);
1913   return toUChar(n & 7);
1914}
1915
1916/* Produce bit 3 of an integer register number. */
1917static UChar iregBit3 ( HReg r )
1918{
1919   UInt n;
1920   vassert(hregClass(r) == HRcInt64);
1921   vassert(!hregIsVirtual(r));
1922   n = hregNumber(r);
1923   vassert(n <= 15);
1924   return toUChar((n >> 3) & 1);
1925}
1926
1927/* Produce a complete 4-bit integer register number. */
1928static UChar iregBits3210 ( HReg r )
1929{
1930   UInt n;
1931   vassert(hregClass(r) == HRcInt64);
1932   vassert(!hregIsVirtual(r));
1933   n = hregNumber(r);
1934   vassert(n <= 15);
1935   return toUChar(n);
1936}
1937
1938/* Given an xmm (128bit V-class) register number, produce the
1939   equivalent numbered register in 64-bit I-class.  This is a bit of
1940   fakery which facilitates using functions that work on integer
1941   register numbers to be used when assembling SSE instructions
1942   too. */
1943static HReg vreg2ireg ( HReg r )
1944{
1945   UInt n;
1946   vassert(hregClass(r) == HRcVec128);
1947   vassert(!hregIsVirtual(r));
1948   n = hregNumber(r);
1949   vassert(n <= 15);
1950   return mkHReg(n, HRcInt64, False);
1951}
1952
1953//uu /* Ditto for ymm regs. */
1954//uu static HReg dvreg2ireg ( HReg r )
1955//uu {
1956//uu    UInt n;
1957//uu    vassert(hregClass(r) == HRcVec256);
1958//uu    vassert(!hregIsVirtual(r));
1959//uu    n = hregNumber(r);
1960//uu    vassert(n <= 15);
1961//uu    return mkHReg(n, HRcInt64, False);
1962//uu }
1963
1964static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
1965{
1966   vassert(mod < 4);
1967   vassert((reg|regmem) < 8);
1968   return toUChar( ((mod & 3) << 6)
1969                   | ((reg & 7) << 3)
1970                   | (regmem & 7) );
1971}
1972
1973static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
1974{
1975   vassert(shift < 4);
1976   vassert((regindex|regbase) < 8);
1977   return toUChar( ((shift & 3) << 6)
1978                   | ((regindex & 7) << 3)
1979                   | (regbase & 7) );
1980}
1981
1982static UChar* emit32 ( UChar* p, UInt w32 )
1983{
1984   *p++ = toUChar((w32)       & 0x000000FF);
1985   *p++ = toUChar((w32 >>  8) & 0x000000FF);
1986   *p++ = toUChar((w32 >> 16) & 0x000000FF);
1987   *p++ = toUChar((w32 >> 24) & 0x000000FF);
1988   return p;
1989}
1990
1991static UChar* emit64 ( UChar* p, ULong w64 )
1992{
1993   p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
1994   p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
1995   return p;
1996}
1997
1998/* Does a sign-extend of the lowest 8 bits give
1999   the original number? */
2000static Bool fits8bits ( UInt w32 )
2001{
2002   Int i32 = (Int)w32;
2003   return toBool(i32 == ((i32 << 24) >> 24));
2004}
2005/* Can the lower 32 bits be signedly widened to produce the whole
2006   64-bit value?  In other words, are the top 33 bits either all 0 or
2007   all 1 ? */
2008static Bool fitsIn32Bits ( ULong x )
2009{
2010   Long y0 = (Long)x;
2011   Long y1 = y0;
2012   y1 <<= 32;
2013   y1 >>=/*s*/ 32;
2014   return toBool(x == y1);
2015}
2016
2017
2018/* Forming mod-reg-rm bytes and scale-index-base bytes.
2019
2020     greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
2021                       =  00 greg ereg
2022
2023     greg,  d8(ereg)   |  ereg is neither of: RSP R12
2024                       =  01 greg ereg, d8
2025
2026     greg,  d32(ereg)  |  ereg is neither of: RSP R12
2027                       =  10 greg ereg, d32
2028
2029     greg,  d8(ereg)   |  ereg is either: RSP R12
2030                       =  01 greg 100, 0x24, d8
2031                       (lowest bit of rex distinguishes R12/RSP)
2032
2033     greg,  d32(ereg)  |  ereg is either: RSP R12
2034                       =  10 greg 100, 0x24, d32
2035                       (lowest bit of rex distinguishes R12/RSP)
2036
2037     -----------------------------------------------
2038
2039     greg,  d8(base,index,scale)
2040               |  index != RSP
2041               =  01 greg 100, scale index base, d8
2042
2043     greg,  d32(base,index,scale)
2044               |  index != RSP
2045               =  10 greg 100, scale index base, d32
2046*/
2047static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2048{
2049   if (am->tag == Aam_IR) {
2050      if (am->Aam.IR.imm == 0
2051          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2052          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
2053          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2054          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
2055         ) {
2056         *p++ = mkModRegRM(0, iregBits210(greg),
2057                              iregBits210(am->Aam.IR.reg));
2058         return p;
2059      }
2060      if (fits8bits(am->Aam.IR.imm)
2061          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2062          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2063         ) {
2064         *p++ = mkModRegRM(1, iregBits210(greg),
2065                              iregBits210(am->Aam.IR.reg));
2066         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2067         return p;
2068      }
2069      if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2070          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2071         ) {
2072         *p++ = mkModRegRM(2, iregBits210(greg),
2073                              iregBits210(am->Aam.IR.reg));
2074         p = emit32(p, am->Aam.IR.imm);
2075         return p;
2076      }
2077      if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2078           || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
2079          && fits8bits(am->Aam.IR.imm)) {
2080 	 *p++ = mkModRegRM(1, iregBits210(greg), 4);
2081         *p++ = 0x24;
2082         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2083         return p;
2084      }
2085      if (/* (sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2086	     || wait for test case for RSP case */
2087          sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
2088 	 *p++ = mkModRegRM(2, iregBits210(greg), 4);
2089         *p++ = 0x24;
2090         p = emit32(p, am->Aam.IR.imm);
2091         return p;
2092      }
2093      ppAMD64AMode(am);
2094      vpanic("doAMode_M: can't emit amode IR");
2095      /*NOTREACHED*/
2096   }
2097   if (am->tag == Aam_IRRS) {
2098      if (fits8bits(am->Aam.IRRS.imm)
2099          && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2100         *p++ = mkModRegRM(1, iregBits210(greg), 4);
2101         *p++ = mkSIB(am->Aam.IRRS.shift, iregBits210(am->Aam.IRRS.index),
2102                                          iregBits210(am->Aam.IRRS.base));
2103         *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2104         return p;
2105      }
2106      if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2107         *p++ = mkModRegRM(2, iregBits210(greg), 4);
2108         *p++ = mkSIB(am->Aam.IRRS.shift, iregBits210(am->Aam.IRRS.index),
2109                                          iregBits210(am->Aam.IRRS.base));
2110         p = emit32(p, am->Aam.IRRS.imm);
2111         return p;
2112      }
2113      ppAMD64AMode(am);
2114      vpanic("doAMode_M: can't emit amode IRRS");
2115      /*NOTREACHED*/
2116   }
2117   vpanic("doAMode_M: unknown amode");
2118   /*NOTREACHED*/
2119}
2120
2121
2122/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2123static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2124{
2125   *p++ = mkModRegRM(3, iregBits210(greg), iregBits210(ereg));
2126   return p;
2127}
2128
2129
2130/* Clear the W bit on a REX byte, thereby changing the operand size
2131   back to whatever that instruction's default operand size is. */
2132static inline UChar clearWBit ( UChar rex )
2133{
2134   return toUChar(rex & ~(1<<3));
2135}
2136
2137
2138/* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
2139static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2140{
2141   if (am->tag == Aam_IR) {
2142      UChar W = 1;  /* we want 64-bit mode */
2143      UChar R = iregBit3(greg);
2144      UChar X = 0; /* not relevant */
2145      UChar B = iregBit3(am->Aam.IR.reg);
2146      return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
2147   }
2148   if (am->tag == Aam_IRRS) {
2149      UChar W = 1;  /* we want 64-bit mode */
2150      UChar R = iregBit3(greg);
2151      UChar X = iregBit3(am->Aam.IRRS.index);
2152      UChar B = iregBit3(am->Aam.IRRS.base);
2153      return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
2154   }
2155   vassert(0);
2156   return 0; /*NOTREACHED*/
2157}
2158
2159/* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
2160static UChar rexAMode_R ( HReg greg, HReg ereg )
2161{
2162   UChar W = 1;  /* we want 64-bit mode */
2163   UChar R = iregBit3(greg);
2164   UChar X = 0; /* not relevant */
2165   UChar B = iregBit3(ereg);
2166   return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
2167}
2168
2169
2170//uu /* May 2012: this VEX prefix stuff is currently unused, but has
2171//uu    verified correct (I reckon).  Certainly it has been known to
2172//uu    produce correct VEX prefixes during testing. */
2173//uu
2174//uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
2175//uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
2176//uu    in verbatim.  There's no range checking on the bits. */
2177//uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2178//uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
2179//uu                             UInt L, UInt pp )
2180//uu {
2181//uu    UChar byte0 = 0;
2182//uu    UChar byte1 = 0;
2183//uu    UChar byte2 = 0;
2184//uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2185//uu       /* 2 byte encoding is possible. */
2186//uu       byte0 = 0xC5;
2187//uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2188//uu               | (L << 2) | pp;
2189//uu    } else {
2190//uu       /* 3 byte encoding is needed. */
2191//uu       byte0 = 0xC4;
2192//uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2193//uu               | ((rexB ^ 1) << 5) | mmmmm;
2194//uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2195//uu    }
2196//uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2197//uu }
2198//uu
2199//uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
2200//uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2201//uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
2202//uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2203//uu    vvvv=1111 (unused 3rd reg). */
2204//uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2205//uu {
2206//uu    UChar L       = 1; /* size = 256 */
2207//uu    UChar pp      = 0; /* no SIMD prefix */
2208//uu    UChar mmmmm   = 1; /* 0F */
2209//uu    UChar notVvvv = 0; /* unused */
2210//uu    UChar rexW    = 0;
2211//uu    UChar rexR    = 0;
2212//uu    UChar rexX    = 0;
2213//uu    UChar rexB    = 0;
2214//uu    /* Same logic as in rexAMode_M. */
2215//uu    if (am->tag == Aam_IR) {
2216//uu       rexR = iregBit3(greg);
2217//uu       rexX = 0; /* not relevant */
2218//uu       rexB = iregBit3(am->Aam.IR.reg);
2219//uu    }
2220//uu    else if (am->tag == Aam_IRRS) {
2221//uu       rexR = iregBit3(greg);
2222//uu       rexX = iregBit3(am->Aam.IRRS.index);
2223//uu       rexB = iregBit3(am->Aam.IRRS.base);
2224//uu    } else {
2225//uu       vassert(0);
2226//uu    }
2227//uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2228//uu }
2229//uu
2230//uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2231//uu {
2232//uu    switch (vex & 0xFF) {
2233//uu       case 0xC5:
2234//uu          *p++ = 0xC5;
2235//uu          *p++ = (vex >> 8) & 0xFF;
2236//uu          vassert(0 == (vex >> 16));
2237//uu          break;
2238//uu       case 0xC4:
2239//uu          *p++ = 0xC4;
2240//uu          *p++ = (vex >> 8) & 0xFF;
2241//uu          *p++ = (vex >> 16) & 0xFF;
2242//uu          vassert(0 == (vex >> 24));
2243//uu          break;
2244//uu       default:
2245//uu          vassert(0);
2246//uu    }
2247//uu    return p;
2248//uu }
2249
2250
2251/* Emit ffree %st(N) */
2252static UChar* do_ffree_st ( UChar* p, Int n )
2253{
2254   vassert(n >= 0 && n <= 7);
2255   *p++ = 0xDD;
2256   *p++ = toUChar(0xC0 + n);
2257   return p;
2258}
2259
2260/* Emit an instruction into buf and return the number of bytes used.
2261   Note that buf is not the insn's final place, and therefore it is
2262   imperative to emit position-independent code.  If the emitted
2263   instruction was a profiler inc, set *is_profInc to True, else
2264   leave it unchanged. */
2265
2266Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2267                      UChar* buf, Int nbuf, AMD64Instr* i,
2268                      Bool mode64,
2269                      void* disp_cp_chain_me_to_slowEP,
2270                      void* disp_cp_chain_me_to_fastEP,
2271                      void* disp_cp_xindir,
2272                      void* disp_cp_xassisted )
2273{
2274   UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2275   UInt   xtra;
2276   UInt   reg;
2277   UChar  rex;
2278   UChar* p = &buf[0];
2279   UChar* ptmp;
2280   Int    j;
2281   vassert(nbuf >= 32);
2282   vassert(mode64 == True);
2283
2284   /* Wrap an integer as a int register, for use assembling
2285      GrpN insns, in which the greg field is used as a sub-opcode
2286      and does not really contain a register. */
2287#  define fake(_n) mkHReg((_n), HRcInt64, False)
2288
2289   /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2290
2291   switch (i->tag) {
2292
2293   case Ain_Imm64:
2294      if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2295         /* Use the short form (load into 32 bit reg, + default
2296            widening rule) for constants under 1 million.  We could
2297            use this form for the range 0 to 0x7FFFFFFF inclusive, but
2298            limit it to a smaller range for verifiability purposes. */
2299         if (1 & iregBit3(i->Ain.Imm64.dst))
2300            *p++ = 0x41;
2301         *p++ = 0xB8 + iregBits210(i->Ain.Imm64.dst);
2302         p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2303      } else {
2304         *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Imm64.dst)));
2305         *p++ = toUChar(0xB8 + iregBits210(i->Ain.Imm64.dst));
2306         p = emit64(p, i->Ain.Imm64.imm64);
2307      }
2308      goto done;
2309
2310   case Ain_Alu64R:
2311      /* Deal specially with MOV */
2312      if (i->Ain.Alu64R.op == Aalu_MOV) {
2313         switch (i->Ain.Alu64R.src->tag) {
2314            case Armi_Imm:
2315               if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2316                  /* Actually we could use this form for constants in
2317                     the range 0 through 0x7FFFFFFF inclusive, but
2318                     limit it to a small range for verifiability
2319                     purposes. */
2320                  /* Generate "movl $imm32, 32-bit-register" and let
2321                     the default zero-extend rule cause the upper half
2322                     of the dst to be zeroed out too.  This saves 1
2323                     and sometimes 2 bytes compared to the more
2324                     obvious encoding in the 'else' branch. */
2325                  if (1 & iregBit3(i->Ain.Alu64R.dst))
2326                     *p++ = 0x41;
2327                  *p++ = 0xB8 + iregBits210(i->Ain.Alu64R.dst);
2328                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2329               } else {
2330                  *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Alu64R.dst)));
2331                  *p++ = 0xC7;
2332                  *p++ = toUChar(0xC0 + iregBits210(i->Ain.Alu64R.dst));
2333                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2334               }
2335               goto done;
2336            case Armi_Reg:
2337               *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2338                                  i->Ain.Alu64R.dst );
2339               *p++ = 0x89;
2340               p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2341                                i->Ain.Alu64R.dst);
2342               goto done;
2343            case Armi_Mem:
2344               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2345                                 i->Ain.Alu64R.src->Armi.Mem.am);
2346               *p++ = 0x8B;
2347               p = doAMode_M(p, i->Ain.Alu64R.dst,
2348                                i->Ain.Alu64R.src->Armi.Mem.am);
2349               goto done;
2350            default:
2351               goto bad;
2352         }
2353      }
2354      /* MUL */
2355      if (i->Ain.Alu64R.op == Aalu_MUL) {
2356         switch (i->Ain.Alu64R.src->tag) {
2357            case Armi_Reg:
2358               *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2359                                  i->Ain.Alu64R.src->Armi.Reg.reg);
2360               *p++ = 0x0F;
2361               *p++ = 0xAF;
2362               p = doAMode_R(p, i->Ain.Alu64R.dst,
2363                                i->Ain.Alu64R.src->Armi.Reg.reg);
2364               goto done;
2365            case Armi_Mem:
2366               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2367                                 i->Ain.Alu64R.src->Armi.Mem.am);
2368               *p++ = 0x0F;
2369               *p++ = 0xAF;
2370               p = doAMode_M(p, i->Ain.Alu64R.dst,
2371                                i->Ain.Alu64R.src->Armi.Mem.am);
2372               goto done;
2373            case Armi_Imm:
2374               if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2375                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2376                  *p++ = 0x6B;
2377                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2378                  *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2379               } else {
2380                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2381                  *p++ = 0x69;
2382                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2383                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2384               }
2385               goto done;
2386            default:
2387               goto bad;
2388         }
2389      }
2390      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2391      opc = opc_rr = subopc_imm = opc_imma = 0;
2392      switch (i->Ain.Alu64R.op) {
2393         case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2394                        subopc_imm = 2; opc_imma = 0x15; break;
2395         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2396                        subopc_imm = 0; opc_imma = 0x05; break;
2397         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2398                        subopc_imm = 5; opc_imma = 0x2D; break;
2399         case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2400                        subopc_imm = 3; opc_imma = 0x1D; break;
2401         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2402                        subopc_imm = 4; opc_imma = 0x25; break;
2403         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2404                        subopc_imm = 6; opc_imma = 0x35; break;
2405         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2406                        subopc_imm = 1; opc_imma = 0x0D; break;
2407         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2408                        subopc_imm = 7; opc_imma = 0x3D; break;
2409         default: goto bad;
2410      }
2411      switch (i->Ain.Alu64R.src->tag) {
2412         case Armi_Imm:
2413            if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
2414                && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2415               goto bad; /* FIXME: awaiting test case */
2416               *p++ = toUChar(opc_imma);
2417               p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2418            } else
2419            if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2420               *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst );
2421               *p++ = 0x83;
2422               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
2423               *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2424            } else {
2425               *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst);
2426               *p++ = 0x81;
2427               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
2428               p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2429            }
2430            goto done;
2431         case Armi_Reg:
2432            *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2433                               i->Ain.Alu64R.dst);
2434            *p++ = toUChar(opc_rr);
2435            p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2436                             i->Ain.Alu64R.dst);
2437            goto done;
2438         case Armi_Mem:
2439            *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2440                               i->Ain.Alu64R.src->Armi.Mem.am);
2441            *p++ = toUChar(opc);
2442            p = doAMode_M(p, i->Ain.Alu64R.dst,
2443                             i->Ain.Alu64R.src->Armi.Mem.am);
2444            goto done;
2445         default:
2446            goto bad;
2447      }
2448      break;
2449
2450   case Ain_Alu64M:
2451      /* Deal specially with MOV */
2452      if (i->Ain.Alu64M.op == Aalu_MOV) {
2453         switch (i->Ain.Alu64M.src->tag) {
2454            case Ari_Reg:
2455               *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2456                                 i->Ain.Alu64M.dst);
2457               *p++ = 0x89;
2458               p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2459                                i->Ain.Alu64M.dst);
2460               goto done;
2461            case Ari_Imm:
2462               *p++ = rexAMode_M(fake(0), i->Ain.Alu64M.dst);
2463               *p++ = 0xC7;
2464               p = doAMode_M(p, fake(0), i->Ain.Alu64M.dst);
2465               p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2466               goto done;
2467            default:
2468               goto bad;
2469         }
2470      }
2471      break;
2472
2473   case Ain_Sh64:
2474      opc_cl = opc_imm = subopc = 0;
2475      switch (i->Ain.Sh64.op) {
2476         case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2477         case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2478         case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2479         default: goto bad;
2480      }
2481      if (i->Ain.Sh64.src == 0) {
2482         *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
2483         *p++ = toUChar(opc_cl);
2484         p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
2485         goto done;
2486      } else {
2487         *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
2488         *p++ = toUChar(opc_imm);
2489         p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
2490         *p++ = (UChar)(i->Ain.Sh64.src);
2491         goto done;
2492      }
2493      break;
2494
2495   case Ain_Test64:
2496      /* testq sign-extend($imm32), %reg */
2497      *p++ = rexAMode_R(fake(0), i->Ain.Test64.dst);
2498      *p++ = 0xF7;
2499      p = doAMode_R(p, fake(0), i->Ain.Test64.dst);
2500      p = emit32(p, i->Ain.Test64.imm32);
2501      goto done;
2502
2503   case Ain_Unary64:
2504      if (i->Ain.Unary64.op == Aun_NOT) {
2505         *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
2506         *p++ = 0xF7;
2507         p = doAMode_R(p, fake(2), i->Ain.Unary64.dst);
2508         goto done;
2509      }
2510      if (i->Ain.Unary64.op == Aun_NEG) {
2511         *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
2512         *p++ = 0xF7;
2513         p = doAMode_R(p, fake(3), i->Ain.Unary64.dst);
2514         goto done;
2515      }
2516      break;
2517
2518   case Ain_Lea64:
2519      *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2520      *p++ = 0x8D;
2521      p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2522      goto done;
2523
2524   case Ain_Alu32R:
2525      /* ADD/SUB/AND/OR/XOR/CMP */
2526      opc = opc_rr = subopc_imm = opc_imma = 0;
2527      switch (i->Ain.Alu32R.op) {
2528         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2529                        subopc_imm = 0; opc_imma = 0x05; break;
2530         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2531                        subopc_imm = 5; opc_imma = 0x2D; break;
2532         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2533                        subopc_imm = 4; opc_imma = 0x25; break;
2534         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2535                        subopc_imm = 6; opc_imma = 0x35; break;
2536         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2537                        subopc_imm = 1; opc_imma = 0x0D; break;
2538         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2539                        subopc_imm = 7; opc_imma = 0x3D; break;
2540         default: goto bad;
2541      }
2542      switch (i->Ain.Alu32R.src->tag) {
2543         case Armi_Imm:
2544            if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
2545                && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2546               goto bad; /* FIXME: awaiting test case */
2547               *p++ = toUChar(opc_imma);
2548               p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2549            } else
2550            if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2551               rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst ) );
2552               if (rex != 0x40) *p++ = rex;
2553               *p++ = 0x83;
2554               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
2555               *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2556            } else {
2557               rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst) );
2558               if (rex != 0x40) *p++ = rex;
2559               *p++ = 0x81;
2560               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
2561               p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2562            }
2563            goto done;
2564         case Armi_Reg:
2565            rex  = clearWBit(
2566                   rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2567                               i->Ain.Alu32R.dst) );
2568            if (rex != 0x40) *p++ = rex;
2569            *p++ = toUChar(opc_rr);
2570            p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2571                             i->Ain.Alu32R.dst);
2572            goto done;
2573         case Armi_Mem:
2574            rex  = clearWBit(
2575                   rexAMode_M( i->Ain.Alu32R.dst,
2576                               i->Ain.Alu32R.src->Armi.Mem.am) );
2577            if (rex != 0x40) *p++ = rex;
2578            *p++ = toUChar(opc);
2579            p = doAMode_M(p, i->Ain.Alu32R.dst,
2580                             i->Ain.Alu32R.src->Armi.Mem.am);
2581            goto done;
2582         default:
2583            goto bad;
2584      }
2585      break;
2586
2587   case Ain_MulL:
2588      subopc = i->Ain.MulL.syned ? 5 : 4;
2589      switch (i->Ain.MulL.src->tag)  {
2590         case Arm_Mem:
2591            *p++ = rexAMode_M( fake(0),
2592                               i->Ain.MulL.src->Arm.Mem.am);
2593            *p++ = 0xF7;
2594            p = doAMode_M(p, fake(subopc),
2595                             i->Ain.MulL.src->Arm.Mem.am);
2596            goto done;
2597         case Arm_Reg:
2598            *p++ = rexAMode_R(fake(0),
2599                              i->Ain.MulL.src->Arm.Reg.reg);
2600            *p++ = 0xF7;
2601            p = doAMode_R(p, fake(subopc),
2602                             i->Ain.MulL.src->Arm.Reg.reg);
2603            goto done;
2604         default:
2605            goto bad;
2606      }
2607      break;
2608
2609   case Ain_Div:
2610      subopc = i->Ain.Div.syned ? 7 : 6;
2611      if (i->Ain.Div.sz == 4) {
2612         switch (i->Ain.Div.src->tag)  {
2613            case Arm_Mem:
2614               goto bad;
2615               /*FIXME*/
2616               *p++ = 0xF7;
2617               p = doAMode_M(p, fake(subopc),
2618                                i->Ain.Div.src->Arm.Mem.am);
2619               goto done;
2620            case Arm_Reg:
2621               *p++ = clearWBit(
2622                      rexAMode_R( fake(0), i->Ain.Div.src->Arm.Reg.reg));
2623               *p++ = 0xF7;
2624               p = doAMode_R(p, fake(subopc),
2625                                i->Ain.Div.src->Arm.Reg.reg);
2626               goto done;
2627            default:
2628               goto bad;
2629         }
2630      }
2631      if (i->Ain.Div.sz == 8) {
2632         switch (i->Ain.Div.src->tag)  {
2633            case Arm_Mem:
2634               *p++ = rexAMode_M( fake(0),
2635                                  i->Ain.Div.src->Arm.Mem.am);
2636               *p++ = 0xF7;
2637               p = doAMode_M(p, fake(subopc),
2638                                i->Ain.Div.src->Arm.Mem.am);
2639               goto done;
2640            case Arm_Reg:
2641               *p++ = rexAMode_R( fake(0),
2642                                  i->Ain.Div.src->Arm.Reg.reg);
2643               *p++ = 0xF7;
2644               p = doAMode_R(p, fake(subopc),
2645                                i->Ain.Div.src->Arm.Reg.reg);
2646               goto done;
2647            default:
2648               goto bad;
2649         }
2650      }
2651      break;
2652
2653   case Ain_Push:
2654      switch (i->Ain.Push.src->tag) {
2655         case Armi_Mem:
2656            *p++ = clearWBit(
2657                   rexAMode_M(fake(0), i->Ain.Push.src->Armi.Mem.am));
2658            *p++ = 0xFF;
2659            p = doAMode_M(p, fake(6), i->Ain.Push.src->Armi.Mem.am);
2660            goto done;
2661         case Armi_Imm:
2662            *p++ = 0x68;
2663            p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
2664            goto done;
2665         case Armi_Reg:
2666            *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.Push.src->Armi.Reg.reg)));
2667            *p++ = toUChar(0x50 + iregBits210(i->Ain.Push.src->Armi.Reg.reg));
2668            goto done;
2669        default:
2670            goto bad;
2671      }
2672
2673   case Ain_Call: {
2674      if (i->Ain.Call.cond != Acc_ALWAYS
2675          && i->Ain.Call.rloc.pri != RLPri_None) {
2676         /* The call might not happen (it isn't unconditional) and it
2677            returns a result.  In this case we will need to generate a
2678            control flow diamond to put 0x555..555 in the return
2679            register(s) in the case where the call doesn't happen.  If
2680            this ever becomes necessary, maybe copy code from the ARM
2681            equivalent.  Until that day, just give up. */
2682         goto bad;
2683      }
2684      /* As per detailed comment for Ain_Call in
2685         getRegUsage_AMD64Instr above, %r11 is used as an address
2686         temporary. */
2687      /* jump over the following two insns if the condition does not
2688         hold */
2689      Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
2690      if (i->Ain.Call.cond != Acc_ALWAYS) {
2691         *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2692         *p++ = shortImm ? 10 : 13;
2693         /* 10 or 13 bytes in the next two insns */
2694      }
2695      if (shortImm) {
2696         /* 7 bytes: movl sign-extend(imm32), %r11 */
2697         *p++ = 0x49;
2698         *p++ = 0xC7;
2699         *p++ = 0xC3;
2700         p = emit32(p, (UInt)i->Ain.Call.target);
2701      } else {
2702         /* 10 bytes: movabsq $target, %r11 */
2703         *p++ = 0x49;
2704         *p++ = 0xBB;
2705         p = emit64(p, i->Ain.Call.target);
2706      }
2707      /* 3 bytes: call *%r11 */
2708      *p++ = 0x41;
2709      *p++ = 0xFF;
2710      *p++ = 0xD3;
2711      goto done;
2712   }
2713
2714   case Ain_XDirect: {
2715      /* NB: what goes on here has to be very closely coordinated with the
2716         chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
2717      /* We're generating chain-me requests here, so we need to be
2718         sure this is actually allowed -- no-redir translations can't
2719         use chain-me's.  Hence: */
2720      vassert(disp_cp_chain_me_to_slowEP != NULL);
2721      vassert(disp_cp_chain_me_to_fastEP != NULL);
2722
2723      HReg r11 = hregAMD64_R11();
2724
2725      /* Use ptmp for backpatching conditional jumps. */
2726      ptmp = NULL;
2727
2728      /* First off, if this is conditional, create a conditional
2729         jump over the rest of it. */
2730      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2731         /* jmp fwds if !condition */
2732         *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
2733         ptmp = p; /* fill in this bit later */
2734         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2735      }
2736
2737      /* Update the guest RIP. */
2738      if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
2739         /* use a shorter encoding */
2740         /* movl sign-extend(dstGA), %r11 */
2741         *p++ = 0x49;
2742         *p++ = 0xC7;
2743         *p++ = 0xC3;
2744         p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
2745      } else {
2746         /* movabsq $dstGA, %r11 */
2747         *p++ = 0x49;
2748         *p++ = 0xBB;
2749         p = emit64(p, i->Ain.XDirect.dstGA);
2750      }
2751
2752      /* movq %r11, amRIP */
2753      *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
2754      *p++ = 0x89;
2755      p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
2756
2757      /* --- FIRST PATCHABLE BYTE follows --- */
2758      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
2759         to) backs up the return address, so as to find the address of
2760         the first patchable byte.  So: don't change the length of the
2761         two instructions below. */
2762      /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
2763      *p++ = 0x49;
2764      *p++ = 0xBB;
2765      void* disp_cp_chain_me
2766               = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
2767                                         : disp_cp_chain_me_to_slowEP;
2768      p = emit64(p, Ptr_to_ULong(disp_cp_chain_me));
2769      /* call *%r11 */
2770      *p++ = 0x41;
2771      *p++ = 0xFF;
2772      *p++ = 0xD3;
2773      /* --- END of PATCHABLE BYTES --- */
2774
2775      /* Fix up the conditional jump, if there was one. */
2776      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2777         Int delta = p - ptmp;
2778         vassert(delta > 0 && delta < 40);
2779         *ptmp = toUChar(delta-1);
2780      }
2781      goto done;
2782   }
2783
2784   case Ain_XIndir: {
2785      /* We're generating transfers that could lead indirectly to a
2786         chain-me, so we need to be sure this is actually allowed --
2787         no-redir translations are not allowed to reach normal
2788         translations without going through the scheduler.  That means
2789         no XDirects or XIndirs out from no-redir translations.
2790         Hence: */
2791      vassert(disp_cp_xindir != NULL);
2792
2793      /* Use ptmp for backpatching conditional jumps. */
2794      ptmp = NULL;
2795
2796      /* First off, if this is conditional, create a conditional
2797         jump over the rest of it. */
2798      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2799         /* jmp fwds if !condition */
2800         *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
2801         ptmp = p; /* fill in this bit later */
2802         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2803      }
2804
2805      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2806      *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2807      *p++ = 0x89;
2808      p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2809
2810      /* get $disp_cp_xindir into %r11 */
2811      if (fitsIn32Bits(Ptr_to_ULong(disp_cp_xindir))) {
2812         /* use a shorter encoding */
2813         /* movl sign-extend(disp_cp_xindir), %r11 */
2814         *p++ = 0x49;
2815         *p++ = 0xC7;
2816         *p++ = 0xC3;
2817         p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
2818      } else {
2819         /* movabsq $disp_cp_xindir, %r11 */
2820         *p++ = 0x49;
2821         *p++ = 0xBB;
2822         p = emit64(p, Ptr_to_ULong(disp_cp_xindir));
2823      }
2824
2825      /* jmp *%r11 */
2826      *p++ = 0x41;
2827      *p++ = 0xFF;
2828      *p++ = 0xE3;
2829
2830      /* Fix up the conditional jump, if there was one. */
2831      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2832         Int delta = p - ptmp;
2833         vassert(delta > 0 && delta < 40);
2834         *ptmp = toUChar(delta-1);
2835      }
2836      goto done;
2837   }
2838
2839   case Ain_XAssisted: {
2840      /* Use ptmp for backpatching conditional jumps. */
2841      ptmp = NULL;
2842
2843      /* First off, if this is conditional, create a conditional
2844         jump over the rest of it. */
2845      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
2846         /* jmp fwds if !condition */
2847         *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
2848         ptmp = p; /* fill in this bit later */
2849         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2850      }
2851
2852      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2853      *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2854      *p++ = 0x89;
2855      p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2856      /* movl $magic_number, %ebp.  Since these numbers are all small positive
2857         integers, we can get away with "movl $N, %ebp" rather than
2858         the longer "movq $N, %rbp". */
2859      UInt trcval = 0;
2860      switch (i->Ain.XAssisted.jk) {
2861         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
2862         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
2863         case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
2864         case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
2865         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
2866         case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
2867         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
2868         case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
2869         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
2870         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
2871         case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
2872         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
2873         /* We don't expect to see the following being assisted. */
2874         case Ijk_Ret:
2875         case Ijk_Call:
2876         /* fallthrough */
2877         default:
2878            ppIRJumpKind(i->Ain.XAssisted.jk);
2879            vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
2880      }
2881      vassert(trcval != 0);
2882      *p++ = 0xBD;
2883      p = emit32(p, trcval);
2884      /* movabsq $disp_assisted, %r11 */
2885      *p++ = 0x49;
2886      *p++ = 0xBB;
2887      p = emit64(p, Ptr_to_ULong(disp_cp_xassisted));
2888      /* jmp *%r11 */
2889      *p++ = 0x41;
2890      *p++ = 0xFF;
2891      *p++ = 0xE3;
2892
2893      /* Fix up the conditional jump, if there was one. */
2894      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
2895         Int delta = p - ptmp;
2896         vassert(delta > 0 && delta < 40);
2897         *ptmp = toUChar(delta-1);
2898      }
2899      goto done;
2900   }
2901
2902   case Ain_CMov64:
2903      vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
2904      if (i->Ain.CMov64.src->tag == Arm_Reg) {
2905         *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
2906         *p++ = 0x0F;
2907         *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
2908         p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
2909         goto done;
2910      }
2911      if (i->Ain.CMov64.src->tag == Arm_Mem) {
2912         *p++ = rexAMode_M(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
2913         *p++ = 0x0F;
2914         *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
2915         p = doAMode_M(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
2916         goto done;
2917      }
2918      break;
2919
2920   case Ain_MovxLQ:
2921      /* No, _don't_ ask me why the sense of the args has to be
2922         different in the S vs Z case.  I don't know. */
2923      if (i->Ain.MovxLQ.syned) {
2924         /* Need REX.W = 1 here, but rexAMode_R does that for us. */
2925         *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
2926         *p++ = 0x63;
2927         p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
2928      } else {
2929         /* Produce a 32-bit reg-reg move, since the implicit
2930            zero-extend does what we want. */
2931         *p++ = clearWBit (
2932                   rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
2933         *p++ = 0x89;
2934         p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
2935      }
2936      goto done;
2937
2938   case Ain_LoadEX:
2939      if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
2940         /* movzbq */
2941         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2942         *p++ = 0x0F;
2943         *p++ = 0xB6;
2944         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2945         goto done;
2946      }
2947      if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
2948         /* movzwq */
2949         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2950         *p++ = 0x0F;
2951         *p++ = 0xB7;
2952         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2953         goto done;
2954      }
2955      if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
2956         /* movzlq */
2957         /* This isn't really an existing AMD64 instruction per se.
2958            Rather, we have to do a 32-bit load.  Because a 32-bit
2959            write implicitly clears the upper 32 bits of the target
2960            register, we get what we want. */
2961         *p++ = clearWBit(
2962                rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
2963         *p++ = 0x8B;
2964         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2965         goto done;
2966      }
2967      break;
2968
2969   case Ain_Set64:
2970      /* Make the destination register be 1 or 0, depending on whether
2971         the relevant condition holds.  Complication: the top 56 bits
2972         of the destination should be forced to zero, but doing 'xorq
2973         %r,%r' kills the flag(s) we are about to read.  Sigh.  So
2974         start off my moving $0 into the dest. */
2975      reg = iregBits3210(i->Ain.Set64.dst);
2976      vassert(reg < 16);
2977
2978      /* movq $0, %dst */
2979      *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
2980      *p++ = 0xC7;
2981      *p++ = toUChar(0xC0 + (reg & 7));
2982      p = emit32(p, 0);
2983
2984      /* setb lo8(%dst) */
2985      /* note, 8-bit register rex trickyness.  Be careful here. */
2986      *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
2987      *p++ = 0x0F;
2988      *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
2989      *p++ = toUChar(0xC0 + (reg & 7));
2990      goto done;
2991
2992   case Ain_Bsfr64:
2993      *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
2994      *p++ = 0x0F;
2995      if (i->Ain.Bsfr64.isFwds) {
2996         *p++ = 0xBC;
2997      } else {
2998         *p++ = 0xBD;
2999      }
3000      p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3001      goto done;
3002
3003   case Ain_MFence:
3004      /* mfence */
3005      *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
3006      goto done;
3007
3008   case Ain_ACAS:
3009      /* lock */
3010      *p++ = 0xF0;
3011      if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
3012      /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
3013         in %rbx.  The new-value register is hardwired to be %rbx
3014         since dealing with byte integer registers is too much hassle,
3015         so we force the register operand to %rbx (could equally be
3016         %rcx or %rdx). */
3017      rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3018      if (i->Ain.ACAS.sz != 8)
3019         rex = clearWBit(rex);
3020
3021      *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3022      *p++ = 0x0F;
3023      if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3024      p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
3025      goto done;
3026
3027   case Ain_DACAS:
3028      /* lock */
3029      *p++ = 0xF0;
3030      /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
3031         value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
3032         aren't encoded in the insn. */
3033      rex = rexAMode_M( fake(1), i->Ain.ACAS.addr );
3034      if (i->Ain.ACAS.sz != 8)
3035         rex = clearWBit(rex);
3036      *p++ = rex;
3037      *p++ = 0x0F;
3038      *p++ = 0xC7;
3039      p = doAMode_M(p, fake(1), i->Ain.DACAS.addr);
3040      goto done;
3041
3042   case Ain_A87Free:
3043      vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3044      for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3045         p = do_ffree_st(p, 7-j);
3046      }
3047      goto done;
3048
3049   case Ain_A87PushPop:
3050      vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3051      if (i->Ain.A87PushPop.isPush) {
3052         /* Load from memory into %st(0): flds/fldl amode */
3053         *p++ = clearWBit(
3054                   rexAMode_M(fake(0), i->Ain.A87PushPop.addr) );
3055         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3056	 p = doAMode_M(p, fake(0)/*subopcode*/, i->Ain.A87PushPop.addr);
3057      } else {
3058         /* Dump %st(0) to memory: fstps/fstpl amode */
3059         *p++ = clearWBit(
3060                   rexAMode_M(fake(3), i->Ain.A87PushPop.addr) );
3061         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3062         p = doAMode_M(p, fake(3)/*subopcode*/, i->Ain.A87PushPop.addr);
3063         goto done;
3064      }
3065      goto done;
3066
3067   case Ain_A87FpOp:
3068      switch (i->Ain.A87FpOp.op) {
3069         case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
3070         case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
3071         case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
3072         case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
3073         case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
3074         case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
3075         case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
3076         case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
3077         case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3078         case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
3079         case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
3080         case Afp_TAN:
3081            /* fptan pushes 1.0 on the FP stack, except when the
3082               argument is out of range.  Hence we have to do the
3083               instruction, then inspect C2 to see if there is an out
3084               of range condition.  If there is, we skip the fincstp
3085               that is used by the in-range case to get rid of this
3086               extra 1.0 value. */
3087            *p++ = 0xD9; *p++ = 0xF2; // fptan
3088            *p++ = 0x50;              // pushq %rax
3089            *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
3090            *p++ = 0x66; *p++ = 0xA9;
3091            *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
3092            *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
3093            *p++ = 0xD9; *p++ = 0xF7; // fincstp
3094            *p++ = 0x58;              // after_fincstp: popq %rax
3095            break;
3096         default:
3097            goto bad;
3098      }
3099      goto done;
3100
3101   case Ain_A87LdCW:
3102      *p++ = clearWBit(
3103                rexAMode_M(fake(5), i->Ain.A87LdCW.addr) );
3104      *p++ = 0xD9;
3105      p = doAMode_M(p, fake(5)/*subopcode*/, i->Ain.A87LdCW.addr);
3106      goto done;
3107
3108   case Ain_A87StSW:
3109      *p++ = clearWBit(
3110                rexAMode_M(fake(7), i->Ain.A87StSW.addr) );
3111      *p++ = 0xDD;
3112      p = doAMode_M(p, fake(7)/*subopcode*/, i->Ain.A87StSW.addr);
3113      goto done;
3114
3115   case Ain_Store:
3116      if (i->Ain.Store.sz == 2) {
3117         /* This just goes to show the crazyness of the instruction
3118            set encoding.  We have to insert two prefix bytes, but be
3119            careful to avoid a conflict in what the size should be, by
3120            ensuring that REX.W = 0. */
3121         *p++ = 0x66; /* override to 16-bits */
3122	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3123         *p++ = 0x89;
3124         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3125         goto done;
3126      }
3127      if (i->Ain.Store.sz == 4) {
3128	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3129         *p++ = 0x89;
3130         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3131         goto done;
3132      }
3133      if (i->Ain.Store.sz == 1) {
3134         /* This is one place where it would be wrong to skip emitting
3135            a rex byte of 0x40, since the mere presence of rex changes
3136            the meaning of the byte register access.  Be careful. */
3137	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3138         *p++ = 0x88;
3139         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3140         goto done;
3141      }
3142      break;
3143
3144   case Ain_LdMXCSR:
3145      *p++ = clearWBit(rexAMode_M( fake(0), i->Ain.LdMXCSR.addr));
3146      *p++ = 0x0F;
3147      *p++ = 0xAE;
3148      p = doAMode_M(p, fake(2)/*subopcode*/, i->Ain.LdMXCSR.addr);
3149      goto done;
3150
3151   case Ain_SseUComIS:
3152      /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
3153      /* ucomi[sd] %srcL, %srcR */
3154      if (i->Ain.SseUComIS.sz == 8) {
3155         *p++ = 0x66;
3156      } else {
3157         goto bad;
3158         vassert(i->Ain.SseUComIS.sz == 4);
3159      }
3160      *p++ = clearWBit (
3161             rexAMode_R( vreg2ireg(i->Ain.SseUComIS.srcL),
3162                         vreg2ireg(i->Ain.SseUComIS.srcR) ));
3163      *p++ = 0x0F;
3164      *p++ = 0x2E;
3165      p = doAMode_R(p, vreg2ireg(i->Ain.SseUComIS.srcL),
3166                       vreg2ireg(i->Ain.SseUComIS.srcR) );
3167      /* pushfq */
3168      *p++ = 0x9C;
3169      /* popq %dst */
3170      *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.SseUComIS.dst)));
3171      *p++ = toUChar(0x58 + iregBits210(i->Ain.SseUComIS.dst));
3172      goto done;
3173
3174   case Ain_SseSI2SF:
3175      /* cvssi2s[sd] %src, %dst */
3176      rex = rexAMode_R( vreg2ireg(i->Ain.SseSI2SF.dst),
3177                        i->Ain.SseSI2SF.src );
3178      *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3179      *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3180      *p++ = 0x0F;
3181      *p++ = 0x2A;
3182      p = doAMode_R( p, vreg2ireg(i->Ain.SseSI2SF.dst),
3183                        i->Ain.SseSI2SF.src );
3184      goto done;
3185
3186   case Ain_SseSF2SI:
3187      /* cvss[sd]2si %src, %dst */
3188      rex = rexAMode_R( i->Ain.SseSF2SI.dst,
3189                        vreg2ireg(i->Ain.SseSF2SI.src) );
3190      *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3191      *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3192      *p++ = 0x0F;
3193      *p++ = 0x2D;
3194      p = doAMode_R( p, i->Ain.SseSF2SI.dst,
3195                        vreg2ireg(i->Ain.SseSF2SI.src) );
3196      goto done;
3197
3198   case Ain_SseSDSS:
3199      /* cvtsd2ss/cvtss2sd %src, %dst */
3200      *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3201      *p++ = clearWBit(
3202              rexAMode_R( vreg2ireg(i->Ain.SseSDSS.dst),
3203                          vreg2ireg(i->Ain.SseSDSS.src) ));
3204      *p++ = 0x0F;
3205      *p++ = 0x5A;
3206      p = doAMode_R( p, vreg2ireg(i->Ain.SseSDSS.dst),
3207                        vreg2ireg(i->Ain.SseSDSS.src) );
3208      goto done;
3209
3210   case Ain_SseLdSt:
3211      if (i->Ain.SseLdSt.sz == 8) {
3212         *p++ = 0xF2;
3213      } else
3214      if (i->Ain.SseLdSt.sz == 4) {
3215         *p++ = 0xF3;
3216      } else
3217      if (i->Ain.SseLdSt.sz != 16) {
3218         vassert(0);
3219      }
3220      *p++ = clearWBit(
3221             rexAMode_M( vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr));
3222      *p++ = 0x0F;
3223      *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3224      p = doAMode_M(p, vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr);
3225      goto done;
3226
3227   case Ain_SseLdzLO:
3228      vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3229      /* movs[sd] amode, %xmm-dst */
3230      *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3231      *p++ = clearWBit(
3232             rexAMode_M(vreg2ireg(i->Ain.SseLdzLO.reg),
3233                        i->Ain.SseLdzLO.addr));
3234      *p++ = 0x0F;
3235      *p++ = 0x10;
3236      p = doAMode_M(p, vreg2ireg(i->Ain.SseLdzLO.reg),
3237                       i->Ain.SseLdzLO.addr);
3238      goto done;
3239
3240   case Ain_Sse32Fx4:
3241      xtra = 0;
3242      *p++ = clearWBit(
3243             rexAMode_R( vreg2ireg(i->Ain.Sse32Fx4.dst),
3244                         vreg2ireg(i->Ain.Sse32Fx4.src) ));
3245      *p++ = 0x0F;
3246      switch (i->Ain.Sse32Fx4.op) {
3247         case Asse_ADDF:   *p++ = 0x58; break;
3248         case Asse_DIVF:   *p++ = 0x5E; break;
3249         case Asse_MAXF:   *p++ = 0x5F; break;
3250         case Asse_MINF:   *p++ = 0x5D; break;
3251         case Asse_MULF:   *p++ = 0x59; break;
3252         case Asse_RCPF:   *p++ = 0x53; break;
3253         case Asse_RSQRTF: *p++ = 0x52; break;
3254         case Asse_SQRTF:  *p++ = 0x51; break;
3255         case Asse_SUBF:   *p++ = 0x5C; break;
3256         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3257         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3258         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3259         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3260         default: goto bad;
3261      }
3262      p = doAMode_R(p, vreg2ireg(i->Ain.Sse32Fx4.dst),
3263                       vreg2ireg(i->Ain.Sse32Fx4.src) );
3264      if (xtra & 0x100)
3265         *p++ = toUChar(xtra & 0xFF);
3266      goto done;
3267
3268   case Ain_Sse64Fx2:
3269      xtra = 0;
3270      *p++ = 0x66;
3271      *p++ = clearWBit(
3272             rexAMode_R( vreg2ireg(i->Ain.Sse64Fx2.dst),
3273                         vreg2ireg(i->Ain.Sse64Fx2.src) ));
3274      *p++ = 0x0F;
3275      switch (i->Ain.Sse64Fx2.op) {
3276         case Asse_ADDF:   *p++ = 0x58; break;
3277         case Asse_DIVF:   *p++ = 0x5E; break;
3278         case Asse_MAXF:   *p++ = 0x5F; break;
3279         case Asse_MINF:   *p++ = 0x5D; break;
3280         case Asse_MULF:   *p++ = 0x59; break;
3281         case Asse_SQRTF:  *p++ = 0x51; break;
3282         case Asse_SUBF:   *p++ = 0x5C; break;
3283         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3284         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3285         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3286         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3287         default: goto bad;
3288      }
3289      p = doAMode_R(p, vreg2ireg(i->Ain.Sse64Fx2.dst),
3290                       vreg2ireg(i->Ain.Sse64Fx2.src) );
3291      if (xtra & 0x100)
3292         *p++ = toUChar(xtra & 0xFF);
3293      goto done;
3294
3295   case Ain_Sse32FLo:
3296      xtra = 0;
3297      *p++ = 0xF3;
3298      *p++ = clearWBit(
3299             rexAMode_R( vreg2ireg(i->Ain.Sse32FLo.dst),
3300                         vreg2ireg(i->Ain.Sse32FLo.src) ));
3301      *p++ = 0x0F;
3302      switch (i->Ain.Sse32FLo.op) {
3303         case Asse_ADDF:   *p++ = 0x58; break;
3304         case Asse_DIVF:   *p++ = 0x5E; break;
3305         case Asse_MAXF:   *p++ = 0x5F; break;
3306         case Asse_MINF:   *p++ = 0x5D; break;
3307         case Asse_MULF:   *p++ = 0x59; break;
3308         case Asse_RCPF:   *p++ = 0x53; break;
3309         case Asse_RSQRTF: *p++ = 0x52; break;
3310         case Asse_SQRTF:  *p++ = 0x51; break;
3311         case Asse_SUBF:   *p++ = 0x5C; break;
3312         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3313         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3314         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3315         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3316         default: goto bad;
3317      }
3318      p = doAMode_R(p, vreg2ireg(i->Ain.Sse32FLo.dst),
3319                       vreg2ireg(i->Ain.Sse32FLo.src) );
3320      if (xtra & 0x100)
3321         *p++ = toUChar(xtra & 0xFF);
3322      goto done;
3323
3324   case Ain_Sse64FLo:
3325      xtra = 0;
3326      *p++ = 0xF2;
3327      *p++ = clearWBit(
3328             rexAMode_R( vreg2ireg(i->Ain.Sse64FLo.dst),
3329                         vreg2ireg(i->Ain.Sse64FLo.src) ));
3330      *p++ = 0x0F;
3331      switch (i->Ain.Sse64FLo.op) {
3332         case Asse_ADDF:   *p++ = 0x58; break;
3333         case Asse_DIVF:   *p++ = 0x5E; break;
3334         case Asse_MAXF:   *p++ = 0x5F; break;
3335         case Asse_MINF:   *p++ = 0x5D; break;
3336         case Asse_MULF:   *p++ = 0x59; break;
3337         case Asse_SQRTF:  *p++ = 0x51; break;
3338         case Asse_SUBF:   *p++ = 0x5C; break;
3339         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3340         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3341         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3342         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3343         default: goto bad;
3344      }
3345      p = doAMode_R(p, vreg2ireg(i->Ain.Sse64FLo.dst),
3346                       vreg2ireg(i->Ain.Sse64FLo.src) );
3347      if (xtra & 0x100)
3348         *p++ = toUChar(xtra & 0xFF);
3349      goto done;
3350
3351   case Ain_SseReRg:
3352#     define XX(_n) *p++ = (_n)
3353
3354      rex = clearWBit(
3355            rexAMode_R( vreg2ireg(i->Ain.SseReRg.dst),
3356                        vreg2ireg(i->Ain.SseReRg.src) ));
3357
3358      switch (i->Ain.SseReRg.op) {
3359         case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3360         case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
3361         case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
3362         case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
3363         case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
3364         case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3365         case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3366         case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3367         case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3368         case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3369         case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3370         case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3371         case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3372         case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3373         case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3374         case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3375         case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3376         case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3377         case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3378         case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3379         case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3380         case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3381         case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3382         case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3383         case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3384         case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3385         case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3386         case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3387         case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3388         case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3389         case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3390         case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3391         case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
3392         case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
3393         case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
3394         case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
3395         case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
3396         case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
3397         case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
3398         case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
3399         case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
3400         case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
3401         case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
3402         case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
3403         case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
3404         case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
3405         case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
3406         case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
3407         case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
3408         case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
3409         case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
3410         case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
3411         case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
3412         case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
3413         case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
3414         default: goto bad;
3415      }
3416      p = doAMode_R(p, vreg2ireg(i->Ain.SseReRg.dst),
3417                       vreg2ireg(i->Ain.SseReRg.src) );
3418#     undef XX
3419      goto done;
3420
3421   case Ain_SseCMov:
3422      /* jmp fwds if !condition */
3423      *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
3424      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3425      ptmp = p;
3426
3427      /* movaps %src, %dst */
3428      *p++ = clearWBit(
3429             rexAMode_R( vreg2ireg(i->Ain.SseCMov.dst),
3430                         vreg2ireg(i->Ain.SseCMov.src) ));
3431      *p++ = 0x0F;
3432      *p++ = 0x28;
3433      p = doAMode_R(p, vreg2ireg(i->Ain.SseCMov.dst),
3434                       vreg2ireg(i->Ain.SseCMov.src) );
3435
3436      /* Fill in the jump offset. */
3437      *(ptmp-1) = toUChar(p - ptmp);
3438      goto done;
3439
3440   case Ain_SseShuf:
3441      *p++ = 0x66;
3442      *p++ = clearWBit(
3443             rexAMode_R( vreg2ireg(i->Ain.SseShuf.dst),
3444                         vreg2ireg(i->Ain.SseShuf.src) ));
3445      *p++ = 0x0F;
3446      *p++ = 0x70;
3447      p = doAMode_R(p, vreg2ireg(i->Ain.SseShuf.dst),
3448                       vreg2ireg(i->Ain.SseShuf.src) );
3449      *p++ = (UChar)(i->Ain.SseShuf.order);
3450      goto done;
3451
3452   //uu case Ain_AvxLdSt: {
3453   //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
3454   //uu                           i->Ain.AvxLdSt.addr );
3455   //uu    p = emitVexPrefix(p, vex);
3456   //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
3457   //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
3458   //uu      goto done;
3459   //uu }
3460
3461   case Ain_EvCheck: {
3462      /* We generate:
3463            (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
3464            (2 bytes)  jns  nofail     expected taken
3465            (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
3466            nofail:
3467      */
3468      /* This is heavily asserted re instruction lengths.  It needs to
3469         be.  If we get given unexpected forms of .amCounter or
3470         .amFailAddr -- basically, anything that's not of the form
3471         uimm7(%rbp) -- they are likely to fail. */
3472      /* Note also that after the decl we must be very careful not to
3473         read the carry flag, else we get a partial flags stall.
3474         js/jns avoids that, though. */
3475      UChar* p0 = p;
3476      /* ---  decl 8(%rbp) --- */
3477      /* Need to compute the REX byte for the decl in order to prove
3478         that we don't need it, since this is a 32-bit inc and all
3479         registers involved in the amode are < r8.  "fake(1)" because
3480         there's no register in this encoding; instead the register
3481         field is used as a sub opcode.  The encoding for "decl r/m32"
3482         is FF /1, hence the fake(1). */
3483      rex = clearWBit(rexAMode_M(fake(1), i->Ain.EvCheck.amCounter));
3484      if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
3485      *p++ = 0xFF;
3486      p = doAMode_M(p, fake(1), i->Ain.EvCheck.amCounter);
3487      vassert(p - p0 == 3);
3488      /* --- jns nofail --- */
3489      *p++ = 0x79;
3490      *p++ = 0x03; /* need to check this 0x03 after the next insn */
3491      vassert(p - p0 == 5);
3492      /* --- jmp* 0(%rbp) --- */
3493      /* Once again, verify we don't need REX.  The encoding is FF /4.
3494         We don't need REX.W since by default FF /4 in 64-bit mode
3495         implies a 64 bit load. */
3496      rex = clearWBit(rexAMode_M(fake(4), i->Ain.EvCheck.amFailAddr));
3497      if (rex != 0x40) goto bad;
3498      *p++ = 0xFF;
3499      p = doAMode_M(p, fake(4), i->Ain.EvCheck.amFailAddr);
3500      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3501      /* And crosscheck .. */
3502      vassert(evCheckSzB_AMD64() == 8);
3503      goto done;
3504   }
3505
3506   case Ain_ProfInc: {
3507      /* We generate   movabsq $0, %r11
3508                       incq (%r11)
3509         in the expectation that a later call to LibVEX_patchProfCtr
3510         will be used to fill in the immediate field once the right
3511         value is known.
3512         49 BB 00 00 00 00 00 00 00 00
3513         49 FF 03
3514      */
3515      *p++ = 0x49; *p++ = 0xBB;
3516      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3517      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3518      *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
3519      /* Tell the caller .. */
3520      vassert(!(*is_profInc));
3521      *is_profInc = True;
3522      goto done;
3523   }
3524
3525   default:
3526      goto bad;
3527   }
3528
3529  bad:
3530   ppAMD64Instr(i, mode64);
3531   vpanic("emit_AMD64Instr");
3532   /*NOTREACHED*/
3533
3534  done:
3535   vassert(p - &buf[0] <= 32);
3536   return p - &buf[0];
3537
3538#  undef fake
3539}
3540
3541
3542/* How big is an event check?  See case for Ain_EvCheck in
3543   emit_AMD64Instr just above.  That crosschecks what this returns, so
3544   we can tell if we're inconsistent. */
3545Int evCheckSzB_AMD64 ( void )
3546{
3547   return 8;
3548}
3549
3550
3551/* NB: what goes on here has to be very closely coordinated with the
3552   emitInstr case for XDirect, above. */
3553VexInvalRange chainXDirect_AMD64 ( void* place_to_chain,
3554                                   void* disp_cp_chain_me_EXPECTED,
3555                                   void* place_to_jump_to )
3556{
3557   /* What we're expecting to see is:
3558        movabsq $disp_cp_chain_me_EXPECTED, %r11
3559        call *%r11
3560      viz
3561        49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
3562        41 FF D3
3563   */
3564   UChar* p = (UChar*)place_to_chain;
3565   vassert(p[0] == 0x49);
3566   vassert(p[1] == 0xBB);
3567   vassert(*(ULong*)(&p[2]) == Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
3568   vassert(p[10] == 0x41);
3569   vassert(p[11] == 0xFF);
3570   vassert(p[12] == 0xD3);
3571   /* And what we want to change it to is either:
3572        (general case):
3573          movabsq $place_to_jump_to, %r11
3574          jmpq *%r11
3575        viz
3576          49 BB <8 bytes value == place_to_jump_to>
3577          41 FF E3
3578        So it's the same length (convenient, huh) and we don't
3579        need to change all the bits.
3580      ---OR---
3581        in the case where the displacement falls within 32 bits
3582          jmpq disp32   where disp32 is relative to the next insn
3583          ud2; ud2; ud2; ud2
3584        viz
3585          E9 <4 bytes == disp32>
3586          0F 0B 0F 0B 0F 0B 0F 0B
3587
3588      In both cases the replacement has the same length as the original.
3589      To remain sane & verifiable,
3590      (1) limit the displacement for the short form to
3591          (say) +/- one billion, so as to avoid wraparound
3592          off-by-ones
3593      (2) even if the short form is applicable, once every (say)
3594          1024 times use the long form anyway, so as to maintain
3595          verifiability
3596   */
3597   /* This is the delta we need to put into a JMP d32 insn.  It's
3598      relative to the start of the next insn, hence the -5.  */
3599   Long delta   = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
3600   Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
3601
3602   static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
3603   if (shortOK) {
3604      shortCTR++; // thread safety bleh
3605      if (0 == (shortCTR & 0x3FF)) {
3606         shortOK = False;
3607         if (0)
3608            vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
3609                       "using long jmp\n", shortCTR);
3610      }
3611   }
3612
3613   /* And make the modifications. */
3614   if (shortOK) {
3615      p[0]  = 0xE9;
3616      p[1]  = (delta >> 0) & 0xFF;
3617      p[2]  = (delta >> 8) & 0xFF;
3618      p[3]  = (delta >> 16) & 0xFF;
3619      p[4]  = (delta >> 24) & 0xFF;
3620      p[5]  = 0x0F; p[6]  = 0x0B;
3621      p[7]  = 0x0F; p[8]  = 0x0B;
3622      p[9]  = 0x0F; p[10] = 0x0B;
3623      p[11] = 0x0F; p[12] = 0x0B;
3624      /* sanity check on the delta -- top 32 are all 0 or all 1 */
3625      delta >>= 32;
3626      vassert(delta == 0LL || delta == -1LL);
3627   } else {
3628      /* Minimal modifications from the starting sequence. */
3629      *(ULong*)(&p[2]) = Ptr_to_ULong(place_to_jump_to);
3630      p[12] = 0xE3;
3631   }
3632   VexInvalRange vir = { (HWord)place_to_chain, 13 };
3633   return vir;
3634}
3635
3636
3637/* NB: what goes on here has to be very closely coordinated with the
3638   emitInstr case for XDirect, above. */
3639VexInvalRange unchainXDirect_AMD64 ( void* place_to_unchain,
3640                                     void* place_to_jump_to_EXPECTED,
3641                                     void* disp_cp_chain_me )
3642{
3643   /* What we're expecting to see is either:
3644        (general case)
3645          movabsq $place_to_jump_to_EXPECTED, %r11
3646          jmpq *%r11
3647        viz
3648          49 BB <8 bytes value == place_to_jump_to_EXPECTED>
3649          41 FF E3
3650      ---OR---
3651        in the case where the displacement falls within 32 bits
3652          jmpq d32
3653          ud2; ud2; ud2; ud2
3654        viz
3655          E9 <4 bytes == disp32>
3656          0F 0B 0F 0B 0F 0B 0F 0B
3657   */
3658   UChar* p     = (UChar*)place_to_unchain;
3659   Bool   valid = False;
3660   if (p[0] == 0x49 && p[1] == 0xBB
3661       && *(ULong*)(&p[2]) == Ptr_to_ULong(place_to_jump_to_EXPECTED)
3662       && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
3663      /* it's the long form */
3664      valid = True;
3665   }
3666   else
3667   if (p[0] == 0xE9
3668       && p[5]  == 0x0F && p[6]  == 0x0B
3669       && p[7]  == 0x0F && p[8]  == 0x0B
3670       && p[9]  == 0x0F && p[10] == 0x0B
3671       && p[11] == 0x0F && p[12] == 0x0B) {
3672      /* It's the short form.  Check the offset is right. */
3673      Int  s32 = *(Int*)(&p[1]);
3674      Long s64 = (Long)s32;
3675      if ((UChar*)p + 5 + s64 == (UChar*)place_to_jump_to_EXPECTED) {
3676         valid = True;
3677         if (0)
3678            vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
3679      }
3680   }
3681   vassert(valid);
3682   /* And what we want to change it to is:
3683        movabsq $disp_cp_chain_me, %r11
3684        call *%r11
3685      viz
3686        49 BB <8 bytes value == disp_cp_chain_me>
3687        41 FF D3
3688      So it's the same length (convenient, huh).
3689   */
3690   p[0] = 0x49;
3691   p[1] = 0xBB;
3692   *(ULong*)(&p[2]) = Ptr_to_ULong(disp_cp_chain_me);
3693   p[10] = 0x41;
3694   p[11] = 0xFF;
3695   p[12] = 0xD3;
3696   VexInvalRange vir = { (HWord)place_to_unchain, 13 };
3697   return vir;
3698}
3699
3700
3701/* Patch the counter address into a profile inc point, as previously
3702   created by the Ain_ProfInc case for emit_AMD64Instr. */
3703VexInvalRange patchProfInc_AMD64 ( void*  place_to_patch,
3704                                   ULong* location_of_counter )
3705{
3706   vassert(sizeof(ULong*) == 8);
3707   UChar* p = (UChar*)place_to_patch;
3708   vassert(p[0] == 0x49);
3709   vassert(p[1] == 0xBB);
3710   vassert(p[2] == 0x00);
3711   vassert(p[3] == 0x00);
3712   vassert(p[4] == 0x00);
3713   vassert(p[5] == 0x00);
3714   vassert(p[6] == 0x00);
3715   vassert(p[7] == 0x00);
3716   vassert(p[8] == 0x00);
3717   vassert(p[9] == 0x00);
3718   vassert(p[10] == 0x49);
3719   vassert(p[11] == 0xFF);
3720   vassert(p[12] == 0x03);
3721   ULong imm64 = (ULong)Ptr_to_ULong(location_of_counter);
3722   p[2] = imm64 & 0xFF; imm64 >>= 8;
3723   p[3] = imm64 & 0xFF; imm64 >>= 8;
3724   p[4] = imm64 & 0xFF; imm64 >>= 8;
3725   p[5] = imm64 & 0xFF; imm64 >>= 8;
3726   p[6] = imm64 & 0xFF; imm64 >>= 8;
3727   p[7] = imm64 & 0xFF; imm64 >>= 8;
3728   p[8] = imm64 & 0xFF; imm64 >>= 8;
3729   p[9] = imm64 & 0xFF; imm64 >>= 8;
3730   VexInvalRange vir = { (HWord)place_to_patch, 13 };
3731   return vir;
3732}
3733
3734
3735/*---------------------------------------------------------------*/
3736/*--- end                                   host_amd64_defs.c ---*/
3737/*---------------------------------------------------------------*/
3738