1
2/*---------------------------------------------------------------*/
3/*--- begin                                 host_amd64_defs.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2012 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex.h"
38#include "libvex_trc_values.h"
39
40#include "main_util.h"
41#include "host_generic_regs.h"
42#include "host_amd64_defs.h"
43
44
45/* --------- Registers. --------- */
46
47void ppHRegAMD64 ( HReg reg )
48{
49   Int r;
50   static HChar* ireg64_names[16]
51     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
52         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
53   /* Be generic for all virtual regs. */
54   if (hregIsVirtual(reg)) {
55      ppHReg(reg);
56      return;
57   }
58   /* But specific for real regs. */
59   switch (hregClass(reg)) {
60      case HRcInt64:
61         r = hregNumber(reg);
62         vassert(r >= 0 && r < 16);
63         vex_printf("%s", ireg64_names[r]);
64         return;
65      case HRcFlt64:
66         r = hregNumber(reg);
67         vassert(r >= 0 && r < 6);
68         vex_printf("%%fake%d", r);
69         return;
70      case HRcVec128:
71         r = hregNumber(reg);
72         vassert(r >= 0 && r < 16);
73         vex_printf("%%xmm%d", r);
74         return;
75      default:
76         vpanic("ppHRegAMD64");
77   }
78}
79
80static void ppHRegAMD64_lo32 ( HReg reg )
81{
82   Int r;
83   static HChar* ireg32_names[16]
84     = { "%eax",  "%ecx",  "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
85         "%r8d",  "%r9d",  "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
86   /* Be generic for all virtual regs. */
87   if (hregIsVirtual(reg)) {
88      ppHReg(reg);
89      vex_printf("d");
90      return;
91   }
92   /* But specific for real regs. */
93   switch (hregClass(reg)) {
94      case HRcInt64:
95         r = hregNumber(reg);
96         vassert(r >= 0 && r < 16);
97         vex_printf("%s", ireg32_names[r]);
98         return;
99      default:
100         vpanic("ppHRegAMD64_lo32: invalid regclass");
101   }
102}
103
104HReg hregAMD64_RAX ( void ) { return mkHReg( 0, HRcInt64, False); }
105HReg hregAMD64_RCX ( void ) { return mkHReg( 1, HRcInt64, False); }
106HReg hregAMD64_RDX ( void ) { return mkHReg( 2, HRcInt64, False); }
107HReg hregAMD64_RBX ( void ) { return mkHReg( 3, HRcInt64, False); }
108HReg hregAMD64_RSP ( void ) { return mkHReg( 4, HRcInt64, False); }
109HReg hregAMD64_RBP ( void ) { return mkHReg( 5, HRcInt64, False); }
110HReg hregAMD64_RSI ( void ) { return mkHReg( 6, HRcInt64, False); }
111HReg hregAMD64_RDI ( void ) { return mkHReg( 7, HRcInt64, False); }
112HReg hregAMD64_R8  ( void ) { return mkHReg( 8, HRcInt64, False); }
113HReg hregAMD64_R9  ( void ) { return mkHReg( 9, HRcInt64, False); }
114HReg hregAMD64_R10 ( void ) { return mkHReg(10, HRcInt64, False); }
115HReg hregAMD64_R11 ( void ) { return mkHReg(11, HRcInt64, False); }
116HReg hregAMD64_R12 ( void ) { return mkHReg(12, HRcInt64, False); }
117HReg hregAMD64_R13 ( void ) { return mkHReg(13, HRcInt64, False); }
118HReg hregAMD64_R14 ( void ) { return mkHReg(14, HRcInt64, False); }
119HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); }
120
121HReg hregAMD64_XMM0  ( void ) { return mkHReg( 0, HRcVec128, False); }
122HReg hregAMD64_XMM1  ( void ) { return mkHReg( 1, HRcVec128, False); }
123HReg hregAMD64_XMM3  ( void ) { return mkHReg( 3, HRcVec128, False); }
124HReg hregAMD64_XMM4  ( void ) { return mkHReg( 4, HRcVec128, False); }
125HReg hregAMD64_XMM5  ( void ) { return mkHReg( 5, HRcVec128, False); }
126HReg hregAMD64_XMM6  ( void ) { return mkHReg( 6, HRcVec128, False); }
127HReg hregAMD64_XMM7  ( void ) { return mkHReg( 7, HRcVec128, False); }
128HReg hregAMD64_XMM8  ( void ) { return mkHReg( 8, HRcVec128, False); }
129HReg hregAMD64_XMM9  ( void ) { return mkHReg( 9, HRcVec128, False); }
130HReg hregAMD64_XMM10 ( void ) { return mkHReg(10, HRcVec128, False); }
131HReg hregAMD64_XMM11 ( void ) { return mkHReg(11, HRcVec128, False); }
132HReg hregAMD64_XMM12 ( void ) { return mkHReg(12, HRcVec128, False); }
133
134
135void getAllocableRegs_AMD64 ( Int* nregs, HReg** arr )
136{
137#if 0
138   *nregs = 6;
139   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
140   (*arr)[ 0] = hregAMD64_RSI();
141   (*arr)[ 1] = hregAMD64_RDI();
142   (*arr)[ 2] = hregAMD64_RBX();
143
144   (*arr)[ 3] = hregAMD64_XMM7();
145   (*arr)[ 4] = hregAMD64_XMM8();
146   (*arr)[ 5] = hregAMD64_XMM9();
147#endif
148#if 1
149   *nregs = 20;
150   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
151   (*arr)[ 0] = hregAMD64_RSI();
152   (*arr)[ 1] = hregAMD64_RDI();
153   (*arr)[ 2] = hregAMD64_R8();
154   (*arr)[ 3] = hregAMD64_R9();
155   (*arr)[ 4] = hregAMD64_R12();
156   (*arr)[ 5] = hregAMD64_R13();
157   (*arr)[ 6] = hregAMD64_R14();
158   (*arr)[ 7] = hregAMD64_R15();
159   (*arr)[ 8] = hregAMD64_RBX();
160
161   (*arr)[ 9] = hregAMD64_XMM3();
162   (*arr)[10] = hregAMD64_XMM4();
163   (*arr)[11] = hregAMD64_XMM5();
164   (*arr)[12] = hregAMD64_XMM6();
165   (*arr)[13] = hregAMD64_XMM7();
166   (*arr)[14] = hregAMD64_XMM8();
167   (*arr)[15] = hregAMD64_XMM9();
168   (*arr)[16] = hregAMD64_XMM10();
169   (*arr)[17] = hregAMD64_XMM11();
170   (*arr)[18] = hregAMD64_XMM12();
171   (*arr)[19] = hregAMD64_R10();
172#endif
173}
174
175
176/* --------- Condition codes, Intel encoding. --------- */
177
178HChar* showAMD64CondCode ( AMD64CondCode cond )
179{
180   switch (cond) {
181      case Acc_O:      return "o";
182      case Acc_NO:     return "no";
183      case Acc_B:      return "b";
184      case Acc_NB:     return "nb";
185      case Acc_Z:      return "z";
186      case Acc_NZ:     return "nz";
187      case Acc_BE:     return "be";
188      case Acc_NBE:    return "nbe";
189      case Acc_S:      return "s";
190      case Acc_NS:     return "ns";
191      case Acc_P:      return "p";
192      case Acc_NP:     return "np";
193      case Acc_L:      return "l";
194      case Acc_NL:     return "nl";
195      case Acc_LE:     return "le";
196      case Acc_NLE:    return "nle";
197      case Acc_ALWAYS: return "ALWAYS";
198      default: vpanic("ppAMD64CondCode");
199   }
200}
201
202
203/* --------- AMD64AMode: memory address expressions. --------- */
204
205AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
206   AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
207   am->tag        = Aam_IR;
208   am->Aam.IR.imm = imm32;
209   am->Aam.IR.reg = reg;
210   return am;
211}
212AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
213   AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
214   am->tag = Aam_IRRS;
215   am->Aam.IRRS.imm   = imm32;
216   am->Aam.IRRS.base  = base;
217   am->Aam.IRRS.index = indEx;
218   am->Aam.IRRS.shift = shift;
219   vassert(shift >= 0 && shift <= 3);
220   return am;
221}
222
223void ppAMD64AMode ( AMD64AMode* am ) {
224   switch (am->tag) {
225      case Aam_IR:
226         if (am->Aam.IR.imm == 0)
227            vex_printf("(");
228         else
229            vex_printf("0x%x(", am->Aam.IR.imm);
230         ppHRegAMD64(am->Aam.IR.reg);
231         vex_printf(")");
232         return;
233      case Aam_IRRS:
234         vex_printf("0x%x(", am->Aam.IRRS.imm);
235         ppHRegAMD64(am->Aam.IRRS.base);
236         vex_printf(",");
237         ppHRegAMD64(am->Aam.IRRS.index);
238         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
239         return;
240      default:
241         vpanic("ppAMD64AMode");
242   }
243}
244
245static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
246   switch (am->tag) {
247      case Aam_IR:
248         addHRegUse(u, HRmRead, am->Aam.IR.reg);
249         return;
250      case Aam_IRRS:
251         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
252         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
253         return;
254      default:
255         vpanic("addRegUsage_AMD64AMode");
256   }
257}
258
259static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
260   switch (am->tag) {
261      case Aam_IR:
262         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
263         return;
264      case Aam_IRRS:
265         am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
266         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
267         return;
268      default:
269         vpanic("mapRegs_AMD64AMode");
270   }
271}
272
273/* --------- Operand, which can be reg, immediate or memory. --------- */
274
275AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
276   AMD64RMI* op       = LibVEX_Alloc(sizeof(AMD64RMI));
277   op->tag            = Armi_Imm;
278   op->Armi.Imm.imm32 = imm32;
279   return op;
280}
281AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
282   AMD64RMI* op     = LibVEX_Alloc(sizeof(AMD64RMI));
283   op->tag          = Armi_Reg;
284   op->Armi.Reg.reg = reg;
285   return op;
286}
287AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
288   AMD64RMI* op    = LibVEX_Alloc(sizeof(AMD64RMI));
289   op->tag         = Armi_Mem;
290   op->Armi.Mem.am = am;
291   return op;
292}
293
294static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
295   switch (op->tag) {
296      case Armi_Imm:
297         vex_printf("$0x%x", op->Armi.Imm.imm32);
298         return;
299      case Armi_Reg:
300         if (lo32)
301            ppHRegAMD64_lo32(op->Armi.Reg.reg);
302         else
303            ppHRegAMD64(op->Armi.Reg.reg);
304         return;
305      case Armi_Mem:
306         ppAMD64AMode(op->Armi.Mem.am);
307         return;
308     default:
309         vpanic("ppAMD64RMI");
310   }
311}
312void ppAMD64RMI ( AMD64RMI* op ) {
313   ppAMD64RMI_wrk(op, False/*!lo32*/);
314}
315void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
316   ppAMD64RMI_wrk(op, True/*lo32*/);
317}
318
319/* An AMD64RMI can only be used in a "read" context (what would it mean
320   to write or modify a literal?) and so we enumerate its registers
321   accordingly. */
322static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
323   switch (op->tag) {
324      case Armi_Imm:
325         return;
326      case Armi_Reg:
327         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
328         return;
329      case Armi_Mem:
330         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
331         return;
332      default:
333         vpanic("addRegUsage_AMD64RMI");
334   }
335}
336
337static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
338   switch (op->tag) {
339      case Armi_Imm:
340         return;
341      case Armi_Reg:
342         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
343         return;
344      case Armi_Mem:
345         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
346         return;
347      default:
348         vpanic("mapRegs_AMD64RMI");
349   }
350}
351
352
353/* --------- Operand, which can be reg or immediate only. --------- */
354
355AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
356   AMD64RI* op       = LibVEX_Alloc(sizeof(AMD64RI));
357   op->tag           = Ari_Imm;
358   op->Ari.Imm.imm32 = imm32;
359   return op;
360}
361AMD64RI* AMD64RI_Reg ( HReg reg ) {
362   AMD64RI* op     = LibVEX_Alloc(sizeof(AMD64RI));
363   op->tag         = Ari_Reg;
364   op->Ari.Reg.reg = reg;
365   return op;
366}
367
368void ppAMD64RI ( AMD64RI* op ) {
369   switch (op->tag) {
370      case Ari_Imm:
371         vex_printf("$0x%x", op->Ari.Imm.imm32);
372         return;
373      case Ari_Reg:
374         ppHRegAMD64(op->Ari.Reg.reg);
375         return;
376     default:
377         vpanic("ppAMD64RI");
378   }
379}
380
381/* An AMD64RI can only be used in a "read" context (what would it mean
382   to write or modify a literal?) and so we enumerate its registers
383   accordingly. */
384static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
385   switch (op->tag) {
386      case Ari_Imm:
387         return;
388      case Ari_Reg:
389         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
390         return;
391      default:
392         vpanic("addRegUsage_AMD64RI");
393   }
394}
395
396static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
397   switch (op->tag) {
398      case Ari_Imm:
399         return;
400      case Ari_Reg:
401         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
402         return;
403      default:
404         vpanic("mapRegs_AMD64RI");
405   }
406}
407
408
409/* --------- Operand, which can be reg or memory only. --------- */
410
411AMD64RM* AMD64RM_Reg ( HReg reg ) {
412   AMD64RM* op       = LibVEX_Alloc(sizeof(AMD64RM));
413   op->tag         = Arm_Reg;
414   op->Arm.Reg.reg = reg;
415   return op;
416}
417AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
418   AMD64RM* op    = LibVEX_Alloc(sizeof(AMD64RM));
419   op->tag        = Arm_Mem;
420   op->Arm.Mem.am = am;
421   return op;
422}
423
424void ppAMD64RM ( AMD64RM* op ) {
425   switch (op->tag) {
426      case Arm_Mem:
427         ppAMD64AMode(op->Arm.Mem.am);
428         return;
429      case Arm_Reg:
430         ppHRegAMD64(op->Arm.Reg.reg);
431         return;
432     default:
433         vpanic("ppAMD64RM");
434   }
435}
436
437/* Because an AMD64RM can be both a source or destination operand, we
438   have to supply a mode -- pertaining to the operand as a whole --
439   indicating how it's being used. */
440static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
441   switch (op->tag) {
442      case Arm_Mem:
443         /* Memory is read, written or modified.  So we just want to
444            know the regs read by the amode. */
445         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
446         return;
447      case Arm_Reg:
448         /* reg is read, written or modified.  Add it in the
449            appropriate way. */
450         addHRegUse(u, mode, op->Arm.Reg.reg);
451         return;
452     default:
453         vpanic("addRegUsage_AMD64RM");
454   }
455}
456
457static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
458{
459   switch (op->tag) {
460      case Arm_Mem:
461         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
462         return;
463      case Arm_Reg:
464         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
465         return;
466     default:
467         vpanic("mapRegs_AMD64RM");
468   }
469}
470
471
472/* --------- Instructions. --------- */
473
474static HChar* showAMD64ScalarSz ( Int sz ) {
475   switch (sz) {
476      case 2: return "w";
477      case 4: return "l";
478      case 8: return "q";
479      default: vpanic("showAMD64ScalarSz");
480   }
481}
482
483HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
484   switch (op) {
485      case Aun_NOT: return "not";
486      case Aun_NEG: return "neg";
487      default: vpanic("showAMD64UnaryOp");
488   }
489}
490
491HChar* showAMD64AluOp ( AMD64AluOp op ) {
492   switch (op) {
493      case Aalu_MOV:  return "mov";
494      case Aalu_CMP:  return "cmp";
495      case Aalu_ADD:  return "add";
496      case Aalu_SUB:  return "sub";
497      case Aalu_ADC:  return "adc";
498      case Aalu_SBB:  return "sbb";
499      case Aalu_AND:  return "and";
500      case Aalu_OR:   return "or";
501      case Aalu_XOR:  return "xor";
502      case Aalu_MUL:  return "imul";
503      default: vpanic("showAMD64AluOp");
504   }
505}
506
507HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
508   switch (op) {
509      case Ash_SHL: return "shl";
510      case Ash_SHR: return "shr";
511      case Ash_SAR: return "sar";
512      default: vpanic("showAMD64ShiftOp");
513   }
514}
515
516HChar* showA87FpOp ( A87FpOp op ) {
517   switch (op) {
518      case Afp_SCALE:  return "scale";
519      case Afp_ATAN:   return "atan";
520      case Afp_YL2X:   return "yl2x";
521      case Afp_YL2XP1: return "yl2xp1";
522      case Afp_PREM:   return "prem";
523      case Afp_PREM1:  return "prem1";
524      case Afp_SQRT:   return "sqrt";
525      case Afp_SIN:    return "sin";
526      case Afp_COS:    return "cos";
527      case Afp_TAN:    return "tan";
528      case Afp_ROUND:  return "round";
529      case Afp_2XM1:   return "2xm1";
530      default: vpanic("showA87FpOp");
531   }
532}
533
534HChar* showAMD64SseOp ( AMD64SseOp op ) {
535   switch (op) {
536      case Asse_MOV:      return "movups";
537      case Asse_ADDF:     return "add";
538      case Asse_SUBF:     return "sub";
539      case Asse_MULF:     return "mul";
540      case Asse_DIVF:     return "div";
541      case Asse_MAXF:     return "max";
542      case Asse_MINF:     return "min";
543      case Asse_CMPEQF:   return "cmpFeq";
544      case Asse_CMPLTF:   return "cmpFlt";
545      case Asse_CMPLEF:   return "cmpFle";
546      case Asse_CMPUNF:   return "cmpFun";
547      case Asse_RCPF:     return "rcp";
548      case Asse_RSQRTF:   return "rsqrt";
549      case Asse_SQRTF:    return "sqrt";
550      case Asse_AND:      return "and";
551      case Asse_OR:       return "or";
552      case Asse_XOR:      return "xor";
553      case Asse_ANDN:     return "andn";
554      case Asse_ADD8:     return "paddb";
555      case Asse_ADD16:    return "paddw";
556      case Asse_ADD32:    return "paddd";
557      case Asse_ADD64:    return "paddq";
558      case Asse_QADD8U:   return "paddusb";
559      case Asse_QADD16U:  return "paddusw";
560      case Asse_QADD8S:   return "paddsb";
561      case Asse_QADD16S:  return "paddsw";
562      case Asse_SUB8:     return "psubb";
563      case Asse_SUB16:    return "psubw";
564      case Asse_SUB32:    return "psubd";
565      case Asse_SUB64:    return "psubq";
566      case Asse_QSUB8U:   return "psubusb";
567      case Asse_QSUB16U:  return "psubusw";
568      case Asse_QSUB8S:   return "psubsb";
569      case Asse_QSUB16S:  return "psubsw";
570      case Asse_MUL16:    return "pmullw";
571      case Asse_MULHI16U: return "pmulhuw";
572      case Asse_MULHI16S: return "pmulhw";
573      case Asse_AVG8U:    return "pavgb";
574      case Asse_AVG16U:   return "pavgw";
575      case Asse_MAX16S:   return "pmaxw";
576      case Asse_MAX8U:    return "pmaxub";
577      case Asse_MIN16S:   return "pminw";
578      case Asse_MIN8U:    return "pminub";
579      case Asse_CMPEQ8:   return "pcmpeqb";
580      case Asse_CMPEQ16:  return "pcmpeqw";
581      case Asse_CMPEQ32:  return "pcmpeqd";
582      case Asse_CMPGT8S:  return "pcmpgtb";
583      case Asse_CMPGT16S: return "pcmpgtw";
584      case Asse_CMPGT32S: return "pcmpgtd";
585      case Asse_SHL16:    return "psllw";
586      case Asse_SHL32:    return "pslld";
587      case Asse_SHL64:    return "psllq";
588      case Asse_SHR16:    return "psrlw";
589      case Asse_SHR32:    return "psrld";
590      case Asse_SHR64:    return "psrlq";
591      case Asse_SAR16:    return "psraw";
592      case Asse_SAR32:    return "psrad";
593      case Asse_PACKSSD:  return "packssdw";
594      case Asse_PACKSSW:  return "packsswb";
595      case Asse_PACKUSW:  return "packuswb";
596      case Asse_UNPCKHB:  return "punpckhb";
597      case Asse_UNPCKHW:  return "punpckhw";
598      case Asse_UNPCKHD:  return "punpckhd";
599      case Asse_UNPCKHQ:  return "punpckhq";
600      case Asse_UNPCKLB:  return "punpcklb";
601      case Asse_UNPCKLW:  return "punpcklw";
602      case Asse_UNPCKLD:  return "punpckld";
603      case Asse_UNPCKLQ:  return "punpcklq";
604      default: vpanic("showAMD64SseOp");
605   }
606}
607
608AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
609   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
610   i->tag             = Ain_Imm64;
611   i->Ain.Imm64.imm64 = imm64;
612   i->Ain.Imm64.dst   = dst;
613   return i;
614}
615AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
616   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
617   i->tag            = Ain_Alu64R;
618   i->Ain.Alu64R.op  = op;
619   i->Ain.Alu64R.src = src;
620   i->Ain.Alu64R.dst = dst;
621   return i;
622}
623AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
624   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
625   i->tag            = Ain_Alu64M;
626   i->Ain.Alu64M.op  = op;
627   i->Ain.Alu64M.src = src;
628   i->Ain.Alu64M.dst = dst;
629   vassert(op != Aalu_MUL);
630   return i;
631}
632AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
633   AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
634   i->tag          = Ain_Sh64;
635   i->Ain.Sh64.op  = op;
636   i->Ain.Sh64.src = src;
637   i->Ain.Sh64.dst = dst;
638   return i;
639}
640AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
641   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
642   i->tag              = Ain_Test64;
643   i->Ain.Test64.imm32 = imm32;
644   i->Ain.Test64.dst   = dst;
645   return i;
646}
647AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
648   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
649   i->tag             = Ain_Unary64;
650   i->Ain.Unary64.op  = op;
651   i->Ain.Unary64.dst = dst;
652   return i;
653}
654AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
655   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
656   i->tag             = Ain_Lea64;
657   i->Ain.Lea64.am    = am;
658   i->Ain.Lea64.dst   = dst;
659   return i;
660}
661AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
662   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
663   i->tag            = Ain_Alu32R;
664   i->Ain.Alu32R.op  = op;
665   i->Ain.Alu32R.src = src;
666   i->Ain.Alu32R.dst = dst;
667   switch (op) {
668      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
669      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
670      default: vassert(0);
671   }
672   return i;
673}
674AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
675   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
676   i->tag            = Ain_MulL;
677   i->Ain.MulL.syned = syned;
678   i->Ain.MulL.src   = src;
679   return i;
680}
681AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
682   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
683   i->tag            = Ain_Div;
684   i->Ain.Div.syned  = syned;
685   i->Ain.Div.sz     = sz;
686   i->Ain.Div.src    = src;
687   vassert(sz == 4 || sz == 8);
688   return i;
689}
690AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
691   AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
692   i->tag          = Ain_Push;
693   i->Ain.Push.src = src;
694   return i;
695}
696AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms ) {
697   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
698   i->tag               = Ain_Call;
699   i->Ain.Call.cond     = cond;
700   i->Ain.Call.target   = target;
701   i->Ain.Call.regparms = regparms;
702   vassert(regparms >= 0 && regparms <= 6);
703   return i;
704}
705
706AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
707                                 AMD64CondCode cond, Bool toFastEP ) {
708   AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
709   i->tag                  = Ain_XDirect;
710   i->Ain.XDirect.dstGA    = dstGA;
711   i->Ain.XDirect.amRIP    = amRIP;
712   i->Ain.XDirect.cond     = cond;
713   i->Ain.XDirect.toFastEP = toFastEP;
714   return i;
715}
716AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
717                                AMD64CondCode cond ) {
718   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
719   i->tag              = Ain_XIndir;
720   i->Ain.XIndir.dstGA = dstGA;
721   i->Ain.XIndir.amRIP = amRIP;
722   i->Ain.XIndir.cond  = cond;
723   return i;
724}
725AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
726                                   AMD64CondCode cond, IRJumpKind jk ) {
727   AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
728   i->tag                 = Ain_XAssisted;
729   i->Ain.XAssisted.dstGA = dstGA;
730   i->Ain.XAssisted.amRIP = amRIP;
731   i->Ain.XAssisted.cond  = cond;
732   i->Ain.XAssisted.jk    = jk;
733   return i;
734}
735
736AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, AMD64RM* src, HReg dst ) {
737   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
738   i->tag             = Ain_CMov64;
739   i->Ain.CMov64.cond = cond;
740   i->Ain.CMov64.src  = src;
741   i->Ain.CMov64.dst  = dst;
742   vassert(cond != Acc_ALWAYS);
743   return i;
744}
745AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
746   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
747   i->tag              = Ain_MovxLQ;
748   i->Ain.MovxLQ.syned = syned;
749   i->Ain.MovxLQ.src   = src;
750   i->Ain.MovxLQ.dst   = dst;
751   return i;
752}
753AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
754                                AMD64AMode* src, HReg dst ) {
755   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
756   i->tag                = Ain_LoadEX;
757   i->Ain.LoadEX.szSmall = szSmall;
758   i->Ain.LoadEX.syned   = syned;
759   i->Ain.LoadEX.src     = src;
760   i->Ain.LoadEX.dst     = dst;
761   vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
762   return i;
763}
764AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
765   AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
766   i->tag           = Ain_Store;
767   i->Ain.Store.sz  = sz;
768   i->Ain.Store.src = src;
769   i->Ain.Store.dst = dst;
770   vassert(sz == 1 || sz == 2 || sz == 4);
771   return i;
772}
773AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
774   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
775   i->tag            = Ain_Set64;
776   i->Ain.Set64.cond = cond;
777   i->Ain.Set64.dst  = dst;
778   return i;
779}
780AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
781   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
782   i->tag               = Ain_Bsfr64;
783   i->Ain.Bsfr64.isFwds = isFwds;
784   i->Ain.Bsfr64.src    = src;
785   i->Ain.Bsfr64.dst    = dst;
786   return i;
787}
788AMD64Instr* AMD64Instr_MFence ( void ) {
789   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
790   i->tag        = Ain_MFence;
791   return i;
792}
793AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
794   AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
795   i->tag           = Ain_ACAS;
796   i->Ain.ACAS.addr = addr;
797   i->Ain.ACAS.sz   = sz;
798   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
799   return i;
800}
801AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
802   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
803   i->tag            = Ain_DACAS;
804   i->Ain.DACAS.addr = addr;
805   i->Ain.DACAS.sz   = sz;
806   vassert(sz == 8 || sz == 4);
807   return i;
808}
809
810AMD64Instr* AMD64Instr_A87Free ( Int nregs )
811{
812   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
813   i->tag               = Ain_A87Free;
814   i->Ain.A87Free.nregs = nregs;
815   vassert(nregs >= 1 && nregs <= 7);
816   return i;
817}
818AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
819{
820   AMD64Instr* i            = LibVEX_Alloc(sizeof(AMD64Instr));
821   i->tag                   = Ain_A87PushPop;
822   i->Ain.A87PushPop.addr   = addr;
823   i->Ain.A87PushPop.isPush = isPush;
824   i->Ain.A87PushPop.szB    = szB;
825   vassert(szB == 8 || szB == 4);
826   return i;
827}
828AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
829{
830   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
831   i->tag            = Ain_A87FpOp;
832   i->Ain.A87FpOp.op = op;
833   return i;
834}
835AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
836{
837   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
838   i->tag              = Ain_A87LdCW;
839   i->Ain.A87LdCW.addr = addr;
840   return i;
841}
842AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
843{
844   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
845   i->tag              = Ain_A87StSW;
846   i->Ain.A87StSW.addr = addr;
847   return i;
848}
849AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
850   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
851   i->tag                = Ain_LdMXCSR;
852   i->Ain.LdMXCSR.addr   = addr;
853   return i;
854}
855AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
856   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
857   i->tag                = Ain_SseUComIS;
858   i->Ain.SseUComIS.sz   = toUChar(sz);
859   i->Ain.SseUComIS.srcL = srcL;
860   i->Ain.SseUComIS.srcR = srcR;
861   i->Ain.SseUComIS.dst  = dst;
862   vassert(sz == 4 || sz == 8);
863   return i;
864}
865AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
866   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
867   i->tag              = Ain_SseSI2SF;
868   i->Ain.SseSI2SF.szS = toUChar(szS);
869   i->Ain.SseSI2SF.szD = toUChar(szD);
870   i->Ain.SseSI2SF.src = src;
871   i->Ain.SseSI2SF.dst = dst;
872   vassert(szS == 4 || szS == 8);
873   vassert(szD == 4 || szD == 8);
874   return i;
875}
876AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
877   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
878   i->tag              = Ain_SseSF2SI;
879   i->Ain.SseSF2SI.szS = toUChar(szS);
880   i->Ain.SseSF2SI.szD = toUChar(szD);
881   i->Ain.SseSF2SI.src = src;
882   i->Ain.SseSF2SI.dst = dst;
883   vassert(szS == 4 || szS == 8);
884   vassert(szD == 4 || szD == 8);
885   return i;
886}
887AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
888{
889   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
890   i->tag                = Ain_SseSDSS;
891   i->Ain.SseSDSS.from64 = from64;
892   i->Ain.SseSDSS.src    = src;
893   i->Ain.SseSDSS.dst    = dst;
894   return i;
895}
896AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
897                                 HReg reg, AMD64AMode* addr ) {
898   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
899   i->tag                = Ain_SseLdSt;
900   i->Ain.SseLdSt.isLoad = isLoad;
901   i->Ain.SseLdSt.sz     = toUChar(sz);
902   i->Ain.SseLdSt.reg    = reg;
903   i->Ain.SseLdSt.addr   = addr;
904   vassert(sz == 4 || sz == 8 || sz == 16);
905   return i;
906}
907AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
908{
909   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
910   i->tag                = Ain_SseLdzLO;
911   i->Ain.SseLdzLO.sz    = sz;
912   i->Ain.SseLdzLO.reg   = reg;
913   i->Ain.SseLdzLO.addr  = addr;
914   vassert(sz == 4 || sz == 8);
915   return i;
916}
917AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
918   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
919   i->tag              = Ain_Sse32Fx4;
920   i->Ain.Sse32Fx4.op  = op;
921   i->Ain.Sse32Fx4.src = src;
922   i->Ain.Sse32Fx4.dst = dst;
923   vassert(op != Asse_MOV);
924   return i;
925}
926AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
927   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
928   i->tag              = Ain_Sse32FLo;
929   i->Ain.Sse32FLo.op  = op;
930   i->Ain.Sse32FLo.src = src;
931   i->Ain.Sse32FLo.dst = dst;
932   vassert(op != Asse_MOV);
933   return i;
934}
935AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
936   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
937   i->tag              = Ain_Sse64Fx2;
938   i->Ain.Sse64Fx2.op  = op;
939   i->Ain.Sse64Fx2.src = src;
940   i->Ain.Sse64Fx2.dst = dst;
941   vassert(op != Asse_MOV);
942   return i;
943}
944AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
945   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
946   i->tag              = Ain_Sse64FLo;
947   i->Ain.Sse64FLo.op  = op;
948   i->Ain.Sse64FLo.src = src;
949   i->Ain.Sse64FLo.dst = dst;
950   vassert(op != Asse_MOV);
951   return i;
952}
953AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
954   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
955   i->tag             = Ain_SseReRg;
956   i->Ain.SseReRg.op  = op;
957   i->Ain.SseReRg.src = re;
958   i->Ain.SseReRg.dst = rg;
959   return i;
960}
961AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
962   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
963   i->tag              = Ain_SseCMov;
964   i->Ain.SseCMov.cond = cond;
965   i->Ain.SseCMov.src  = src;
966   i->Ain.SseCMov.dst  = dst;
967   vassert(cond != Acc_ALWAYS);
968   return i;
969}
970AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
971   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
972   i->tag               = Ain_SseShuf;
973   i->Ain.SseShuf.order = order;
974   i->Ain.SseShuf.src   = src;
975   i->Ain.SseShuf.dst   = dst;
976   vassert(order >= 0 && order <= 0xFF);
977   return i;
978}
979//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
980//uu                                  HReg reg, AMD64AMode* addr ) {
981//uu    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
982//uu    i->tag                = Ain_AvxLdSt;
983//uu    i->Ain.AvxLdSt.isLoad = isLoad;
984//uu    i->Ain.AvxLdSt.reg    = reg;
985//uu    i->Ain.AvxLdSt.addr   = addr;
986//uu    return i;
987//uu }
988//uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
989//uu    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
990//uu    i->tag             = Ain_AvxReRg;
991//uu    i->Ain.AvxReRg.op  = op;
992//uu    i->Ain.AvxReRg.src = re;
993//uu    i->Ain.AvxReRg.dst = rg;
994//uu    return i;
995//uu }
996AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
997                                 AMD64AMode* amFailAddr ) {
998   AMD64Instr* i             = LibVEX_Alloc(sizeof(AMD64Instr));
999   i->tag                    = Ain_EvCheck;
1000   i->Ain.EvCheck.amCounter  = amCounter;
1001   i->Ain.EvCheck.amFailAddr = amFailAddr;
1002   return i;
1003}
1004AMD64Instr* AMD64Instr_ProfInc ( void ) {
1005   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
1006   i->tag        = Ain_ProfInc;
1007   return i;
1008}
1009
1010void ppAMD64Instr ( AMD64Instr* i, Bool mode64 )
1011{
1012   vassert(mode64 == True);
1013   switch (i->tag) {
1014      case Ain_Imm64:
1015         vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
1016         ppHRegAMD64(i->Ain.Imm64.dst);
1017         return;
1018      case Ain_Alu64R:
1019         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
1020         ppAMD64RMI(i->Ain.Alu64R.src);
1021         vex_printf(",");
1022         ppHRegAMD64(i->Ain.Alu64R.dst);
1023         return;
1024      case Ain_Alu64M:
1025         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
1026         ppAMD64RI(i->Ain.Alu64M.src);
1027         vex_printf(",");
1028         ppAMD64AMode(i->Ain.Alu64M.dst);
1029         return;
1030      case Ain_Sh64:
1031         vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
1032         if (i->Ain.Sh64.src == 0)
1033            vex_printf("%%cl,");
1034         else
1035            vex_printf("$%d,", (Int)i->Ain.Sh64.src);
1036         ppHRegAMD64(i->Ain.Sh64.dst);
1037         return;
1038      case Ain_Test64:
1039         vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
1040         ppHRegAMD64(i->Ain.Test64.dst);
1041         return;
1042      case Ain_Unary64:
1043         vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
1044         ppHRegAMD64(i->Ain.Unary64.dst);
1045         return;
1046      case Ain_Lea64:
1047         vex_printf("leaq ");
1048         ppAMD64AMode(i->Ain.Lea64.am);
1049         vex_printf(",");
1050         ppHRegAMD64(i->Ain.Lea64.dst);
1051         return;
1052      case Ain_Alu32R:
1053         vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
1054         ppAMD64RMI_lo32(i->Ain.Alu32R.src);
1055         vex_printf(",");
1056         ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
1057         return;
1058      case Ain_MulL:
1059         vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
1060         ppAMD64RM(i->Ain.MulL.src);
1061         return;
1062      case Ain_Div:
1063         vex_printf("%cdiv%s ",
1064                    i->Ain.Div.syned ? 's' : 'u',
1065                    showAMD64ScalarSz(i->Ain.Div.sz));
1066         ppAMD64RM(i->Ain.Div.src);
1067         return;
1068      case Ain_Push:
1069         vex_printf("pushq ");
1070         ppAMD64RMI(i->Ain.Push.src);
1071         return;
1072      case Ain_Call:
1073         vex_printf("call%s[%d] ",
1074                    i->Ain.Call.cond==Acc_ALWAYS
1075                       ? "" : showAMD64CondCode(i->Ain.Call.cond),
1076                    i->Ain.Call.regparms );
1077         vex_printf("0x%llx", i->Ain.Call.target);
1078         break;
1079
1080      case Ain_XDirect:
1081         vex_printf("(xDirect) ");
1082         vex_printf("if (%%rflags.%s) { ",
1083                    showAMD64CondCode(i->Ain.XDirect.cond));
1084         vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
1085         vex_printf("movq %%r11,");
1086         ppAMD64AMode(i->Ain.XDirect.amRIP);
1087         vex_printf("; ");
1088         vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
1089                    i->Ain.XDirect.toFastEP ? "fast" : "slow");
1090         return;
1091      case Ain_XIndir:
1092         vex_printf("(xIndir) ");
1093         vex_printf("if (%%rflags.%s) { ",
1094                    showAMD64CondCode(i->Ain.XIndir.cond));
1095         vex_printf("movq ");
1096         ppHRegAMD64(i->Ain.XIndir.dstGA);
1097         vex_printf(",");
1098         ppAMD64AMode(i->Ain.XIndir.amRIP);
1099         vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
1100         return;
1101      case Ain_XAssisted:
1102         vex_printf("(xAssisted) ");
1103         vex_printf("if (%%rflags.%s) { ",
1104                    showAMD64CondCode(i->Ain.XAssisted.cond));
1105         vex_printf("movq ");
1106         ppHRegAMD64(i->Ain.XAssisted.dstGA);
1107         vex_printf(",");
1108         ppAMD64AMode(i->Ain.XAssisted.amRIP);
1109         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
1110                    (Int)i->Ain.XAssisted.jk);
1111         vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
1112         return;
1113
1114      case Ain_CMov64:
1115         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
1116         ppAMD64RM(i->Ain.CMov64.src);
1117         vex_printf(",");
1118         ppHRegAMD64(i->Ain.CMov64.dst);
1119         return;
1120      case Ain_MovxLQ:
1121         vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
1122         ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
1123         vex_printf(",");
1124         ppHRegAMD64(i->Ain.MovxLQ.dst);
1125         return;
1126      case Ain_LoadEX:
1127         if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
1128            vex_printf("movl ");
1129            ppAMD64AMode(i->Ain.LoadEX.src);
1130            vex_printf(",");
1131            ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
1132         } else {
1133            vex_printf("mov%c%cq ",
1134                       i->Ain.LoadEX.syned ? 's' : 'z',
1135                       i->Ain.LoadEX.szSmall==1
1136                          ? 'b'
1137                          : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
1138            ppAMD64AMode(i->Ain.LoadEX.src);
1139            vex_printf(",");
1140            ppHRegAMD64(i->Ain.LoadEX.dst);
1141         }
1142         return;
1143      case Ain_Store:
1144         vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
1145                              : (i->Ain.Store.sz==2 ? 'w' : 'l'));
1146         ppHRegAMD64(i->Ain.Store.src);
1147         vex_printf(",");
1148         ppAMD64AMode(i->Ain.Store.dst);
1149         return;
1150      case Ain_Set64:
1151         vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
1152         ppHRegAMD64(i->Ain.Set64.dst);
1153         return;
1154      case Ain_Bsfr64:
1155         vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
1156         ppHRegAMD64(i->Ain.Bsfr64.src);
1157         vex_printf(",");
1158         ppHRegAMD64(i->Ain.Bsfr64.dst);
1159         return;
1160      case Ain_MFence:
1161         vex_printf("mfence" );
1162         return;
1163      case Ain_ACAS:
1164         vex_printf("lock cmpxchg%c ",
1165                     i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
1166                     : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
1167         vex_printf("{%%rax->%%rbx},");
1168         ppAMD64AMode(i->Ain.ACAS.addr);
1169         return;
1170      case Ain_DACAS:
1171         vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
1172                    (Int)(2 * i->Ain.DACAS.sz));
1173         ppAMD64AMode(i->Ain.DACAS.addr);
1174         return;
1175      case Ain_A87Free:
1176         vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
1177         break;
1178      case Ain_A87PushPop:
1179         vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
1180                    i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
1181         ppAMD64AMode(i->Ain.A87PushPop.addr);
1182         break;
1183      case Ain_A87FpOp:
1184         vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
1185         break;
1186      case Ain_A87LdCW:
1187         vex_printf("fldcw ");
1188         ppAMD64AMode(i->Ain.A87LdCW.addr);
1189         break;
1190      case Ain_A87StSW:
1191         vex_printf("fstsw ");
1192         ppAMD64AMode(i->Ain.A87StSW.addr);
1193         break;
1194      case Ain_LdMXCSR:
1195         vex_printf("ldmxcsr ");
1196         ppAMD64AMode(i->Ain.LdMXCSR.addr);
1197         break;
1198      case Ain_SseUComIS:
1199         vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
1200         ppHRegAMD64(i->Ain.SseUComIS.srcL);
1201         vex_printf(",");
1202         ppHRegAMD64(i->Ain.SseUComIS.srcR);
1203         vex_printf(" ; pushfq ; popq ");
1204         ppHRegAMD64(i->Ain.SseUComIS.dst);
1205         break;
1206      case Ain_SseSI2SF:
1207         vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
1208         (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1209            (i->Ain.SseSI2SF.src);
1210         vex_printf(",");
1211         ppHRegAMD64(i->Ain.SseSI2SF.dst);
1212         break;
1213      case Ain_SseSF2SI:
1214         vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
1215         ppHRegAMD64(i->Ain.SseSF2SI.src);
1216         vex_printf(",");
1217         (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1218            (i->Ain.SseSF2SI.dst);
1219         break;
1220      case Ain_SseSDSS:
1221         vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
1222         ppHRegAMD64(i->Ain.SseSDSS.src);
1223         vex_printf(",");
1224         ppHRegAMD64(i->Ain.SseSDSS.dst);
1225         break;
1226      case Ain_SseLdSt:
1227         switch (i->Ain.SseLdSt.sz) {
1228            case 4:  vex_printf("movss "); break;
1229            case 8:  vex_printf("movsd "); break;
1230            case 16: vex_printf("movups "); break;
1231            default: vassert(0);
1232         }
1233         if (i->Ain.SseLdSt.isLoad) {
1234            ppAMD64AMode(i->Ain.SseLdSt.addr);
1235            vex_printf(",");
1236            ppHRegAMD64(i->Ain.SseLdSt.reg);
1237         } else {
1238            ppHRegAMD64(i->Ain.SseLdSt.reg);
1239            vex_printf(",");
1240            ppAMD64AMode(i->Ain.SseLdSt.addr);
1241         }
1242         return;
1243      case Ain_SseLdzLO:
1244         vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
1245         ppAMD64AMode(i->Ain.SseLdzLO.addr);
1246         vex_printf(",");
1247         ppHRegAMD64(i->Ain.SseLdzLO.reg);
1248         return;
1249      case Ain_Sse32Fx4:
1250         vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
1251         ppHRegAMD64(i->Ain.Sse32Fx4.src);
1252         vex_printf(",");
1253         ppHRegAMD64(i->Ain.Sse32Fx4.dst);
1254         return;
1255      case Ain_Sse32FLo:
1256         vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
1257         ppHRegAMD64(i->Ain.Sse32FLo.src);
1258         vex_printf(",");
1259         ppHRegAMD64(i->Ain.Sse32FLo.dst);
1260         return;
1261      case Ain_Sse64Fx2:
1262         vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
1263         ppHRegAMD64(i->Ain.Sse64Fx2.src);
1264         vex_printf(",");
1265         ppHRegAMD64(i->Ain.Sse64Fx2.dst);
1266         return;
1267      case Ain_Sse64FLo:
1268         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
1269         ppHRegAMD64(i->Ain.Sse64FLo.src);
1270         vex_printf(",");
1271         ppHRegAMD64(i->Ain.Sse64FLo.dst);
1272         return;
1273      case Ain_SseReRg:
1274         vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1275         ppHRegAMD64(i->Ain.SseReRg.src);
1276         vex_printf(",");
1277         ppHRegAMD64(i->Ain.SseReRg.dst);
1278         return;
1279      case Ain_SseCMov:
1280         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
1281         ppHRegAMD64(i->Ain.SseCMov.src);
1282         vex_printf(",");
1283         ppHRegAMD64(i->Ain.SseCMov.dst);
1284         return;
1285      case Ain_SseShuf:
1286         vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
1287         ppHRegAMD64(i->Ain.SseShuf.src);
1288         vex_printf(",");
1289         ppHRegAMD64(i->Ain.SseShuf.dst);
1290         return;
1291      //uu case Ain_AvxLdSt:
1292      //uu    vex_printf("vmovups ");
1293      //uu    if (i->Ain.AvxLdSt.isLoad) {
1294      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1295      //uu       vex_printf(",");
1296      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1297      //uu    } else {
1298      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1299      //uu       vex_printf(",");
1300      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1301      //uu    }
1302      //uu    return;
1303      //uu case Ain_AvxReRg:
1304      //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1305      //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
1306      //uu    vex_printf(",");
1307      //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
1308      //uu    return;
1309      case Ain_EvCheck:
1310         vex_printf("(evCheck) decl ");
1311         ppAMD64AMode(i->Ain.EvCheck.amCounter);
1312         vex_printf("; jns nofail; jmp *");
1313         ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
1314         vex_printf("; nofail:");
1315         return;
1316      case Ain_ProfInc:
1317         vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
1318         return;
1319      default:
1320         vpanic("ppAMD64Instr");
1321   }
1322}
1323
1324/* --------- Helpers for register allocation. --------- */
1325
1326void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 )
1327{
1328   Bool unary;
1329   vassert(mode64 == True);
1330   initHRegUsage(u);
1331   switch (i->tag) {
1332      case Ain_Imm64:
1333         addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
1334         return;
1335      case Ain_Alu64R:
1336         addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
1337         if (i->Ain.Alu64R.op == Aalu_MOV) {
1338            addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
1339            return;
1340         }
1341         if (i->Ain.Alu64R.op == Aalu_CMP) {
1342            addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
1343            return;
1344         }
1345         addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
1346         return;
1347      case Ain_Alu64M:
1348         addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
1349         addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
1350         return;
1351      case Ain_Sh64:
1352         addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
1353         if (i->Ain.Sh64.src == 0)
1354            addHRegUse(u, HRmRead, hregAMD64_RCX());
1355         return;
1356      case Ain_Test64:
1357         addHRegUse(u, HRmRead, i->Ain.Test64.dst);
1358         return;
1359      case Ain_Unary64:
1360         addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
1361         return;
1362      case Ain_Lea64:
1363         addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
1364         addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
1365         return;
1366      case Ain_Alu32R:
1367         vassert(i->Ain.Alu32R.op != Aalu_MOV);
1368         addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
1369         if (i->Ain.Alu32R.op == Aalu_CMP) {
1370            addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
1371            return;
1372         }
1373         addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
1374         return;
1375      case Ain_MulL:
1376         addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
1377         addHRegUse(u, HRmModify, hregAMD64_RAX());
1378         addHRegUse(u, HRmWrite, hregAMD64_RDX());
1379         return;
1380      case Ain_Div:
1381         addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
1382         addHRegUse(u, HRmModify, hregAMD64_RAX());
1383         addHRegUse(u, HRmModify, hregAMD64_RDX());
1384         return;
1385      case Ain_Push:
1386         addRegUsage_AMD64RMI(u, i->Ain.Push.src);
1387         addHRegUse(u, HRmModify, hregAMD64_RSP());
1388         return;
1389      case Ain_Call:
1390         /* This is a bit subtle. */
1391         /* First off, claim it trashes all the caller-saved regs
1392            which fall within the register allocator's jurisdiction.
1393            These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
1394            and all the xmm registers.
1395         */
1396         addHRegUse(u, HRmWrite, hregAMD64_RAX());
1397         addHRegUse(u, HRmWrite, hregAMD64_RCX());
1398         addHRegUse(u, HRmWrite, hregAMD64_RDX());
1399         addHRegUse(u, HRmWrite, hregAMD64_RSI());
1400         addHRegUse(u, HRmWrite, hregAMD64_RDI());
1401         addHRegUse(u, HRmWrite, hregAMD64_R8());
1402         addHRegUse(u, HRmWrite, hregAMD64_R9());
1403         addHRegUse(u, HRmWrite, hregAMD64_R10());
1404         addHRegUse(u, HRmWrite, hregAMD64_R11());
1405         addHRegUse(u, HRmWrite, hregAMD64_XMM0());
1406         addHRegUse(u, HRmWrite, hregAMD64_XMM1());
1407         addHRegUse(u, HRmWrite, hregAMD64_XMM3());
1408         addHRegUse(u, HRmWrite, hregAMD64_XMM4());
1409         addHRegUse(u, HRmWrite, hregAMD64_XMM5());
1410         addHRegUse(u, HRmWrite, hregAMD64_XMM6());
1411         addHRegUse(u, HRmWrite, hregAMD64_XMM7());
1412         addHRegUse(u, HRmWrite, hregAMD64_XMM8());
1413         addHRegUse(u, HRmWrite, hregAMD64_XMM9());
1414         addHRegUse(u, HRmWrite, hregAMD64_XMM10());
1415         addHRegUse(u, HRmWrite, hregAMD64_XMM11());
1416         addHRegUse(u, HRmWrite, hregAMD64_XMM12());
1417
1418         /* Now we have to state any parameter-carrying registers
1419            which might be read.  This depends on the regparmness. */
1420         switch (i->Ain.Call.regparms) {
1421            case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
1422            case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
1423            case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
1424            case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
1425            case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
1426            case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
1427            case 0: break;
1428            default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
1429         }
1430         /* Finally, there is the issue that the insn trashes a
1431            register because the literal target address has to be
1432            loaded into a register.  Fortunately, r11 is stated in the
1433            ABI as a scratch register, and so seems a suitable victim.  */
1434         addHRegUse(u, HRmWrite, hregAMD64_R11());
1435         /* Upshot of this is that the assembler really must use r11,
1436            and no other, as a destination temporary. */
1437         return;
1438      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
1439         conditionally exit the block.  Hence we only need to list (1)
1440         the registers that they read, and (2) the registers that they
1441         write in the case where the block is not exited.  (2) is
1442         empty, hence only (1) is relevant here. */
1443      case Ain_XDirect:
1444         /* Don't bother to mention the write to %r11, since it is not
1445            available to the allocator. */
1446         addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
1447         return;
1448      case Ain_XIndir:
1449         /* Ditto re %r11 */
1450         addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
1451         addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
1452         return;
1453      case Ain_XAssisted:
1454         /* Ditto re %r11 and %rbp (the baseblock ptr) */
1455         addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
1456         addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
1457         return;
1458      case Ain_CMov64:
1459         addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
1460         addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
1461         return;
1462      case Ain_MovxLQ:
1463         addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
1464         addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
1465         return;
1466      case Ain_LoadEX:
1467         addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
1468         addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
1469         return;
1470      case Ain_Store:
1471         addHRegUse(u, HRmRead, i->Ain.Store.src);
1472         addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
1473         return;
1474      case Ain_Set64:
1475         addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
1476         return;
1477      case Ain_Bsfr64:
1478         addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
1479         addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
1480         return;
1481      case Ain_MFence:
1482         return;
1483      case Ain_ACAS:
1484         addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
1485         addHRegUse(u, HRmRead, hregAMD64_RBX());
1486         addHRegUse(u, HRmModify, hregAMD64_RAX());
1487         return;
1488      case Ain_DACAS:
1489         addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
1490         addHRegUse(u, HRmRead, hregAMD64_RCX());
1491         addHRegUse(u, HRmRead, hregAMD64_RBX());
1492         addHRegUse(u, HRmModify, hregAMD64_RDX());
1493         addHRegUse(u, HRmModify, hregAMD64_RAX());
1494         return;
1495      case Ain_A87Free:
1496         return;
1497      case Ain_A87PushPop:
1498         addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
1499         return;
1500      case Ain_A87FpOp:
1501         return;
1502      case Ain_A87LdCW:
1503         addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
1504         return;
1505      case Ain_A87StSW:
1506         addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
1507         return;
1508      case Ain_LdMXCSR:
1509         addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
1510         return;
1511      case Ain_SseUComIS:
1512         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
1513         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
1514         addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
1515         return;
1516      case Ain_SseSI2SF:
1517         addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
1518         addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
1519         return;
1520      case Ain_SseSF2SI:
1521         addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
1522         addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
1523         return;
1524      case Ain_SseSDSS:
1525         addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
1526         addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
1527         return;
1528      case Ain_SseLdSt:
1529         addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
1530         addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
1531                       i->Ain.SseLdSt.reg);
1532         return;
1533      case Ain_SseLdzLO:
1534         addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1535         addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1536         return;
1537      case Ain_Sse32Fx4:
1538         vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1539         unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1540                         || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1541                         || i->Ain.Sse32Fx4.op == Asse_SQRTF );
1542         addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1543         addHRegUse(u, unary ? HRmWrite : HRmModify,
1544                       i->Ain.Sse32Fx4.dst);
1545         return;
1546      case Ain_Sse32FLo:
1547         vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1548         unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1549                         || i->Ain.Sse32FLo.op == Asse_RSQRTF
1550                         || i->Ain.Sse32FLo.op == Asse_SQRTF );
1551         addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1552         addHRegUse(u, unary ? HRmWrite : HRmModify,
1553                       i->Ain.Sse32FLo.dst);
1554         return;
1555      case Ain_Sse64Fx2:
1556         vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1557         unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1558                         || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1559                         || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1560         addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1561         addHRegUse(u, unary ? HRmWrite : HRmModify,
1562                       i->Ain.Sse64Fx2.dst);
1563         return;
1564      case Ain_Sse64FLo:
1565         vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1566         unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1567                         || i->Ain.Sse64FLo.op == Asse_RSQRTF
1568                         || i->Ain.Sse64FLo.op == Asse_SQRTF );
1569         addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1570         addHRegUse(u, unary ? HRmWrite : HRmModify,
1571                       i->Ain.Sse64FLo.dst);
1572         return;
1573      case Ain_SseReRg:
1574         if ( (i->Ain.SseReRg.op == Asse_XOR
1575               || i->Ain.SseReRg.op == Asse_CMPEQ32)
1576              && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
1577            /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1578               r,r' as a write of a value to r, and independent of any
1579               previous value in r */
1580            /* (as opposed to a rite of passage :-) */
1581            addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1582         } else {
1583            addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1584            addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1585                             ? HRmWrite : HRmModify,
1586                          i->Ain.SseReRg.dst);
1587         }
1588         return;
1589      case Ain_SseCMov:
1590         addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
1591         addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1592         return;
1593      case Ain_SseShuf:
1594         addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
1595         addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1596         return;
1597      //uu case Ain_AvxLdSt:
1598      //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1599      //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1600      //uu               i->Ain.AvxLdSt.reg);
1601      //uu return;
1602      //uu case Ain_AvxReRg:
1603      //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
1604      //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1605      //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1606      //uu       /* See comments on the case for Ain_SseReRg. */
1607      //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1608      //uu    } else {
1609      //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1610      //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1611      //uu                        ? HRmWrite : HRmModify,
1612      //uu                     i->Ain.AvxReRg.dst);
1613      //uu    }
1614      //uu    return;
1615      case Ain_EvCheck:
1616         /* We expect both amodes only to mention %rbp, so this is in
1617            fact pointless, since %rbp isn't allocatable, but anyway.. */
1618         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1619         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1620         return;
1621      case Ain_ProfInc:
1622         addHRegUse(u, HRmWrite, hregAMD64_R11());
1623         return;
1624      default:
1625         ppAMD64Instr(i, mode64);
1626         vpanic("getRegUsage_AMD64Instr");
1627   }
1628}
1629
1630/* local helper */
1631static inline void mapReg(HRegRemap* m, HReg* r)
1632{
1633   *r = lookupHRegRemap(m, *r);
1634}
1635
1636void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1637{
1638   vassert(mode64 == True);
1639   switch (i->tag) {
1640      case Ain_Imm64:
1641         mapReg(m, &i->Ain.Imm64.dst);
1642         return;
1643      case Ain_Alu64R:
1644         mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1645         mapReg(m, &i->Ain.Alu64R.dst);
1646         return;
1647      case Ain_Alu64M:
1648         mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1649         mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1650         return;
1651      case Ain_Sh64:
1652         mapReg(m, &i->Ain.Sh64.dst);
1653         return;
1654      case Ain_Test64:
1655         mapReg(m, &i->Ain.Test64.dst);
1656         return;
1657      case Ain_Unary64:
1658         mapReg(m, &i->Ain.Unary64.dst);
1659         return;
1660      case Ain_Lea64:
1661         mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1662         mapReg(m, &i->Ain.Lea64.dst);
1663         return;
1664      case Ain_Alu32R:
1665         mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1666         mapReg(m, &i->Ain.Alu32R.dst);
1667         return;
1668      case Ain_MulL:
1669         mapRegs_AMD64RM(m, i->Ain.MulL.src);
1670         return;
1671      case Ain_Div:
1672         mapRegs_AMD64RM(m, i->Ain.Div.src);
1673         return;
1674      case Ain_Push:
1675         mapRegs_AMD64RMI(m, i->Ain.Push.src);
1676         return;
1677      case Ain_Call:
1678         return;
1679      case Ain_XDirect:
1680         mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1681         return;
1682      case Ain_XIndir:
1683         mapReg(m, &i->Ain.XIndir.dstGA);
1684         mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1685         return;
1686      case Ain_XAssisted:
1687         mapReg(m, &i->Ain.XAssisted.dstGA);
1688         mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1689         return;
1690      case Ain_CMov64:
1691         mapRegs_AMD64RM(m, i->Ain.CMov64.src);
1692         mapReg(m, &i->Ain.CMov64.dst);
1693         return;
1694      case Ain_MovxLQ:
1695         mapReg(m, &i->Ain.MovxLQ.src);
1696         mapReg(m, &i->Ain.MovxLQ.dst);
1697         return;
1698      case Ain_LoadEX:
1699         mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1700         mapReg(m, &i->Ain.LoadEX.dst);
1701         return;
1702      case Ain_Store:
1703         mapReg(m, &i->Ain.Store.src);
1704         mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1705         return;
1706      case Ain_Set64:
1707         mapReg(m, &i->Ain.Set64.dst);
1708         return;
1709      case Ain_Bsfr64:
1710         mapReg(m, &i->Ain.Bsfr64.src);
1711         mapReg(m, &i->Ain.Bsfr64.dst);
1712         return;
1713      case Ain_MFence:
1714         return;
1715      case Ain_ACAS:
1716         mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1717         return;
1718      case Ain_DACAS:
1719         mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1720         return;
1721      case Ain_A87Free:
1722         return;
1723      case Ain_A87PushPop:
1724         mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1725         return;
1726      case Ain_A87FpOp:
1727         return;
1728      case Ain_A87LdCW:
1729         mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1730         return;
1731      case Ain_A87StSW:
1732         mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1733         return;
1734      case Ain_LdMXCSR:
1735         mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1736         return;
1737      case Ain_SseUComIS:
1738         mapReg(m, &i->Ain.SseUComIS.srcL);
1739         mapReg(m, &i->Ain.SseUComIS.srcR);
1740         mapReg(m, &i->Ain.SseUComIS.dst);
1741         return;
1742      case Ain_SseSI2SF:
1743         mapReg(m, &i->Ain.SseSI2SF.src);
1744         mapReg(m, &i->Ain.SseSI2SF.dst);
1745         return;
1746      case Ain_SseSF2SI:
1747         mapReg(m, &i->Ain.SseSF2SI.src);
1748         mapReg(m, &i->Ain.SseSF2SI.dst);
1749         return;
1750      case Ain_SseSDSS:
1751         mapReg(m, &i->Ain.SseSDSS.src);
1752         mapReg(m, &i->Ain.SseSDSS.dst);
1753         return;
1754      case Ain_SseLdSt:
1755         mapReg(m, &i->Ain.SseLdSt.reg);
1756         mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1757         break;
1758      case Ain_SseLdzLO:
1759         mapReg(m, &i->Ain.SseLdzLO.reg);
1760         mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
1761         break;
1762      case Ain_Sse32Fx4:
1763         mapReg(m, &i->Ain.Sse32Fx4.src);
1764         mapReg(m, &i->Ain.Sse32Fx4.dst);
1765         return;
1766      case Ain_Sse32FLo:
1767         mapReg(m, &i->Ain.Sse32FLo.src);
1768         mapReg(m, &i->Ain.Sse32FLo.dst);
1769         return;
1770      case Ain_Sse64Fx2:
1771         mapReg(m, &i->Ain.Sse64Fx2.src);
1772         mapReg(m, &i->Ain.Sse64Fx2.dst);
1773         return;
1774      case Ain_Sse64FLo:
1775         mapReg(m, &i->Ain.Sse64FLo.src);
1776         mapReg(m, &i->Ain.Sse64FLo.dst);
1777         return;
1778      case Ain_SseReRg:
1779         mapReg(m, &i->Ain.SseReRg.src);
1780         mapReg(m, &i->Ain.SseReRg.dst);
1781         return;
1782      case Ain_SseCMov:
1783         mapReg(m, &i->Ain.SseCMov.src);
1784         mapReg(m, &i->Ain.SseCMov.dst);
1785         return;
1786      case Ain_SseShuf:
1787         mapReg(m, &i->Ain.SseShuf.src);
1788         mapReg(m, &i->Ain.SseShuf.dst);
1789         return;
1790      //uu case Ain_AvxLdSt:
1791      //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
1792      //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1793      //uu    break;
1794      //uu case Ain_AvxReRg:
1795      //uu    mapReg(m, &i->Ain.AvxReRg.src);
1796      //uu    mapReg(m, &i->Ain.AvxReRg.dst);
1797      //uu    return;
1798      case Ain_EvCheck:
1799         /* We expect both amodes only to mention %rbp, so this is in
1800            fact pointless, since %rbp isn't allocatable, but anyway.. */
1801         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
1802         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
1803         return;
1804      case Ain_ProfInc:
1805         /* hardwires r11 -- nothing to modify. */
1806         return;
1807      default:
1808         ppAMD64Instr(i, mode64);
1809         vpanic("mapRegs_AMD64Instr");
1810   }
1811}
1812
1813/* Figure out if i represents a reg-reg move, and if so assign the
1814   source and destination to *src and *dst.  If in doubt say No.  Used
1815   by the register allocator to do move coalescing.
1816*/
1817Bool isMove_AMD64Instr ( AMD64Instr* i, HReg* src, HReg* dst )
1818{
1819   switch (i->tag) {
1820      case Ain_Alu64R:
1821         /* Moves between integer regs */
1822         if (i->Ain.Alu64R.op != Aalu_MOV)
1823            return False;
1824         if (i->Ain.Alu64R.src->tag != Armi_Reg)
1825            return False;
1826         *src = i->Ain.Alu64R.src->Armi.Reg.reg;
1827         *dst = i->Ain.Alu64R.dst;
1828         return True;
1829      case Ain_SseReRg:
1830         /* Moves between SSE regs */
1831         if (i->Ain.SseReRg.op != Asse_MOV)
1832            return False;
1833         *src = i->Ain.SseReRg.src;
1834         *dst = i->Ain.SseReRg.dst;
1835         return True;
1836      //uu case Ain_AvxReRg:
1837      //uu    /* Moves between AVX regs */
1838      //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
1839      //uu       return False;
1840      //uu    *src = i->Ain.AvxReRg.src;
1841      //uu    *dst = i->Ain.AvxReRg.dst;
1842      //uu    return True;
1843      default:
1844         return False;
1845   }
1846   /*NOTREACHED*/
1847}
1848
1849
1850/* Generate amd64 spill/reload instructions under the direction of the
1851   register allocator.  Note it's critical these don't write the
1852   condition codes. */
1853
1854void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1855                      HReg rreg, Int offsetB, Bool mode64 )
1856{
1857   AMD64AMode* am;
1858   vassert(offsetB >= 0);
1859   vassert(!hregIsVirtual(rreg));
1860   vassert(mode64 == True);
1861   *i1 = *i2 = NULL;
1862   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1863   switch (hregClass(rreg)) {
1864      case HRcInt64:
1865         *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
1866         return;
1867      case HRcVec128:
1868         *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
1869         return;
1870      default:
1871         ppHRegClass(hregClass(rreg));
1872         vpanic("genSpill_AMD64: unimplemented regclass");
1873   }
1874}
1875
1876void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1877                       HReg rreg, Int offsetB, Bool mode64 )
1878{
1879   AMD64AMode* am;
1880   vassert(offsetB >= 0);
1881   vassert(!hregIsVirtual(rreg));
1882   vassert(mode64 == True);
1883   *i1 = *i2 = NULL;
1884   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1885   switch (hregClass(rreg)) {
1886      case HRcInt64:
1887         *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
1888         return;
1889      case HRcVec128:
1890         *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
1891         return;
1892      default:
1893         ppHRegClass(hregClass(rreg));
1894         vpanic("genReload_AMD64: unimplemented regclass");
1895   }
1896}
1897
1898
1899/* --------- The amd64 assembler (bleh.) --------- */
1900
1901/* Produce the low three bits of an integer register number. */
1902static UChar iregBits210 ( HReg r )
1903{
1904   UInt n;
1905   vassert(hregClass(r) == HRcInt64);
1906   vassert(!hregIsVirtual(r));
1907   n = hregNumber(r);
1908   vassert(n <= 15);
1909   return toUChar(n & 7);
1910}
1911
1912/* Produce bit 3 of an integer register number. */
1913static UChar iregBit3 ( HReg r )
1914{
1915   UInt n;
1916   vassert(hregClass(r) == HRcInt64);
1917   vassert(!hregIsVirtual(r));
1918   n = hregNumber(r);
1919   vassert(n <= 15);
1920   return toUChar((n >> 3) & 1);
1921}
1922
1923/* Produce a complete 4-bit integer register number. */
1924static UChar iregBits3210 ( HReg r )
1925{
1926   UInt n;
1927   vassert(hregClass(r) == HRcInt64);
1928   vassert(!hregIsVirtual(r));
1929   n = hregNumber(r);
1930   vassert(n <= 15);
1931   return toUChar(n);
1932}
1933
1934/* Given an xmm (128bit V-class) register number, produce the
1935   equivalent numbered register in 64-bit I-class.  This is a bit of
1936   fakery which facilitates using functions that work on integer
1937   register numbers to be used when assembling SSE instructions
1938   too. */
1939static UInt vreg2ireg ( HReg r )
1940{
1941   UInt n;
1942   vassert(hregClass(r) == HRcVec128);
1943   vassert(!hregIsVirtual(r));
1944   n = hregNumber(r);
1945   vassert(n <= 15);
1946   return mkHReg(n, HRcInt64, False);
1947}
1948
1949//uu /* Ditto for ymm regs. */
1950//uu static UInt dvreg2ireg ( HReg r )
1951//uu {
1952//uu    UInt n;
1953//uu    vassert(hregClass(r) == HRcVec256);
1954//uu    vassert(!hregIsVirtual(r));
1955//uu    n = hregNumber(r);
1956//uu    vassert(n <= 15);
1957//uu    return mkHReg(n, HRcInt64, False);
1958//uu }
1959
1960static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
1961{
1962   return toUChar( ((mod & 3) << 6)
1963                   | ((reg & 7) << 3)
1964                   | (regmem & 7) );
1965}
1966
1967static UChar mkSIB ( Int shift, Int regindex, Int regbase )
1968{
1969   return toUChar( ((shift & 3) << 6)
1970                   | ((regindex & 7) << 3)
1971                   | (regbase & 7) );
1972}
1973
1974static UChar* emit32 ( UChar* p, UInt w32 )
1975{
1976   *p++ = toUChar((w32)       & 0x000000FF);
1977   *p++ = toUChar((w32 >>  8) & 0x000000FF);
1978   *p++ = toUChar((w32 >> 16) & 0x000000FF);
1979   *p++ = toUChar((w32 >> 24) & 0x000000FF);
1980   return p;
1981}
1982
1983static UChar* emit64 ( UChar* p, ULong w64 )
1984{
1985   p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
1986   p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
1987   return p;
1988}
1989
1990/* Does a sign-extend of the lowest 8 bits give
1991   the original number? */
1992static Bool fits8bits ( UInt w32 )
1993{
1994   Int i32 = (Int)w32;
1995   return toBool(i32 == ((i32 << 24) >> 24));
1996}
1997/* Can the lower 32 bits be signedly widened to produce the whole
1998   64-bit value?  In other words, are the top 33 bits either all 0 or
1999   all 1 ? */
2000static Bool fitsIn32Bits ( ULong x )
2001{
2002   Long y0 = (Long)x;
2003   Long y1 = y0;
2004   y1 <<= 32;
2005   y1 >>=/*s*/ 32;
2006   return toBool(x == y1);
2007}
2008
2009
2010/* Forming mod-reg-rm bytes and scale-index-base bytes.
2011
2012     greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
2013                       =  00 greg ereg
2014
2015     greg,  d8(ereg)   |  ereg is neither of: RSP R12
2016                       =  01 greg ereg, d8
2017
2018     greg,  d32(ereg)  |  ereg is neither of: RSP R12
2019                       =  10 greg ereg, d32
2020
2021     greg,  d8(ereg)   |  ereg is either: RSP R12
2022                       =  01 greg 100, 0x24, d8
2023                       (lowest bit of rex distinguishes R12/RSP)
2024
2025     greg,  d32(ereg)  |  ereg is either: RSP R12
2026                       =  10 greg 100, 0x24, d32
2027                       (lowest bit of rex distinguishes R12/RSP)
2028
2029     -----------------------------------------------
2030
2031     greg,  d8(base,index,scale)
2032               |  index != RSP
2033               =  01 greg 100, scale index base, d8
2034
2035     greg,  d32(base,index,scale)
2036               |  index != RSP
2037               =  10 greg 100, scale index base, d32
2038*/
2039static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2040{
2041   if (am->tag == Aam_IR) {
2042      if (am->Aam.IR.imm == 0
2043          && am->Aam.IR.reg != hregAMD64_RSP()
2044          && am->Aam.IR.reg != hregAMD64_RBP()
2045          && am->Aam.IR.reg != hregAMD64_R12()
2046          && am->Aam.IR.reg != hregAMD64_R13()
2047         ) {
2048         *p++ = mkModRegRM(0, iregBits210(greg),
2049                              iregBits210(am->Aam.IR.reg));
2050         return p;
2051      }
2052      if (fits8bits(am->Aam.IR.imm)
2053          && am->Aam.IR.reg != hregAMD64_RSP()
2054          && am->Aam.IR.reg != hregAMD64_R12()
2055         ) {
2056         *p++ = mkModRegRM(1, iregBits210(greg),
2057                              iregBits210(am->Aam.IR.reg));
2058         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2059         return p;
2060      }
2061      if (am->Aam.IR.reg != hregAMD64_RSP()
2062          && am->Aam.IR.reg != hregAMD64_R12()
2063         ) {
2064         *p++ = mkModRegRM(2, iregBits210(greg),
2065                              iregBits210(am->Aam.IR.reg));
2066         p = emit32(p, am->Aam.IR.imm);
2067         return p;
2068      }
2069      if ((am->Aam.IR.reg == hregAMD64_RSP()
2070           || am->Aam.IR.reg == hregAMD64_R12())
2071          && fits8bits(am->Aam.IR.imm)) {
2072 	 *p++ = mkModRegRM(1, iregBits210(greg), 4);
2073         *p++ = 0x24;
2074         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2075         return p;
2076      }
2077      if (/* (am->Aam.IR.reg == hregAMD64_RSP()
2078	     || wait for test case for RSP case */
2079          am->Aam.IR.reg == hregAMD64_R12()) {
2080 	 *p++ = mkModRegRM(2, iregBits210(greg), 4);
2081         *p++ = 0x24;
2082         p = emit32(p, am->Aam.IR.imm);
2083         return p;
2084      }
2085      ppAMD64AMode(am);
2086      vpanic("doAMode_M: can't emit amode IR");
2087      /*NOTREACHED*/
2088   }
2089   if (am->tag == Aam_IRRS) {
2090      if (fits8bits(am->Aam.IRRS.imm)
2091          && am->Aam.IRRS.index != hregAMD64_RSP()) {
2092         *p++ = mkModRegRM(1, iregBits210(greg), 4);
2093         *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
2094                                          am->Aam.IRRS.base);
2095         *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2096         return p;
2097      }
2098      if (am->Aam.IRRS.index != hregAMD64_RSP()) {
2099         *p++ = mkModRegRM(2, iregBits210(greg), 4);
2100         *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
2101                                          am->Aam.IRRS.base);
2102         p = emit32(p, am->Aam.IRRS.imm);
2103         return p;
2104      }
2105      ppAMD64AMode(am);
2106      vpanic("doAMode_M: can't emit amode IRRS");
2107      /*NOTREACHED*/
2108   }
2109   vpanic("doAMode_M: unknown amode");
2110   /*NOTREACHED*/
2111}
2112
2113
2114/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2115static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2116{
2117   *p++ = mkModRegRM(3, iregBits210(greg), iregBits210(ereg));
2118   return p;
2119}
2120
2121
2122/* Clear the W bit on a REX byte, thereby changing the operand size
2123   back to whatever that instruction's default operand size is. */
2124static inline UChar clearWBit ( UChar rex )
2125{
2126   return toUChar(rex & ~(1<<3));
2127}
2128
2129
2130/* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
2131static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2132{
2133   if (am->tag == Aam_IR) {
2134      UChar W = 1;  /* we want 64-bit mode */
2135      UChar R = iregBit3(greg);
2136      UChar X = 0; /* not relevant */
2137      UChar B = iregBit3(am->Aam.IR.reg);
2138      return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
2139   }
2140   if (am->tag == Aam_IRRS) {
2141      UChar W = 1;  /* we want 64-bit mode */
2142      UChar R = iregBit3(greg);
2143      UChar X = iregBit3(am->Aam.IRRS.index);
2144      UChar B = iregBit3(am->Aam.IRRS.base);
2145      return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
2146   }
2147   vassert(0);
2148   return 0; /*NOTREACHED*/
2149}
2150
2151/* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
2152static UChar rexAMode_R ( HReg greg, HReg ereg )
2153{
2154   UChar W = 1;  /* we want 64-bit mode */
2155   UChar R = iregBit3(greg);
2156   UChar X = 0; /* not relevant */
2157   UChar B = iregBit3(ereg);
2158   return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
2159}
2160
2161
2162//uu /* May 2012: this VEX prefix stuff is currently unused, but has
2163//uu    verified correct (I reckon).  Certainly it has been known to
2164//uu    produce correct VEX prefixes during testing. */
2165//uu
2166//uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
2167//uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
2168//uu    in verbatim.  There's no range checking on the bits. */
2169//uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2170//uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
2171//uu                             UInt L, UInt pp )
2172//uu {
2173//uu    UChar byte0 = 0;
2174//uu    UChar byte1 = 0;
2175//uu    UChar byte2 = 0;
2176//uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2177//uu       /* 2 byte encoding is possible. */
2178//uu       byte0 = 0xC5;
2179//uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2180//uu               | (L << 2) | pp;
2181//uu    } else {
2182//uu       /* 3 byte encoding is needed. */
2183//uu       byte0 = 0xC4;
2184//uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2185//uu               | ((rexB ^ 1) << 5) | mmmmm;
2186//uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2187//uu    }
2188//uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2189//uu }
2190//uu
2191//uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
2192//uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2193//uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
2194//uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2195//uu    vvvv=1111 (unused 3rd reg). */
2196//uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2197//uu {
2198//uu    UChar L       = 1; /* size = 256 */
2199//uu    UChar pp      = 0; /* no SIMD prefix */
2200//uu    UChar mmmmm   = 1; /* 0F */
2201//uu    UChar notVvvv = 0; /* unused */
2202//uu    UChar rexW    = 0;
2203//uu    UChar rexR    = 0;
2204//uu    UChar rexX    = 0;
2205//uu    UChar rexB    = 0;
2206//uu    /* Same logic as in rexAMode_M. */
2207//uu    if (am->tag == Aam_IR) {
2208//uu       rexR = iregBit3(greg);
2209//uu       rexX = 0; /* not relevant */
2210//uu       rexB = iregBit3(am->Aam.IR.reg);
2211//uu    }
2212//uu    else if (am->tag == Aam_IRRS) {
2213//uu       rexR = iregBit3(greg);
2214//uu       rexX = iregBit3(am->Aam.IRRS.index);
2215//uu       rexB = iregBit3(am->Aam.IRRS.base);
2216//uu    } else {
2217//uu       vassert(0);
2218//uu    }
2219//uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2220//uu }
2221//uu
2222//uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2223//uu {
2224//uu    switch (vex & 0xFF) {
2225//uu       case 0xC5:
2226//uu          *p++ = 0xC5;
2227//uu          *p++ = (vex >> 8) & 0xFF;
2228//uu          vassert(0 == (vex >> 16));
2229//uu          break;
2230//uu       case 0xC4:
2231//uu          *p++ = 0xC4;
2232//uu          *p++ = (vex >> 8) & 0xFF;
2233//uu          *p++ = (vex >> 16) & 0xFF;
2234//uu          vassert(0 == (vex >> 24));
2235//uu          break;
2236//uu       default:
2237//uu          vassert(0);
2238//uu    }
2239//uu    return p;
2240//uu }
2241
2242
2243/* Emit ffree %st(N) */
2244static UChar* do_ffree_st ( UChar* p, Int n )
2245{
2246   vassert(n >= 0 && n <= 7);
2247   *p++ = 0xDD;
2248   *p++ = toUChar(0xC0 + n);
2249   return p;
2250}
2251
2252/* Emit an instruction into buf and return the number of bytes used.
2253   Note that buf is not the insn's final place, and therefore it is
2254   imperative to emit position-independent code.  If the emitted
2255   instruction was a profiler inc, set *is_profInc to True, else
2256   leave it unchanged. */
2257
2258Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2259                      UChar* buf, Int nbuf, AMD64Instr* i,
2260                      Bool mode64,
2261                      void* disp_cp_chain_me_to_slowEP,
2262                      void* disp_cp_chain_me_to_fastEP,
2263                      void* disp_cp_xindir,
2264                      void* disp_cp_xassisted )
2265{
2266   UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2267   UInt   xtra;
2268   UInt   reg;
2269   UChar  rex;
2270   UChar* p = &buf[0];
2271   UChar* ptmp;
2272   Int    j;
2273   vassert(nbuf >= 32);
2274   vassert(mode64 == True);
2275
2276   /* Wrap an integer as a int register, for use assembling
2277      GrpN insns, in which the greg field is used as a sub-opcode
2278      and does not really contain a register. */
2279#  define fake(_n) mkHReg((_n), HRcInt64, False)
2280
2281   /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2282
2283   switch (i->tag) {
2284
2285   case Ain_Imm64:
2286      if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2287         /* Use the short form (load into 32 bit reg, + default
2288            widening rule) for constants under 1 million.  We could
2289            use this form for the range 0 to 0x7FFFFFFF inclusive, but
2290            limit it to a smaller range for verifiability purposes. */
2291         if (1 & iregBit3(i->Ain.Imm64.dst))
2292            *p++ = 0x41;
2293         *p++ = 0xB8 + iregBits210(i->Ain.Imm64.dst);
2294         p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2295      } else {
2296         *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Imm64.dst)));
2297         *p++ = toUChar(0xB8 + iregBits210(i->Ain.Imm64.dst));
2298         p = emit64(p, i->Ain.Imm64.imm64);
2299      }
2300      goto done;
2301
2302   case Ain_Alu64R:
2303      /* Deal specially with MOV */
2304      if (i->Ain.Alu64R.op == Aalu_MOV) {
2305         switch (i->Ain.Alu64R.src->tag) {
2306            case Armi_Imm:
2307               if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2308                  /* Actually we could use this form for constants in
2309                     the range 0 through 0x7FFFFFFF inclusive, but
2310                     limit it to a small range for verifiability
2311                     purposes. */
2312                  /* Generate "movl $imm32, 32-bit-register" and let
2313                     the default zero-extend rule cause the upper half
2314                     of the dst to be zeroed out too.  This saves 1
2315                     and sometimes 2 bytes compared to the more
2316                     obvious encoding in the 'else' branch. */
2317                  if (1 & iregBit3(i->Ain.Alu64R.dst))
2318                     *p++ = 0x41;
2319                  *p++ = 0xB8 + iregBits210(i->Ain.Alu64R.dst);
2320                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2321               } else {
2322                  *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Alu64R.dst)));
2323                  *p++ = 0xC7;
2324                  *p++ = toUChar(0xC0 + iregBits210(i->Ain.Alu64R.dst));
2325                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2326               }
2327               goto done;
2328            case Armi_Reg:
2329               *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2330                                  i->Ain.Alu64R.dst );
2331               *p++ = 0x89;
2332               p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2333                                i->Ain.Alu64R.dst);
2334               goto done;
2335            case Armi_Mem:
2336               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2337                                 i->Ain.Alu64R.src->Armi.Mem.am);
2338               *p++ = 0x8B;
2339               p = doAMode_M(p, i->Ain.Alu64R.dst,
2340                                i->Ain.Alu64R.src->Armi.Mem.am);
2341               goto done;
2342            default:
2343               goto bad;
2344         }
2345      }
2346      /* MUL */
2347      if (i->Ain.Alu64R.op == Aalu_MUL) {
2348         switch (i->Ain.Alu64R.src->tag) {
2349            case Armi_Reg:
2350               *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2351                                  i->Ain.Alu64R.src->Armi.Reg.reg);
2352               *p++ = 0x0F;
2353               *p++ = 0xAF;
2354               p = doAMode_R(p, i->Ain.Alu64R.dst,
2355                                i->Ain.Alu64R.src->Armi.Reg.reg);
2356               goto done;
2357            case Armi_Mem:
2358               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2359                                 i->Ain.Alu64R.src->Armi.Mem.am);
2360               *p++ = 0x0F;
2361               *p++ = 0xAF;
2362               p = doAMode_M(p, i->Ain.Alu64R.dst,
2363                                i->Ain.Alu64R.src->Armi.Mem.am);
2364               goto done;
2365            case Armi_Imm:
2366               if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2367                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2368                  *p++ = 0x6B;
2369                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2370                  *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2371               } else {
2372                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2373                  *p++ = 0x69;
2374                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2375                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2376               }
2377               goto done;
2378            default:
2379               goto bad;
2380         }
2381      }
2382      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2383      opc = opc_rr = subopc_imm = opc_imma = 0;
2384      switch (i->Ain.Alu64R.op) {
2385         case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2386                        subopc_imm = 2; opc_imma = 0x15; break;
2387         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2388                        subopc_imm = 0; opc_imma = 0x05; break;
2389         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2390                        subopc_imm = 5; opc_imma = 0x2D; break;
2391         case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2392                        subopc_imm = 3; opc_imma = 0x1D; break;
2393         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2394                        subopc_imm = 4; opc_imma = 0x25; break;
2395         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2396                        subopc_imm = 6; opc_imma = 0x35; break;
2397         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2398                        subopc_imm = 1; opc_imma = 0x0D; break;
2399         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2400                        subopc_imm = 7; opc_imma = 0x3D; break;
2401         default: goto bad;
2402      }
2403      switch (i->Ain.Alu64R.src->tag) {
2404         case Armi_Imm:
2405            if (i->Ain.Alu64R.dst == hregAMD64_RAX()
2406                && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2407               goto bad; /* FIXME: awaiting test case */
2408               *p++ = toUChar(opc_imma);
2409               p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2410            } else
2411            if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2412               *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst );
2413               *p++ = 0x83;
2414               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
2415               *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2416            } else {
2417               *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst);
2418               *p++ = 0x81;
2419               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
2420               p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2421            }
2422            goto done;
2423         case Armi_Reg:
2424            *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2425                               i->Ain.Alu64R.dst);
2426            *p++ = toUChar(opc_rr);
2427            p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2428                             i->Ain.Alu64R.dst);
2429            goto done;
2430         case Armi_Mem:
2431            *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2432                               i->Ain.Alu64R.src->Armi.Mem.am);
2433            *p++ = toUChar(opc);
2434            p = doAMode_M(p, i->Ain.Alu64R.dst,
2435                             i->Ain.Alu64R.src->Armi.Mem.am);
2436            goto done;
2437         default:
2438            goto bad;
2439      }
2440      break;
2441
2442   case Ain_Alu64M:
2443      /* Deal specially with MOV */
2444      if (i->Ain.Alu64M.op == Aalu_MOV) {
2445         switch (i->Ain.Alu64M.src->tag) {
2446            case Ari_Reg:
2447               *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2448                                 i->Ain.Alu64M.dst);
2449               *p++ = 0x89;
2450               p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2451                                i->Ain.Alu64M.dst);
2452               goto done;
2453            case Ari_Imm:
2454               *p++ = rexAMode_M(fake(0), i->Ain.Alu64M.dst);
2455               *p++ = 0xC7;
2456               p = doAMode_M(p, fake(0), i->Ain.Alu64M.dst);
2457               p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2458               goto done;
2459            default:
2460               goto bad;
2461         }
2462      }
2463      break;
2464
2465   case Ain_Sh64:
2466      opc_cl = opc_imm = subopc = 0;
2467      switch (i->Ain.Sh64.op) {
2468         case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2469         case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2470         case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2471         default: goto bad;
2472      }
2473      if (i->Ain.Sh64.src == 0) {
2474         *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
2475         *p++ = toUChar(opc_cl);
2476         p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
2477         goto done;
2478      } else {
2479         *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
2480         *p++ = toUChar(opc_imm);
2481         p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
2482         *p++ = (UChar)(i->Ain.Sh64.src);
2483         goto done;
2484      }
2485      break;
2486
2487   case Ain_Test64:
2488      /* testq sign-extend($imm32), %reg */
2489      *p++ = rexAMode_R(fake(0), i->Ain.Test64.dst);
2490      *p++ = 0xF7;
2491      p = doAMode_R(p, fake(0), i->Ain.Test64.dst);
2492      p = emit32(p, i->Ain.Test64.imm32);
2493      goto done;
2494
2495   case Ain_Unary64:
2496      if (i->Ain.Unary64.op == Aun_NOT) {
2497         *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
2498         *p++ = 0xF7;
2499         p = doAMode_R(p, fake(2), i->Ain.Unary64.dst);
2500         goto done;
2501      }
2502      if (i->Ain.Unary64.op == Aun_NEG) {
2503         *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
2504         *p++ = 0xF7;
2505         p = doAMode_R(p, fake(3), i->Ain.Unary64.dst);
2506         goto done;
2507      }
2508      break;
2509
2510   case Ain_Lea64:
2511      *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2512      *p++ = 0x8D;
2513      p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2514      goto done;
2515
2516   case Ain_Alu32R:
2517      /* ADD/SUB/AND/OR/XOR/CMP */
2518      opc = opc_rr = subopc_imm = opc_imma = 0;
2519      switch (i->Ain.Alu32R.op) {
2520         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2521                        subopc_imm = 0; opc_imma = 0x05; break;
2522         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2523                        subopc_imm = 5; opc_imma = 0x2D; break;
2524         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2525                        subopc_imm = 4; opc_imma = 0x25; break;
2526         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2527                        subopc_imm = 6; opc_imma = 0x35; break;
2528         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2529                        subopc_imm = 1; opc_imma = 0x0D; break;
2530         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2531                        subopc_imm = 7; opc_imma = 0x3D; break;
2532         default: goto bad;
2533      }
2534      switch (i->Ain.Alu32R.src->tag) {
2535         case Armi_Imm:
2536            if (i->Ain.Alu32R.dst == hregAMD64_RAX()
2537                && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2538               goto bad; /* FIXME: awaiting test case */
2539               *p++ = toUChar(opc_imma);
2540               p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2541            } else
2542            if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2543               rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst ) );
2544               if (rex != 0x40) *p++ = rex;
2545               *p++ = 0x83;
2546               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
2547               *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2548            } else {
2549               rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst) );
2550               if (rex != 0x40) *p++ = rex;
2551               *p++ = 0x81;
2552               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
2553               p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2554            }
2555            goto done;
2556         case Armi_Reg:
2557            rex  = clearWBit(
2558                   rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2559                               i->Ain.Alu32R.dst) );
2560            if (rex != 0x40) *p++ = rex;
2561            *p++ = toUChar(opc_rr);
2562            p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2563                             i->Ain.Alu32R.dst);
2564            goto done;
2565         case Armi_Mem:
2566            rex  = clearWBit(
2567                   rexAMode_M( i->Ain.Alu32R.dst,
2568                               i->Ain.Alu32R.src->Armi.Mem.am) );
2569            if (rex != 0x40) *p++ = rex;
2570            *p++ = toUChar(opc);
2571            p = doAMode_M(p, i->Ain.Alu32R.dst,
2572                             i->Ain.Alu32R.src->Armi.Mem.am);
2573            goto done;
2574         default:
2575            goto bad;
2576      }
2577      break;
2578
2579   case Ain_MulL:
2580      subopc = i->Ain.MulL.syned ? 5 : 4;
2581      switch (i->Ain.MulL.src->tag)  {
2582         case Arm_Mem:
2583            *p++ = rexAMode_M( fake(0),
2584                               i->Ain.MulL.src->Arm.Mem.am);
2585            *p++ = 0xF7;
2586            p = doAMode_M(p, fake(subopc),
2587                             i->Ain.MulL.src->Arm.Mem.am);
2588            goto done;
2589         case Arm_Reg:
2590            *p++ = rexAMode_R(fake(0),
2591                              i->Ain.MulL.src->Arm.Reg.reg);
2592            *p++ = 0xF7;
2593            p = doAMode_R(p, fake(subopc),
2594                             i->Ain.MulL.src->Arm.Reg.reg);
2595            goto done;
2596         default:
2597            goto bad;
2598      }
2599      break;
2600
2601   case Ain_Div:
2602      subopc = i->Ain.Div.syned ? 7 : 6;
2603      if (i->Ain.Div.sz == 4) {
2604         switch (i->Ain.Div.src->tag)  {
2605            case Arm_Mem:
2606               goto bad;
2607               /*FIXME*/
2608               *p++ = 0xF7;
2609               p = doAMode_M(p, fake(subopc),
2610                                i->Ain.Div.src->Arm.Mem.am);
2611               goto done;
2612            case Arm_Reg:
2613               *p++ = clearWBit(
2614                      rexAMode_R( fake(0), i->Ain.Div.src->Arm.Reg.reg));
2615               *p++ = 0xF7;
2616               p = doAMode_R(p, fake(subopc),
2617                                i->Ain.Div.src->Arm.Reg.reg);
2618               goto done;
2619            default:
2620               goto bad;
2621         }
2622      }
2623      if (i->Ain.Div.sz == 8) {
2624         switch (i->Ain.Div.src->tag)  {
2625            case Arm_Mem:
2626               *p++ = rexAMode_M( fake(0),
2627                                  i->Ain.Div.src->Arm.Mem.am);
2628               *p++ = 0xF7;
2629               p = doAMode_M(p, fake(subopc),
2630                                i->Ain.Div.src->Arm.Mem.am);
2631               goto done;
2632            case Arm_Reg:
2633               *p++ = rexAMode_R( fake(0),
2634                                  i->Ain.Div.src->Arm.Reg.reg);
2635               *p++ = 0xF7;
2636               p = doAMode_R(p, fake(subopc),
2637                                i->Ain.Div.src->Arm.Reg.reg);
2638               goto done;
2639            default:
2640               goto bad;
2641         }
2642      }
2643      break;
2644
2645   case Ain_Push:
2646      switch (i->Ain.Push.src->tag) {
2647         case Armi_Mem:
2648            *p++ = clearWBit(
2649                   rexAMode_M(fake(0), i->Ain.Push.src->Armi.Mem.am));
2650            *p++ = 0xFF;
2651            p = doAMode_M(p, fake(6), i->Ain.Push.src->Armi.Mem.am);
2652            goto done;
2653         case Armi_Imm:
2654            *p++ = 0x68;
2655            p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
2656            goto done;
2657         case Armi_Reg:
2658            *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.Push.src->Armi.Reg.reg)));
2659            *p++ = toUChar(0x50 + iregBits210(i->Ain.Push.src->Armi.Reg.reg));
2660            goto done;
2661        default:
2662            goto bad;
2663      }
2664
2665   case Ain_Call: {
2666      /* As per detailed comment for Ain_Call in
2667         getRegUsage_AMD64Instr above, %r11 is used as an address
2668         temporary. */
2669      /* jump over the following two insns if the condition does not
2670         hold */
2671      Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
2672      if (i->Ain.Call.cond != Acc_ALWAYS) {
2673         *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2674         *p++ = shortImm ? 10 : 13;
2675         /* 10 or 13 bytes in the next two insns */
2676      }
2677      if (shortImm) {
2678         /* 7 bytes: movl sign-extend(imm32), %r11 */
2679         *p++ = 0x49;
2680         *p++ = 0xC7;
2681         *p++ = 0xC3;
2682         p = emit32(p, (UInt)i->Ain.Call.target);
2683      } else {
2684         /* 10 bytes: movabsq $target, %r11 */
2685         *p++ = 0x49;
2686         *p++ = 0xBB;
2687         p = emit64(p, i->Ain.Call.target);
2688      }
2689      /* 3 bytes: call *%r11 */
2690      *p++ = 0x41;
2691      *p++ = 0xFF;
2692      *p++ = 0xD3;
2693      goto done;
2694   }
2695
2696   case Ain_XDirect: {
2697      /* NB: what goes on here has to be very closely coordinated with the
2698         chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
2699      /* We're generating chain-me requests here, so we need to be
2700         sure this is actually allowed -- no-redir translations can't
2701         use chain-me's.  Hence: */
2702      vassert(disp_cp_chain_me_to_slowEP != NULL);
2703      vassert(disp_cp_chain_me_to_fastEP != NULL);
2704
2705      HReg r11 = hregAMD64_R11();
2706
2707      /* Use ptmp for backpatching conditional jumps. */
2708      ptmp = NULL;
2709
2710      /* First off, if this is conditional, create a conditional
2711         jump over the rest of it. */
2712      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2713         /* jmp fwds if !condition */
2714         *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
2715         ptmp = p; /* fill in this bit later */
2716         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2717      }
2718
2719      /* Update the guest RIP. */
2720      if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
2721         /* use a shorter encoding */
2722         /* movl sign-extend(dstGA), %r11 */
2723         *p++ = 0x49;
2724         *p++ = 0xC7;
2725         *p++ = 0xC3;
2726         p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
2727      } else {
2728         /* movabsq $dstGA, %r11 */
2729         *p++ = 0x49;
2730         *p++ = 0xBB;
2731         p = emit64(p, i->Ain.XDirect.dstGA);
2732      }
2733
2734      /* movq %r11, amRIP */
2735      *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
2736      *p++ = 0x89;
2737      p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
2738
2739      /* --- FIRST PATCHABLE BYTE follows --- */
2740      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
2741         to) backs up the return address, so as to find the address of
2742         the first patchable byte.  So: don't change the length of the
2743         two instructions below. */
2744      /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
2745      *p++ = 0x49;
2746      *p++ = 0xBB;
2747      void* disp_cp_chain_me
2748               = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
2749                                         : disp_cp_chain_me_to_slowEP;
2750      p = emit64(p, Ptr_to_ULong(disp_cp_chain_me));
2751      /* call *%r11 */
2752      *p++ = 0x41;
2753      *p++ = 0xFF;
2754      *p++ = 0xD3;
2755      /* --- END of PATCHABLE BYTES --- */
2756
2757      /* Fix up the conditional jump, if there was one. */
2758      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2759         Int delta = p - ptmp;
2760         vassert(delta > 0 && delta < 40);
2761         *ptmp = toUChar(delta-1);
2762      }
2763      goto done;
2764   }
2765
2766   case Ain_XIndir: {
2767      /* We're generating transfers that could lead indirectly to a
2768         chain-me, so we need to be sure this is actually allowed --
2769         no-redir translations are not allowed to reach normal
2770         translations without going through the scheduler.  That means
2771         no XDirects or XIndirs out from no-redir translations.
2772         Hence: */
2773      vassert(disp_cp_xindir != NULL);
2774
2775      /* Use ptmp for backpatching conditional jumps. */
2776      ptmp = NULL;
2777
2778      /* First off, if this is conditional, create a conditional
2779         jump over the rest of it. */
2780      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2781         /* jmp fwds if !condition */
2782         *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
2783         ptmp = p; /* fill in this bit later */
2784         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2785      }
2786
2787      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2788      *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2789      *p++ = 0x89;
2790      p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2791
2792      /* get $disp_cp_xindir into %r11 */
2793      if (fitsIn32Bits(Ptr_to_ULong(disp_cp_xindir))) {
2794         /* use a shorter encoding */
2795         /* movl sign-extend(disp_cp_xindir), %r11 */
2796         *p++ = 0x49;
2797         *p++ = 0xC7;
2798         *p++ = 0xC3;
2799         p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
2800      } else {
2801         /* movabsq $disp_cp_xindir, %r11 */
2802         *p++ = 0x49;
2803         *p++ = 0xBB;
2804         p = emit64(p, Ptr_to_ULong(disp_cp_xindir));
2805      }
2806
2807      /* jmp *%r11 */
2808      *p++ = 0x41;
2809      *p++ = 0xFF;
2810      *p++ = 0xE3;
2811
2812      /* Fix up the conditional jump, if there was one. */
2813      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2814         Int delta = p - ptmp;
2815         vassert(delta > 0 && delta < 40);
2816         *ptmp = toUChar(delta-1);
2817      }
2818      goto done;
2819   }
2820
2821   case Ain_XAssisted: {
2822      /* Use ptmp for backpatching conditional jumps. */
2823      ptmp = NULL;
2824
2825      /* First off, if this is conditional, create a conditional
2826         jump over the rest of it. */
2827      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
2828         /* jmp fwds if !condition */
2829         *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
2830         ptmp = p; /* fill in this bit later */
2831         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2832      }
2833
2834      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2835      *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2836      *p++ = 0x89;
2837      p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2838      /* movl $magic_number, %ebp.  Since these numbers are all small positive
2839         integers, we can get away with "movl $N, %ebp" rather than
2840         the longer "movq $N, %rbp". */
2841      UInt trcval = 0;
2842      switch (i->Ain.XAssisted.jk) {
2843         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
2844         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
2845         case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
2846         case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
2847         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
2848         case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
2849         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
2850         case Ijk_TInval:      trcval = VEX_TRC_JMP_TINVAL;      break;
2851         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
2852         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
2853         case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
2854         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
2855         /* We don't expect to see the following being assisted. */
2856         case Ijk_Ret:
2857         case Ijk_Call:
2858         /* fallthrough */
2859         default:
2860            ppIRJumpKind(i->Ain.XAssisted.jk);
2861            vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
2862      }
2863      vassert(trcval != 0);
2864      *p++ = 0xBD;
2865      p = emit32(p, trcval);
2866      /* movabsq $disp_assisted, %r11 */
2867      *p++ = 0x49;
2868      *p++ = 0xBB;
2869      p = emit64(p, Ptr_to_ULong(disp_cp_xassisted));
2870      /* jmp *%r11 */
2871      *p++ = 0x41;
2872      *p++ = 0xFF;
2873      *p++ = 0xE3;
2874
2875      /* Fix up the conditional jump, if there was one. */
2876      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
2877         Int delta = p - ptmp;
2878         vassert(delta > 0 && delta < 40);
2879         *ptmp = toUChar(delta-1);
2880      }
2881      goto done;
2882   }
2883
2884   case Ain_CMov64:
2885      vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
2886      if (i->Ain.CMov64.src->tag == Arm_Reg) {
2887         *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
2888         *p++ = 0x0F;
2889         *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
2890         p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
2891         goto done;
2892      }
2893      if (i->Ain.CMov64.src->tag == Arm_Mem) {
2894         *p++ = rexAMode_M(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
2895         *p++ = 0x0F;
2896         *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
2897         p = doAMode_M(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
2898         goto done;
2899      }
2900      break;
2901
2902   case Ain_MovxLQ:
2903      /* No, _don't_ ask me why the sense of the args has to be
2904         different in the S vs Z case.  I don't know. */
2905      if (i->Ain.MovxLQ.syned) {
2906         /* Need REX.W = 1 here, but rexAMode_R does that for us. */
2907         *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
2908         *p++ = 0x63;
2909         p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
2910      } else {
2911         /* Produce a 32-bit reg-reg move, since the implicit
2912            zero-extend does what we want. */
2913         *p++ = clearWBit (
2914                   rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
2915         *p++ = 0x89;
2916         p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
2917      }
2918      goto done;
2919
2920   case Ain_LoadEX:
2921      if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
2922         /* movzbq */
2923         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2924         *p++ = 0x0F;
2925         *p++ = 0xB6;
2926         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2927         goto done;
2928      }
2929      if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
2930         /* movzwq */
2931         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2932         *p++ = 0x0F;
2933         *p++ = 0xB7;
2934         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2935         goto done;
2936      }
2937      if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
2938         /* movzlq */
2939         /* This isn't really an existing AMD64 instruction per se.
2940            Rather, we have to do a 32-bit load.  Because a 32-bit
2941            write implicitly clears the upper 32 bits of the target
2942            register, we get what we want. */
2943         *p++ = clearWBit(
2944                rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
2945         *p++ = 0x8B;
2946         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
2947         goto done;
2948      }
2949      break;
2950
2951   case Ain_Set64:
2952      /* Make the destination register be 1 or 0, depending on whether
2953         the relevant condition holds.  Complication: the top 56 bits
2954         of the destination should be forced to zero, but doing 'xorq
2955         %r,%r' kills the flag(s) we are about to read.  Sigh.  So
2956         start off my moving $0 into the dest. */
2957      reg = iregBits3210(i->Ain.Set64.dst);
2958      vassert(reg < 16);
2959
2960      /* movq $0, %dst */
2961      *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
2962      *p++ = 0xC7;
2963      *p++ = toUChar(0xC0 + (reg & 7));
2964      p = emit32(p, 0);
2965
2966      /* setb lo8(%dst) */
2967      /* note, 8-bit register rex trickyness.  Be careful here. */
2968      *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
2969      *p++ = 0x0F;
2970      *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
2971      *p++ = toUChar(0xC0 + (reg & 7));
2972      goto done;
2973
2974   case Ain_Bsfr64:
2975      *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
2976      *p++ = 0x0F;
2977      if (i->Ain.Bsfr64.isFwds) {
2978         *p++ = 0xBC;
2979      } else {
2980         *p++ = 0xBD;
2981      }
2982      p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
2983      goto done;
2984
2985   case Ain_MFence:
2986      /* mfence */
2987      *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
2988      goto done;
2989
2990   case Ain_ACAS:
2991      /* lock */
2992      *p++ = 0xF0;
2993      if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
2994      /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
2995         in %rbx.  The new-value register is hardwired to be %rbx
2996         since dealing with byte integer registers is too much hassle,
2997         so we force the register operand to %rbx (could equally be
2998         %rcx or %rdx). */
2999      rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3000      if (i->Ain.ACAS.sz != 8)
3001         rex = clearWBit(rex);
3002
3003      *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3004      *p++ = 0x0F;
3005      if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3006      p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
3007      goto done;
3008
3009   case Ain_DACAS:
3010      /* lock */
3011      *p++ = 0xF0;
3012      /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
3013         value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
3014         aren't encoded in the insn. */
3015      rex = rexAMode_M( fake(1), i->Ain.ACAS.addr );
3016      if (i->Ain.ACAS.sz != 8)
3017         rex = clearWBit(rex);
3018      *p++ = rex;
3019      *p++ = 0x0F;
3020      *p++ = 0xC7;
3021      p = doAMode_M(p, fake(1), i->Ain.DACAS.addr);
3022      goto done;
3023
3024   case Ain_A87Free:
3025      vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3026      for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3027         p = do_ffree_st(p, 7-j);
3028      }
3029      goto done;
3030
3031   case Ain_A87PushPop:
3032      vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3033      if (i->Ain.A87PushPop.isPush) {
3034         /* Load from memory into %st(0): flds/fldl amode */
3035         *p++ = clearWBit(
3036                   rexAMode_M(fake(0), i->Ain.A87PushPop.addr) );
3037         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3038	 p = doAMode_M(p, fake(0)/*subopcode*/, i->Ain.A87PushPop.addr);
3039      } else {
3040         /* Dump %st(0) to memory: fstps/fstpl amode */
3041         *p++ = clearWBit(
3042                   rexAMode_M(fake(3), i->Ain.A87PushPop.addr) );
3043         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3044         p = doAMode_M(p, fake(3)/*subopcode*/, i->Ain.A87PushPop.addr);
3045         goto done;
3046      }
3047      goto done;
3048
3049   case Ain_A87FpOp:
3050      switch (i->Ain.A87FpOp.op) {
3051         case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
3052         case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
3053         case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
3054         case Afp_TAN:    *p++ = 0xD9; *p++ = 0xF2; break;
3055         case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
3056         case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
3057         case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
3058         case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
3059         case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
3060         case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3061         case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
3062         case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
3063         default: goto bad;
3064      }
3065      goto done;
3066
3067   case Ain_A87LdCW:
3068      *p++ = clearWBit(
3069                rexAMode_M(fake(5), i->Ain.A87LdCW.addr) );
3070      *p++ = 0xD9;
3071      p = doAMode_M(p, fake(5)/*subopcode*/, i->Ain.A87LdCW.addr);
3072      goto done;
3073
3074   case Ain_A87StSW:
3075      *p++ = clearWBit(
3076                rexAMode_M(fake(7), i->Ain.A87StSW.addr) );
3077      *p++ = 0xDD;
3078      p = doAMode_M(p, fake(7)/*subopcode*/, i->Ain.A87StSW.addr);
3079      goto done;
3080
3081   case Ain_Store:
3082      if (i->Ain.Store.sz == 2) {
3083         /* This just goes to show the crazyness of the instruction
3084            set encoding.  We have to insert two prefix bytes, but be
3085            careful to avoid a conflict in what the size should be, by
3086            ensuring that REX.W = 0. */
3087         *p++ = 0x66; /* override to 16-bits */
3088	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3089         *p++ = 0x89;
3090         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3091         goto done;
3092      }
3093      if (i->Ain.Store.sz == 4) {
3094	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3095         *p++ = 0x89;
3096         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3097         goto done;
3098      }
3099      if (i->Ain.Store.sz == 1) {
3100         /* This is one place where it would be wrong to skip emitting
3101            a rex byte of 0x40, since the mere presence of rex changes
3102            the meaning of the byte register access.  Be careful. */
3103	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3104         *p++ = 0x88;
3105         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3106         goto done;
3107      }
3108      break;
3109
3110   case Ain_LdMXCSR:
3111      *p++ = clearWBit(rexAMode_M( fake(0), i->Ain.LdMXCSR.addr));
3112      *p++ = 0x0F;
3113      *p++ = 0xAE;
3114      p = doAMode_M(p, fake(2)/*subopcode*/, i->Ain.LdMXCSR.addr);
3115      goto done;
3116
3117   case Ain_SseUComIS:
3118      /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
3119      /* ucomi[sd] %srcL, %srcR */
3120      if (i->Ain.SseUComIS.sz == 8) {
3121         *p++ = 0x66;
3122      } else {
3123         goto bad;
3124         vassert(i->Ain.SseUComIS.sz == 4);
3125      }
3126      *p++ = clearWBit (
3127             rexAMode_R( vreg2ireg(i->Ain.SseUComIS.srcL),
3128                         vreg2ireg(i->Ain.SseUComIS.srcR) ));
3129      *p++ = 0x0F;
3130      *p++ = 0x2E;
3131      p = doAMode_R(p, vreg2ireg(i->Ain.SseUComIS.srcL),
3132                       vreg2ireg(i->Ain.SseUComIS.srcR) );
3133      /* pushfq */
3134      *p++ = 0x9C;
3135      /* popq %dst */
3136      *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.SseUComIS.dst)));
3137      *p++ = toUChar(0x58 + iregBits210(i->Ain.SseUComIS.dst));
3138      goto done;
3139
3140   case Ain_SseSI2SF:
3141      /* cvssi2s[sd] %src, %dst */
3142      rex = rexAMode_R( vreg2ireg(i->Ain.SseSI2SF.dst),
3143                        i->Ain.SseSI2SF.src );
3144      *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3145      *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3146      *p++ = 0x0F;
3147      *p++ = 0x2A;
3148      p = doAMode_R( p, vreg2ireg(i->Ain.SseSI2SF.dst),
3149                        i->Ain.SseSI2SF.src );
3150      goto done;
3151
3152   case Ain_SseSF2SI:
3153      /* cvss[sd]2si %src, %dst */
3154      rex = rexAMode_R( i->Ain.SseSF2SI.dst,
3155                        vreg2ireg(i->Ain.SseSF2SI.src) );
3156      *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3157      *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3158      *p++ = 0x0F;
3159      *p++ = 0x2D;
3160      p = doAMode_R( p, i->Ain.SseSF2SI.dst,
3161                        vreg2ireg(i->Ain.SseSF2SI.src) );
3162      goto done;
3163
3164   case Ain_SseSDSS:
3165      /* cvtsd2ss/cvtss2sd %src, %dst */
3166      *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3167      *p++ = clearWBit(
3168              rexAMode_R( vreg2ireg(i->Ain.SseSDSS.dst),
3169                          vreg2ireg(i->Ain.SseSDSS.src) ));
3170      *p++ = 0x0F;
3171      *p++ = 0x5A;
3172      p = doAMode_R( p, vreg2ireg(i->Ain.SseSDSS.dst),
3173                        vreg2ireg(i->Ain.SseSDSS.src) );
3174      goto done;
3175
3176   case Ain_SseLdSt:
3177      if (i->Ain.SseLdSt.sz == 8) {
3178         *p++ = 0xF2;
3179      } else
3180      if (i->Ain.SseLdSt.sz == 4) {
3181         *p++ = 0xF3;
3182      } else
3183      if (i->Ain.SseLdSt.sz != 16) {
3184         vassert(0);
3185      }
3186      *p++ = clearWBit(
3187             rexAMode_M( vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr));
3188      *p++ = 0x0F;
3189      *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3190      p = doAMode_M(p, vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr);
3191      goto done;
3192
3193   case Ain_SseLdzLO:
3194      vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3195      /* movs[sd] amode, %xmm-dst */
3196      *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3197      *p++ = clearWBit(
3198             rexAMode_M(vreg2ireg(i->Ain.SseLdzLO.reg),
3199                        i->Ain.SseLdzLO.addr));
3200      *p++ = 0x0F;
3201      *p++ = 0x10;
3202      p = doAMode_M(p, vreg2ireg(i->Ain.SseLdzLO.reg),
3203                       i->Ain.SseLdzLO.addr);
3204      goto done;
3205
3206   case Ain_Sse32Fx4:
3207      xtra = 0;
3208      *p++ = clearWBit(
3209             rexAMode_R( vreg2ireg(i->Ain.Sse32Fx4.dst),
3210                         vreg2ireg(i->Ain.Sse32Fx4.src) ));
3211      *p++ = 0x0F;
3212      switch (i->Ain.Sse32Fx4.op) {
3213         case Asse_ADDF:   *p++ = 0x58; break;
3214         case Asse_DIVF:   *p++ = 0x5E; break;
3215         case Asse_MAXF:   *p++ = 0x5F; break;
3216         case Asse_MINF:   *p++ = 0x5D; break;
3217         case Asse_MULF:   *p++ = 0x59; break;
3218         case Asse_RCPF:   *p++ = 0x53; break;
3219         case Asse_RSQRTF: *p++ = 0x52; break;
3220         case Asse_SQRTF:  *p++ = 0x51; break;
3221         case Asse_SUBF:   *p++ = 0x5C; break;
3222         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3223         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3224         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3225         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3226         default: goto bad;
3227      }
3228      p = doAMode_R(p, vreg2ireg(i->Ain.Sse32Fx4.dst),
3229                       vreg2ireg(i->Ain.Sse32Fx4.src) );
3230      if (xtra & 0x100)
3231         *p++ = toUChar(xtra & 0xFF);
3232      goto done;
3233
3234   case Ain_Sse64Fx2:
3235      xtra = 0;
3236      *p++ = 0x66;
3237      *p++ = clearWBit(
3238             rexAMode_R( vreg2ireg(i->Ain.Sse64Fx2.dst),
3239                         vreg2ireg(i->Ain.Sse64Fx2.src) ));
3240      *p++ = 0x0F;
3241      switch (i->Ain.Sse64Fx2.op) {
3242         case Asse_ADDF:   *p++ = 0x58; break;
3243         case Asse_DIVF:   *p++ = 0x5E; break;
3244         case Asse_MAXF:   *p++ = 0x5F; break;
3245         case Asse_MINF:   *p++ = 0x5D; break;
3246         case Asse_MULF:   *p++ = 0x59; break;
3247         case Asse_SQRTF:  *p++ = 0x51; break;
3248         case Asse_SUBF:   *p++ = 0x5C; break;
3249         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3250         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3251         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3252         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3253         default: goto bad;
3254      }
3255      p = doAMode_R(p, vreg2ireg(i->Ain.Sse64Fx2.dst),
3256                       vreg2ireg(i->Ain.Sse64Fx2.src) );
3257      if (xtra & 0x100)
3258         *p++ = toUChar(xtra & 0xFF);
3259      goto done;
3260
3261   case Ain_Sse32FLo:
3262      xtra = 0;
3263      *p++ = 0xF3;
3264      *p++ = clearWBit(
3265             rexAMode_R( vreg2ireg(i->Ain.Sse32FLo.dst),
3266                         vreg2ireg(i->Ain.Sse32FLo.src) ));
3267      *p++ = 0x0F;
3268      switch (i->Ain.Sse32FLo.op) {
3269         case Asse_ADDF:   *p++ = 0x58; break;
3270         case Asse_DIVF:   *p++ = 0x5E; break;
3271         case Asse_MAXF:   *p++ = 0x5F; break;
3272         case Asse_MINF:   *p++ = 0x5D; break;
3273         case Asse_MULF:   *p++ = 0x59; break;
3274         case Asse_RCPF:   *p++ = 0x53; break;
3275         case Asse_RSQRTF: *p++ = 0x52; break;
3276         case Asse_SQRTF:  *p++ = 0x51; break;
3277         case Asse_SUBF:   *p++ = 0x5C; break;
3278         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3279         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3280         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3281         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3282         default: goto bad;
3283      }
3284      p = doAMode_R(p, vreg2ireg(i->Ain.Sse32FLo.dst),
3285                       vreg2ireg(i->Ain.Sse32FLo.src) );
3286      if (xtra & 0x100)
3287         *p++ = toUChar(xtra & 0xFF);
3288      goto done;
3289
3290   case Ain_Sse64FLo:
3291      xtra = 0;
3292      *p++ = 0xF2;
3293      *p++ = clearWBit(
3294             rexAMode_R( vreg2ireg(i->Ain.Sse64FLo.dst),
3295                         vreg2ireg(i->Ain.Sse64FLo.src) ));
3296      *p++ = 0x0F;
3297      switch (i->Ain.Sse64FLo.op) {
3298         case Asse_ADDF:   *p++ = 0x58; break;
3299         case Asse_DIVF:   *p++ = 0x5E; break;
3300         case Asse_MAXF:   *p++ = 0x5F; break;
3301         case Asse_MINF:   *p++ = 0x5D; break;
3302         case Asse_MULF:   *p++ = 0x59; break;
3303         case Asse_SQRTF:  *p++ = 0x51; break;
3304         case Asse_SUBF:   *p++ = 0x5C; break;
3305         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3306         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3307         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3308         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3309         default: goto bad;
3310      }
3311      p = doAMode_R(p, vreg2ireg(i->Ain.Sse64FLo.dst),
3312                       vreg2ireg(i->Ain.Sse64FLo.src) );
3313      if (xtra & 0x100)
3314         *p++ = toUChar(xtra & 0xFF);
3315      goto done;
3316
3317   case Ain_SseReRg:
3318#     define XX(_n) *p++ = (_n)
3319
3320      rex = clearWBit(
3321            rexAMode_R( vreg2ireg(i->Ain.SseReRg.dst),
3322                        vreg2ireg(i->Ain.SseReRg.src) ));
3323
3324      switch (i->Ain.SseReRg.op) {
3325         case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3326         case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
3327         case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
3328         case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
3329         case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
3330         case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3331         case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3332         case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3333         case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3334         case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3335         case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3336         case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3337         case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3338         case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3339         case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3340         case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3341         case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3342         case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3343         case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3344         case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3345         case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3346         case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3347         case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3348         case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3349         case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3350         case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3351         case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3352         case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3353         case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3354         case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3355         case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3356         case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3357         case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
3358         case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
3359         case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
3360         case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
3361         case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
3362         case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
3363         case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
3364         case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
3365         case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
3366         case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
3367         case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
3368         case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
3369         case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
3370         case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
3371         case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
3372         case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
3373         case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
3374         case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
3375         case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
3376         case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
3377         case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
3378         case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
3379         case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
3380         default: goto bad;
3381      }
3382      p = doAMode_R(p, vreg2ireg(i->Ain.SseReRg.dst),
3383                       vreg2ireg(i->Ain.SseReRg.src) );
3384#     undef XX
3385      goto done;
3386
3387   case Ain_SseCMov:
3388      /* jmp fwds if !condition */
3389      *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
3390      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3391      ptmp = p;
3392
3393      /* movaps %src, %dst */
3394      *p++ = clearWBit(
3395             rexAMode_R( vreg2ireg(i->Ain.SseCMov.dst),
3396                         vreg2ireg(i->Ain.SseCMov.src) ));
3397      *p++ = 0x0F;
3398      *p++ = 0x28;
3399      p = doAMode_R(p, vreg2ireg(i->Ain.SseCMov.dst),
3400                       vreg2ireg(i->Ain.SseCMov.src) );
3401
3402      /* Fill in the jump offset. */
3403      *(ptmp-1) = toUChar(p - ptmp);
3404      goto done;
3405
3406   case Ain_SseShuf:
3407      *p++ = 0x66;
3408      *p++ = clearWBit(
3409             rexAMode_R( vreg2ireg(i->Ain.SseShuf.dst),
3410                         vreg2ireg(i->Ain.SseShuf.src) ));
3411      *p++ = 0x0F;
3412      *p++ = 0x70;
3413      p = doAMode_R(p, vreg2ireg(i->Ain.SseShuf.dst),
3414                       vreg2ireg(i->Ain.SseShuf.src) );
3415      *p++ = (UChar)(i->Ain.SseShuf.order);
3416      goto done;
3417
3418   //uu case Ain_AvxLdSt: {
3419   //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
3420   //uu                           i->Ain.AvxLdSt.addr );
3421   //uu    p = emitVexPrefix(p, vex);
3422   //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
3423   //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
3424   //uu      goto done;
3425   //uu }
3426
3427   case Ain_EvCheck: {
3428      /* We generate:
3429            (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
3430            (2 bytes)  jns  nofail     expected taken
3431            (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
3432            nofail:
3433      */
3434      /* This is heavily asserted re instruction lengths.  It needs to
3435         be.  If we get given unexpected forms of .amCounter or
3436         .amFailAddr -- basically, anything that's not of the form
3437         uimm7(%rbp) -- they are likely to fail. */
3438      /* Note also that after the decl we must be very careful not to
3439         read the carry flag, else we get a partial flags stall.
3440         js/jns avoids that, though. */
3441      UChar* p0 = p;
3442      /* ---  decl 8(%rbp) --- */
3443      /* Need to compute the REX byte for the decl in order to prove
3444         that we don't need it, since this is a 32-bit inc and all
3445         registers involved in the amode are < r8.  "fake(1)" because
3446         there's no register in this encoding; instead the register
3447         field is used as a sub opcode.  The encoding for "decl r/m32"
3448         is FF /1, hence the fake(1). */
3449      rex = clearWBit(rexAMode_M(fake(1), i->Ain.EvCheck.amCounter));
3450      if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
3451      *p++ = 0xFF;
3452      p = doAMode_M(p, fake(1), i->Ain.EvCheck.amCounter);
3453      vassert(p - p0 == 3);
3454      /* --- jns nofail --- */
3455      *p++ = 0x79;
3456      *p++ = 0x03; /* need to check this 0x03 after the next insn */
3457      vassert(p - p0 == 5);
3458      /* --- jmp* 0(%rbp) --- */
3459      /* Once again, verify we don't need REX.  The encoding is FF /4.
3460         We don't need REX.W since by default FF /4 in 64-bit mode
3461         implies a 64 bit load. */
3462      rex = clearWBit(rexAMode_M(fake(4), i->Ain.EvCheck.amFailAddr));
3463      if (rex != 0x40) goto bad;
3464      *p++ = 0xFF;
3465      p = doAMode_M(p, fake(4), i->Ain.EvCheck.amFailAddr);
3466      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3467      /* And crosscheck .. */
3468      vassert(evCheckSzB_AMD64() == 8);
3469      goto done;
3470   }
3471
3472   case Ain_ProfInc: {
3473      /* We generate   movabsq $0, %r11
3474                       incq (%r11)
3475         in the expectation that a later call to LibVEX_patchProfCtr
3476         will be used to fill in the immediate field once the right
3477         value is known.
3478         49 BB 00 00 00 00 00 00 00 00
3479         49 FF 03
3480      */
3481      *p++ = 0x49; *p++ = 0xBB;
3482      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3483      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3484      *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
3485      /* Tell the caller .. */
3486      vassert(!(*is_profInc));
3487      *is_profInc = True;
3488      goto done;
3489   }
3490
3491   default:
3492      goto bad;
3493   }
3494
3495  bad:
3496   ppAMD64Instr(i, mode64);
3497   vpanic("emit_AMD64Instr");
3498   /*NOTREACHED*/
3499
3500  done:
3501   vassert(p - &buf[0] <= 32);
3502   return p - &buf[0];
3503
3504#  undef fake
3505}
3506
3507
3508/* How big is an event check?  See case for Ain_EvCheck in
3509   emit_AMD64Instr just above.  That crosschecks what this returns, so
3510   we can tell if we're inconsistent. */
3511Int evCheckSzB_AMD64 ( void )
3512{
3513   return 8;
3514}
3515
3516
3517/* NB: what goes on here has to be very closely coordinated with the
3518   emitInstr case for XDirect, above. */
3519VexInvalRange chainXDirect_AMD64 ( void* place_to_chain,
3520                                   void* disp_cp_chain_me_EXPECTED,
3521                                   void* place_to_jump_to )
3522{
3523   /* What we're expecting to see is:
3524        movabsq $disp_cp_chain_me_EXPECTED, %r11
3525        call *%r11
3526      viz
3527        49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
3528        41 FF D3
3529   */
3530   UChar* p = (UChar*)place_to_chain;
3531   vassert(p[0] == 0x49);
3532   vassert(p[1] == 0xBB);
3533   vassert(*(ULong*)(&p[2]) == Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
3534   vassert(p[10] == 0x41);
3535   vassert(p[11] == 0xFF);
3536   vassert(p[12] == 0xD3);
3537   /* And what we want to change it to is either:
3538        (general case):
3539          movabsq $place_to_jump_to, %r11
3540          jmpq *%r11
3541        viz
3542          49 BB <8 bytes value == place_to_jump_to>
3543          41 FF E3
3544        So it's the same length (convenient, huh) and we don't
3545        need to change all the bits.
3546      ---OR---
3547        in the case where the displacement falls within 32 bits
3548          jmpq disp32   where disp32 is relative to the next insn
3549          ud2; ud2; ud2; ud2
3550        viz
3551          E9 <4 bytes == disp32>
3552          0F 0B 0F 0B 0F 0B 0F 0B
3553
3554      In both cases the replacement has the same length as the original.
3555      To remain sane & verifiable,
3556      (1) limit the displacement for the short form to
3557          (say) +/- one billion, so as to avoid wraparound
3558          off-by-ones
3559      (2) even if the short form is applicable, once every (say)
3560          1024 times use the long form anyway, so as to maintain
3561          verifiability
3562   */
3563   /* This is the delta we need to put into a JMP d32 insn.  It's
3564      relative to the start of the next insn, hence the -5.  */
3565   Long delta   = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
3566   Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
3567
3568   static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
3569   if (shortOK) {
3570      shortCTR++; // thread safety bleh
3571      if (0 == (shortCTR & 0x3FF)) {
3572         shortOK = False;
3573         if (0)
3574            vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
3575                       "using long jmp\n", shortCTR);
3576      }
3577   }
3578
3579   /* And make the modifications. */
3580   if (shortOK) {
3581      p[0]  = 0xE9;
3582      p[1]  = (delta >> 0) & 0xFF;
3583      p[2]  = (delta >> 8) & 0xFF;
3584      p[3]  = (delta >> 16) & 0xFF;
3585      p[4]  = (delta >> 24) & 0xFF;
3586      p[5]  = 0x0F; p[6]  = 0x0B;
3587      p[7]  = 0x0F; p[8]  = 0x0B;
3588      p[9]  = 0x0F; p[10] = 0x0B;
3589      p[11] = 0x0F; p[12] = 0x0B;
3590      /* sanity check on the delta -- top 32 are all 0 or all 1 */
3591      delta >>= 32;
3592      vassert(delta == 0LL || delta == -1LL);
3593   } else {
3594      /* Minimal modifications from the starting sequence. */
3595      *(ULong*)(&p[2]) = Ptr_to_ULong(place_to_jump_to);
3596      p[12] = 0xE3;
3597   }
3598   VexInvalRange vir = {0, 0};
3599   return vir;
3600}
3601
3602
3603/* NB: what goes on here has to be very closely coordinated with the
3604   emitInstr case for XDirect, above. */
3605VexInvalRange unchainXDirect_AMD64 ( void* place_to_unchain,
3606                                     void* place_to_jump_to_EXPECTED,
3607                                     void* disp_cp_chain_me )
3608{
3609   /* What we're expecting to see is either:
3610        (general case)
3611          movabsq $place_to_jump_to_EXPECTED, %r11
3612          jmpq *%r11
3613        viz
3614          49 BB <8 bytes value == place_to_jump_to_EXPECTED>
3615          41 FF E3
3616      ---OR---
3617        in the case where the displacement falls within 32 bits
3618          jmpq d32
3619          ud2; ud2; ud2; ud2
3620        viz
3621          E9 <4 bytes == disp32>
3622          0F 0B 0F 0B 0F 0B 0F 0B
3623   */
3624   UChar* p     = (UChar*)place_to_unchain;
3625   Bool   valid = False;
3626   if (p[0] == 0x49 && p[1] == 0xBB
3627       && *(ULong*)(&p[2]) == Ptr_to_ULong(place_to_jump_to_EXPECTED)
3628       && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
3629      /* it's the long form */
3630      valid = True;
3631   }
3632   else
3633   if (p[0] == 0xE9
3634       && p[5]  == 0x0F && p[6]  == 0x0B
3635       && p[7]  == 0x0F && p[8]  == 0x0B
3636       && p[9]  == 0x0F && p[10] == 0x0B
3637       && p[11] == 0x0F && p[12] == 0x0B) {
3638      /* It's the short form.  Check the offset is right. */
3639      Int  s32 = *(Int*)(&p[1]);
3640      Long s64 = (Long)s32;
3641      if ((UChar*)p + 5 + s64 == (UChar*)place_to_jump_to_EXPECTED) {
3642         valid = True;
3643         if (0)
3644            vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
3645      }
3646   }
3647   vassert(valid);
3648   /* And what we want to change it to is:
3649        movabsq $disp_cp_chain_me, %r11
3650        call *%r11
3651      viz
3652        49 BB <8 bytes value == disp_cp_chain_me>
3653        41 FF D3
3654      So it's the same length (convenient, huh).
3655   */
3656   p[0] = 0x49;
3657   p[1] = 0xBB;
3658   *(ULong*)(&p[2]) = Ptr_to_ULong(disp_cp_chain_me);
3659   p[10] = 0x41;
3660   p[11] = 0xFF;
3661   p[12] = 0xD3;
3662   VexInvalRange vir = {0, 0};
3663   return vir;
3664}
3665
3666
3667/* Patch the counter address into a profile inc point, as previously
3668   created by the Ain_ProfInc case for emit_AMD64Instr. */
3669VexInvalRange patchProfInc_AMD64 ( void*  place_to_patch,
3670                                   ULong* location_of_counter )
3671{
3672   vassert(sizeof(ULong*) == 8);
3673   UChar* p = (UChar*)place_to_patch;
3674   vassert(p[0] == 0x49);
3675   vassert(p[1] == 0xBB);
3676   vassert(p[2] == 0x00);
3677   vassert(p[3] == 0x00);
3678   vassert(p[4] == 0x00);
3679   vassert(p[5] == 0x00);
3680   vassert(p[6] == 0x00);
3681   vassert(p[7] == 0x00);
3682   vassert(p[8] == 0x00);
3683   vassert(p[9] == 0x00);
3684   vassert(p[10] == 0x49);
3685   vassert(p[11] == 0xFF);
3686   vassert(p[12] == 0x03);
3687   ULong imm64 = (ULong)Ptr_to_ULong(location_of_counter);
3688   p[2] = imm64 & 0xFF; imm64 >>= 8;
3689   p[3] = imm64 & 0xFF; imm64 >>= 8;
3690   p[4] = imm64 & 0xFF; imm64 >>= 8;
3691   p[5] = imm64 & 0xFF; imm64 >>= 8;
3692   p[6] = imm64 & 0xFF; imm64 >>= 8;
3693   p[7] = imm64 & 0xFF; imm64 >>= 8;
3694   p[8] = imm64 & 0xFF; imm64 >>= 8;
3695   p[9] = imm64 & 0xFF; imm64 >>= 8;
3696   VexInvalRange vir = {0, 0};
3697   return vir;
3698}
3699
3700
3701/*---------------------------------------------------------------*/
3702/*--- end                                   host_amd64_defs.c ---*/
3703/*---------------------------------------------------------------*/
3704