1
2/*---------------------------------------------------------------*/
3/*--- begin                                 host_amd64_defs.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2013 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex.h"
38#include "libvex_trc_values.h"
39
40#include "main_util.h"
41#include "host_generic_regs.h"
42#include "host_amd64_defs.h"
43
44
45/* --------- Registers. --------- */
46
47const RRegUniverse* getRRegUniverse_AMD64 ( void )
48{
49   /* The real-register universe is a big constant, so we just want to
50      initialise it once. */
51   static RRegUniverse rRegUniverse_AMD64;
52   static Bool         rRegUniverse_AMD64_initted = False;
53
54   /* Handy shorthand, nothing more */
55   RRegUniverse* ru = &rRegUniverse_AMD64;
56
57   /* This isn't thread-safe.  Sigh. */
58   if (LIKELY(rRegUniverse_AMD64_initted))
59      return ru;
60
61   RRegUniverse__init(ru);
62
63   /* Add the registers.  The initial segment of this array must be
64      those available for allocation by reg-alloc, and those that
65      follow are not available for allocation. */
66   ru->regs[ru->size++] = hregAMD64_RSI();
67   ru->regs[ru->size++] = hregAMD64_RDI();
68   ru->regs[ru->size++] = hregAMD64_R8();
69   ru->regs[ru->size++] = hregAMD64_R9();
70   ru->regs[ru->size++] = hregAMD64_R12();
71   ru->regs[ru->size++] = hregAMD64_R13();
72   ru->regs[ru->size++] = hregAMD64_R14();
73   ru->regs[ru->size++] = hregAMD64_R15();
74   ru->regs[ru->size++] = hregAMD64_RBX();
75   ru->regs[ru->size++] = hregAMD64_XMM3();
76   ru->regs[ru->size++] = hregAMD64_XMM4();
77   ru->regs[ru->size++] = hregAMD64_XMM5();
78   ru->regs[ru->size++] = hregAMD64_XMM6();
79   ru->regs[ru->size++] = hregAMD64_XMM7();
80   ru->regs[ru->size++] = hregAMD64_XMM8();
81   ru->regs[ru->size++] = hregAMD64_XMM9();
82   ru->regs[ru->size++] = hregAMD64_XMM10();
83   ru->regs[ru->size++] = hregAMD64_XMM11();
84   ru->regs[ru->size++] = hregAMD64_XMM12();
85   ru->regs[ru->size++] = hregAMD64_R10();
86   ru->allocable = ru->size;
87   /* And other regs, not available to the allocator. */
88   ru->regs[ru->size++] = hregAMD64_RAX();
89   ru->regs[ru->size++] = hregAMD64_RCX();
90   ru->regs[ru->size++] = hregAMD64_RDX();
91   ru->regs[ru->size++] = hregAMD64_RSP();
92   ru->regs[ru->size++] = hregAMD64_RBP();
93   ru->regs[ru->size++] = hregAMD64_R11();
94   ru->regs[ru->size++] = hregAMD64_XMM0();
95   ru->regs[ru->size++] = hregAMD64_XMM1();
96
97   rRegUniverse_AMD64_initted = True;
98
99   RRegUniverse__check_is_sane(ru);
100   return ru;
101}
102
103
104void ppHRegAMD64 ( HReg reg )
105{
106   Int r;
107   static const HChar* ireg64_names[16]
108     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
109         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
110   /* Be generic for all virtual regs. */
111   if (hregIsVirtual(reg)) {
112      ppHReg(reg);
113      return;
114   }
115   /* But specific for real regs. */
116   switch (hregClass(reg)) {
117      case HRcInt64:
118         r = hregEncoding(reg);
119         vassert(r >= 0 && r < 16);
120         vex_printf("%s", ireg64_names[r]);
121         return;
122      case HRcVec128:
123         r = hregEncoding(reg);
124         vassert(r >= 0 && r < 16);
125         vex_printf("%%xmm%d", r);
126         return;
127      default:
128         vpanic("ppHRegAMD64");
129   }
130}
131
132static void ppHRegAMD64_lo32 ( HReg reg )
133{
134   Int r;
135   static const HChar* ireg32_names[16]
136     = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
137         "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
138   /* Be generic for all virtual regs. */
139   if (hregIsVirtual(reg)) {
140      ppHReg(reg);
141      vex_printf("d");
142      return;
143   }
144   /* But specific for real regs. */
145   switch (hregClass(reg)) {
146      case HRcInt64:
147         r = hregEncoding(reg);
148         vassert(r >= 0 && r < 16);
149         vex_printf("%s", ireg32_names[r]);
150         return;
151      default:
152         vpanic("ppHRegAMD64_lo32: invalid regclass");
153   }
154}
155
156
157/* --------- Condition codes, Intel encoding. --------- */
158
159const HChar* showAMD64CondCode ( AMD64CondCode cond )
160{
161   switch (cond) {
162      case Acc_O:      return "o";
163      case Acc_NO:     return "no";
164      case Acc_B:      return "b";
165      case Acc_NB:     return "nb";
166      case Acc_Z:      return "z";
167      case Acc_NZ:     return "nz";
168      case Acc_BE:     return "be";
169      case Acc_NBE:    return "nbe";
170      case Acc_S:      return "s";
171      case Acc_NS:     return "ns";
172      case Acc_P:      return "p";
173      case Acc_NP:     return "np";
174      case Acc_L:      return "l";
175      case Acc_NL:     return "nl";
176      case Acc_LE:     return "le";
177      case Acc_NLE:    return "nle";
178      case Acc_ALWAYS: return "ALWAYS";
179      default: vpanic("ppAMD64CondCode");
180   }
181}
182
183
184/* --------- AMD64AMode: memory address expressions. --------- */
185
186AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
187   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
188   am->tag        = Aam_IR;
189   am->Aam.IR.imm = imm32;
190   am->Aam.IR.reg = reg;
191   return am;
192}
193AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
194   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
195   am->tag = Aam_IRRS;
196   am->Aam.IRRS.imm   = imm32;
197   am->Aam.IRRS.base  = base;
198   am->Aam.IRRS.index = indEx;
199   am->Aam.IRRS.shift = shift;
200   vassert(shift >= 0 && shift <= 3);
201   return am;
202}
203
204void ppAMD64AMode ( AMD64AMode* am ) {
205   switch (am->tag) {
206      case Aam_IR:
207         if (am->Aam.IR.imm == 0)
208            vex_printf("(");
209         else
210            vex_printf("0x%x(", am->Aam.IR.imm);
211         ppHRegAMD64(am->Aam.IR.reg);
212         vex_printf(")");
213         return;
214      case Aam_IRRS:
215         vex_printf("0x%x(", am->Aam.IRRS.imm);
216         ppHRegAMD64(am->Aam.IRRS.base);
217         vex_printf(",");
218         ppHRegAMD64(am->Aam.IRRS.index);
219         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
220         return;
221      default:
222         vpanic("ppAMD64AMode");
223   }
224}
225
226static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
227   switch (am->tag) {
228      case Aam_IR:
229         addHRegUse(u, HRmRead, am->Aam.IR.reg);
230         return;
231      case Aam_IRRS:
232         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
233         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
234         return;
235      default:
236         vpanic("addRegUsage_AMD64AMode");
237   }
238}
239
240static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
241   switch (am->tag) {
242      case Aam_IR:
243         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
244         return;
245      case Aam_IRRS:
246         am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
247         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
248         return;
249      default:
250         vpanic("mapRegs_AMD64AMode");
251   }
252}
253
254/* --------- Operand, which can be reg, immediate or memory. --------- */
255
256AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
257   AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
258   op->tag            = Armi_Imm;
259   op->Armi.Imm.imm32 = imm32;
260   return op;
261}
262AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
263   AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
264   op->tag          = Armi_Reg;
265   op->Armi.Reg.reg = reg;
266   return op;
267}
268AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
269   AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
270   op->tag         = Armi_Mem;
271   op->Armi.Mem.am = am;
272   return op;
273}
274
275static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
276   switch (op->tag) {
277      case Armi_Imm:
278         vex_printf("$0x%x", op->Armi.Imm.imm32);
279         return;
280      case Armi_Reg:
281         if (lo32)
282            ppHRegAMD64_lo32(op->Armi.Reg.reg);
283         else
284            ppHRegAMD64(op->Armi.Reg.reg);
285         return;
286      case Armi_Mem:
287         ppAMD64AMode(op->Armi.Mem.am);
288         return;
289     default:
290         vpanic("ppAMD64RMI");
291   }
292}
293void ppAMD64RMI ( AMD64RMI* op ) {
294   ppAMD64RMI_wrk(op, False/*!lo32*/);
295}
296void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
297   ppAMD64RMI_wrk(op, True/*lo32*/);
298}
299
300/* An AMD64RMI can only be used in a "read" context (what would it mean
301   to write or modify a literal?) and so we enumerate its registers
302   accordingly. */
303static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
304   switch (op->tag) {
305      case Armi_Imm:
306         return;
307      case Armi_Reg:
308         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
309         return;
310      case Armi_Mem:
311         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
312         return;
313      default:
314         vpanic("addRegUsage_AMD64RMI");
315   }
316}
317
318static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
319   switch (op->tag) {
320      case Armi_Imm:
321         return;
322      case Armi_Reg:
323         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
324         return;
325      case Armi_Mem:
326         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
327         return;
328      default:
329         vpanic("mapRegs_AMD64RMI");
330   }
331}
332
333
334/* --------- Operand, which can be reg or immediate only. --------- */
335
336AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
337   AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
338   op->tag           = Ari_Imm;
339   op->Ari.Imm.imm32 = imm32;
340   return op;
341}
342AMD64RI* AMD64RI_Reg ( HReg reg ) {
343   AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
344   op->tag         = Ari_Reg;
345   op->Ari.Reg.reg = reg;
346   return op;
347}
348
349void ppAMD64RI ( AMD64RI* op ) {
350   switch (op->tag) {
351      case Ari_Imm:
352         vex_printf("$0x%x", op->Ari.Imm.imm32);
353         return;
354      case Ari_Reg:
355         ppHRegAMD64(op->Ari.Reg.reg);
356         return;
357     default:
358         vpanic("ppAMD64RI");
359   }
360}
361
362/* An AMD64RI can only be used in a "read" context (what would it mean
363   to write or modify a literal?) and so we enumerate its registers
364   accordingly. */
365static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
366   switch (op->tag) {
367      case Ari_Imm:
368         return;
369      case Ari_Reg:
370         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
371         return;
372      default:
373         vpanic("addRegUsage_AMD64RI");
374   }
375}
376
377static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
378   switch (op->tag) {
379      case Ari_Imm:
380         return;
381      case Ari_Reg:
382         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
383         return;
384      default:
385         vpanic("mapRegs_AMD64RI");
386   }
387}
388
389
390/* --------- Operand, which can be reg or memory only. --------- */
391
392AMD64RM* AMD64RM_Reg ( HReg reg ) {
393   AMD64RM* op       = LibVEX_Alloc_inline(sizeof(AMD64RM));
394   op->tag         = Arm_Reg;
395   op->Arm.Reg.reg = reg;
396   return op;
397}
398AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
399   AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
400   op->tag        = Arm_Mem;
401   op->Arm.Mem.am = am;
402   return op;
403}
404
405void ppAMD64RM ( AMD64RM* op ) {
406   switch (op->tag) {
407      case Arm_Mem:
408         ppAMD64AMode(op->Arm.Mem.am);
409         return;
410      case Arm_Reg:
411         ppHRegAMD64(op->Arm.Reg.reg);
412         return;
413     default:
414         vpanic("ppAMD64RM");
415   }
416}
417
418/* Because an AMD64RM can be both a source or destination operand, we
419   have to supply a mode -- pertaining to the operand as a whole --
420   indicating how it's being used. */
421static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
422   switch (op->tag) {
423      case Arm_Mem:
424         /* Memory is read, written or modified.  So we just want to
425            know the regs read by the amode. */
426         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
427         return;
428      case Arm_Reg:
429         /* reg is read, written or modified.  Add it in the
430            appropriate way. */
431         addHRegUse(u, mode, op->Arm.Reg.reg);
432         return;
433     default:
434         vpanic("addRegUsage_AMD64RM");
435   }
436}
437
438static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
439{
440   switch (op->tag) {
441      case Arm_Mem:
442         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
443         return;
444      case Arm_Reg:
445         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
446         return;
447     default:
448         vpanic("mapRegs_AMD64RM");
449   }
450}
451
452
453/* --------- Instructions. --------- */
454
455static const HChar* showAMD64ScalarSz ( Int sz ) {
456   switch (sz) {
457      case 2: return "w";
458      case 4: return "l";
459      case 8: return "q";
460      default: vpanic("showAMD64ScalarSz");
461   }
462}
463
464const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
465   switch (op) {
466      case Aun_NOT: return "not";
467      case Aun_NEG: return "neg";
468      default: vpanic("showAMD64UnaryOp");
469   }
470}
471
472const HChar* showAMD64AluOp ( AMD64AluOp op ) {
473   switch (op) {
474      case Aalu_MOV:  return "mov";
475      case Aalu_CMP:  return "cmp";
476      case Aalu_ADD:  return "add";
477      case Aalu_SUB:  return "sub";
478      case Aalu_ADC:  return "adc";
479      case Aalu_SBB:  return "sbb";
480      case Aalu_AND:  return "and";
481      case Aalu_OR:   return "or";
482      case Aalu_XOR:  return "xor";
483      case Aalu_MUL:  return "imul";
484      default: vpanic("showAMD64AluOp");
485   }
486}
487
488const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
489   switch (op) {
490      case Ash_SHL: return "shl";
491      case Ash_SHR: return "shr";
492      case Ash_SAR: return "sar";
493      default: vpanic("showAMD64ShiftOp");
494   }
495}
496
497const HChar* showA87FpOp ( A87FpOp op ) {
498   switch (op) {
499      case Afp_SCALE:  return "scale";
500      case Afp_ATAN:   return "atan";
501      case Afp_YL2X:   return "yl2x";
502      case Afp_YL2XP1: return "yl2xp1";
503      case Afp_PREM:   return "prem";
504      case Afp_PREM1:  return "prem1";
505      case Afp_SQRT:   return "sqrt";
506      case Afp_SIN:    return "sin";
507      case Afp_COS:    return "cos";
508      case Afp_TAN:    return "tan";
509      case Afp_ROUND:  return "round";
510      case Afp_2XM1:   return "2xm1";
511      default: vpanic("showA87FpOp");
512   }
513}
514
515const HChar* showAMD64SseOp ( AMD64SseOp op ) {
516   switch (op) {
517      case Asse_MOV:      return "movups";
518      case Asse_ADDF:     return "add";
519      case Asse_SUBF:     return "sub";
520      case Asse_MULF:     return "mul";
521      case Asse_DIVF:     return "div";
522      case Asse_MAXF:     return "max";
523      case Asse_MINF:     return "min";
524      case Asse_CMPEQF:   return "cmpFeq";
525      case Asse_CMPLTF:   return "cmpFlt";
526      case Asse_CMPLEF:   return "cmpFle";
527      case Asse_CMPUNF:   return "cmpFun";
528      case Asse_RCPF:     return "rcp";
529      case Asse_RSQRTF:   return "rsqrt";
530      case Asse_SQRTF:    return "sqrt";
531      case Asse_AND:      return "and";
532      case Asse_OR:       return "or";
533      case Asse_XOR:      return "xor";
534      case Asse_ANDN:     return "andn";
535      case Asse_ADD8:     return "paddb";
536      case Asse_ADD16:    return "paddw";
537      case Asse_ADD32:    return "paddd";
538      case Asse_ADD64:    return "paddq";
539      case Asse_QADD8U:   return "paddusb";
540      case Asse_QADD16U:  return "paddusw";
541      case Asse_QADD8S:   return "paddsb";
542      case Asse_QADD16S:  return "paddsw";
543      case Asse_SUB8:     return "psubb";
544      case Asse_SUB16:    return "psubw";
545      case Asse_SUB32:    return "psubd";
546      case Asse_SUB64:    return "psubq";
547      case Asse_QSUB8U:   return "psubusb";
548      case Asse_QSUB16U:  return "psubusw";
549      case Asse_QSUB8S:   return "psubsb";
550      case Asse_QSUB16S:  return "psubsw";
551      case Asse_MUL16:    return "pmullw";
552      case Asse_MULHI16U: return "pmulhuw";
553      case Asse_MULHI16S: return "pmulhw";
554      case Asse_AVG8U:    return "pavgb";
555      case Asse_AVG16U:   return "pavgw";
556      case Asse_MAX16S:   return "pmaxw";
557      case Asse_MAX8U:    return "pmaxub";
558      case Asse_MIN16S:   return "pminw";
559      case Asse_MIN8U:    return "pminub";
560      case Asse_CMPEQ8:   return "pcmpeqb";
561      case Asse_CMPEQ16:  return "pcmpeqw";
562      case Asse_CMPEQ32:  return "pcmpeqd";
563      case Asse_CMPGT8S:  return "pcmpgtb";
564      case Asse_CMPGT16S: return "pcmpgtw";
565      case Asse_CMPGT32S: return "pcmpgtd";
566      case Asse_SHL16:    return "psllw";
567      case Asse_SHL32:    return "pslld";
568      case Asse_SHL64:    return "psllq";
569      case Asse_SHR16:    return "psrlw";
570      case Asse_SHR32:    return "psrld";
571      case Asse_SHR64:    return "psrlq";
572      case Asse_SAR16:    return "psraw";
573      case Asse_SAR32:    return "psrad";
574      case Asse_PACKSSD:  return "packssdw";
575      case Asse_PACKSSW:  return "packsswb";
576      case Asse_PACKUSW:  return "packuswb";
577      case Asse_UNPCKHB:  return "punpckhb";
578      case Asse_UNPCKHW:  return "punpckhw";
579      case Asse_UNPCKHD:  return "punpckhd";
580      case Asse_UNPCKHQ:  return "punpckhq";
581      case Asse_UNPCKLB:  return "punpcklb";
582      case Asse_UNPCKLW:  return "punpcklw";
583      case Asse_UNPCKLD:  return "punpckld";
584      case Asse_UNPCKLQ:  return "punpcklq";
585      default: vpanic("showAMD64SseOp");
586   }
587}
588
589AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
590   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
591   i->tag             = Ain_Imm64;
592   i->Ain.Imm64.imm64 = imm64;
593   i->Ain.Imm64.dst   = dst;
594   return i;
595}
596AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
597   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
598   i->tag            = Ain_Alu64R;
599   i->Ain.Alu64R.op  = op;
600   i->Ain.Alu64R.src = src;
601   i->Ain.Alu64R.dst = dst;
602   return i;
603}
604AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
605   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
606   i->tag            = Ain_Alu64M;
607   i->Ain.Alu64M.op  = op;
608   i->Ain.Alu64M.src = src;
609   i->Ain.Alu64M.dst = dst;
610   vassert(op != Aalu_MUL);
611   return i;
612}
613AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
614   AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
615   i->tag          = Ain_Sh64;
616   i->Ain.Sh64.op  = op;
617   i->Ain.Sh64.src = src;
618   i->Ain.Sh64.dst = dst;
619   return i;
620}
621AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
622   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
623   i->tag              = Ain_Test64;
624   i->Ain.Test64.imm32 = imm32;
625   i->Ain.Test64.dst   = dst;
626   return i;
627}
628AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
629   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
630   i->tag             = Ain_Unary64;
631   i->Ain.Unary64.op  = op;
632   i->Ain.Unary64.dst = dst;
633   return i;
634}
635AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
636   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
637   i->tag             = Ain_Lea64;
638   i->Ain.Lea64.am    = am;
639   i->Ain.Lea64.dst   = dst;
640   return i;
641}
642AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
643   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
644   i->tag            = Ain_Alu32R;
645   i->Ain.Alu32R.op  = op;
646   i->Ain.Alu32R.src = src;
647   i->Ain.Alu32R.dst = dst;
648   switch (op) {
649      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
650      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
651      default: vassert(0);
652   }
653   return i;
654}
655AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
656   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
657   i->tag            = Ain_MulL;
658   i->Ain.MulL.syned = syned;
659   i->Ain.MulL.src   = src;
660   return i;
661}
662AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
663   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
664   i->tag            = Ain_Div;
665   i->Ain.Div.syned  = syned;
666   i->Ain.Div.sz     = sz;
667   i->Ain.Div.src    = src;
668   vassert(sz == 4 || sz == 8);
669   return i;
670}
671AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
672   AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
673   i->tag          = Ain_Push;
674   i->Ain.Push.src = src;
675   return i;
676}
677AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
678                              RetLoc rloc ) {
679   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
680   i->tag               = Ain_Call;
681   i->Ain.Call.cond     = cond;
682   i->Ain.Call.target   = target;
683   i->Ain.Call.regparms = regparms;
684   i->Ain.Call.rloc     = rloc;
685   vassert(regparms >= 0 && regparms <= 6);
686   vassert(is_sane_RetLoc(rloc));
687   return i;
688}
689
690AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
691                                 AMD64CondCode cond, Bool toFastEP ) {
692   AMD64Instr* i           = LibVEX_Alloc_inline(sizeof(AMD64Instr));
693   i->tag                  = Ain_XDirect;
694   i->Ain.XDirect.dstGA    = dstGA;
695   i->Ain.XDirect.amRIP    = amRIP;
696   i->Ain.XDirect.cond     = cond;
697   i->Ain.XDirect.toFastEP = toFastEP;
698   return i;
699}
700AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
701                                AMD64CondCode cond ) {
702   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
703   i->tag              = Ain_XIndir;
704   i->Ain.XIndir.dstGA = dstGA;
705   i->Ain.XIndir.amRIP = amRIP;
706   i->Ain.XIndir.cond  = cond;
707   return i;
708}
709AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
710                                   AMD64CondCode cond, IRJumpKind jk ) {
711   AMD64Instr* i          = LibVEX_Alloc_inline(sizeof(AMD64Instr));
712   i->tag                 = Ain_XAssisted;
713   i->Ain.XAssisted.dstGA = dstGA;
714   i->Ain.XAssisted.amRIP = amRIP;
715   i->Ain.XAssisted.cond  = cond;
716   i->Ain.XAssisted.jk    = jk;
717   return i;
718}
719
720AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
721   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
722   i->tag             = Ain_CMov64;
723   i->Ain.CMov64.cond = cond;
724   i->Ain.CMov64.src  = src;
725   i->Ain.CMov64.dst  = dst;
726   vassert(cond != Acc_ALWAYS);
727   return i;
728}
729AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
730                               AMD64AMode* addr, HReg dst ) {
731   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
732   i->tag            = Ain_CLoad;
733   i->Ain.CLoad.cond = cond;
734   i->Ain.CLoad.szB  = szB;
735   i->Ain.CLoad.addr = addr;
736   i->Ain.CLoad.dst  = dst;
737   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
738   return i;
739}
740AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
741                                HReg src, AMD64AMode* addr ) {
742   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
743   i->tag             = Ain_CStore;
744   i->Ain.CStore.cond = cond;
745   i->Ain.CStore.szB  = szB;
746   i->Ain.CStore.src  = src;
747   i->Ain.CStore.addr = addr;
748   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
749   return i;
750}
751AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
752   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
753   i->tag              = Ain_MovxLQ;
754   i->Ain.MovxLQ.syned = syned;
755   i->Ain.MovxLQ.src   = src;
756   i->Ain.MovxLQ.dst   = dst;
757   return i;
758}
759AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
760                                AMD64AMode* src, HReg dst ) {
761   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
762   i->tag                = Ain_LoadEX;
763   i->Ain.LoadEX.szSmall = szSmall;
764   i->Ain.LoadEX.syned   = syned;
765   i->Ain.LoadEX.src     = src;
766   i->Ain.LoadEX.dst     = dst;
767   vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
768   return i;
769}
770AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
771   AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
772   i->tag           = Ain_Store;
773   i->Ain.Store.sz  = sz;
774   i->Ain.Store.src = src;
775   i->Ain.Store.dst = dst;
776   vassert(sz == 1 || sz == 2 || sz == 4);
777   return i;
778}
779AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
780   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
781   i->tag            = Ain_Set64;
782   i->Ain.Set64.cond = cond;
783   i->Ain.Set64.dst  = dst;
784   return i;
785}
786AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
787   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
788   i->tag               = Ain_Bsfr64;
789   i->Ain.Bsfr64.isFwds = isFwds;
790   i->Ain.Bsfr64.src    = src;
791   i->Ain.Bsfr64.dst    = dst;
792   return i;
793}
794AMD64Instr* AMD64Instr_MFence ( void ) {
795   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
796   i->tag        = Ain_MFence;
797   return i;
798}
799AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
800   AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
801   i->tag           = Ain_ACAS;
802   i->Ain.ACAS.addr = addr;
803   i->Ain.ACAS.sz   = sz;
804   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
805   return i;
806}
807AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
808   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
809   i->tag            = Ain_DACAS;
810   i->Ain.DACAS.addr = addr;
811   i->Ain.DACAS.sz   = sz;
812   vassert(sz == 8 || sz == 4);
813   return i;
814}
815
816AMD64Instr* AMD64Instr_A87Free ( Int nregs )
817{
818   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
819   i->tag               = Ain_A87Free;
820   i->Ain.A87Free.nregs = nregs;
821   vassert(nregs >= 1 && nregs <= 7);
822   return i;
823}
824AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
825{
826   AMD64Instr* i            = LibVEX_Alloc_inline(sizeof(AMD64Instr));
827   i->tag                   = Ain_A87PushPop;
828   i->Ain.A87PushPop.addr   = addr;
829   i->Ain.A87PushPop.isPush = isPush;
830   i->Ain.A87PushPop.szB    = szB;
831   vassert(szB == 8 || szB == 4);
832   return i;
833}
834AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
835{
836   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
837   i->tag            = Ain_A87FpOp;
838   i->Ain.A87FpOp.op = op;
839   return i;
840}
841AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
842{
843   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
844   i->tag              = Ain_A87LdCW;
845   i->Ain.A87LdCW.addr = addr;
846   return i;
847}
848AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
849{
850   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
851   i->tag              = Ain_A87StSW;
852   i->Ain.A87StSW.addr = addr;
853   return i;
854}
855AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
856   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
857   i->tag                = Ain_LdMXCSR;
858   i->Ain.LdMXCSR.addr   = addr;
859   return i;
860}
861AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
862   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
863   i->tag                = Ain_SseUComIS;
864   i->Ain.SseUComIS.sz   = toUChar(sz);
865   i->Ain.SseUComIS.srcL = srcL;
866   i->Ain.SseUComIS.srcR = srcR;
867   i->Ain.SseUComIS.dst  = dst;
868   vassert(sz == 4 || sz == 8);
869   return i;
870}
871AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
872   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
873   i->tag              = Ain_SseSI2SF;
874   i->Ain.SseSI2SF.szS = toUChar(szS);
875   i->Ain.SseSI2SF.szD = toUChar(szD);
876   i->Ain.SseSI2SF.src = src;
877   i->Ain.SseSI2SF.dst = dst;
878   vassert(szS == 4 || szS == 8);
879   vassert(szD == 4 || szD == 8);
880   return i;
881}
882AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
883   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
884   i->tag              = Ain_SseSF2SI;
885   i->Ain.SseSF2SI.szS = toUChar(szS);
886   i->Ain.SseSF2SI.szD = toUChar(szD);
887   i->Ain.SseSF2SI.src = src;
888   i->Ain.SseSF2SI.dst = dst;
889   vassert(szS == 4 || szS == 8);
890   vassert(szD == 4 || szD == 8);
891   return i;
892}
893AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
894{
895   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
896   i->tag                = Ain_SseSDSS;
897   i->Ain.SseSDSS.from64 = from64;
898   i->Ain.SseSDSS.src    = src;
899   i->Ain.SseSDSS.dst    = dst;
900   return i;
901}
902AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
903                                 HReg reg, AMD64AMode* addr ) {
904   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
905   i->tag                = Ain_SseLdSt;
906   i->Ain.SseLdSt.isLoad = isLoad;
907   i->Ain.SseLdSt.sz     = toUChar(sz);
908   i->Ain.SseLdSt.reg    = reg;
909   i->Ain.SseLdSt.addr   = addr;
910   vassert(sz == 4 || sz == 8 || sz == 16);
911   return i;
912}
913AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
914{
915   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
916   i->tag                = Ain_SseLdzLO;
917   i->Ain.SseLdzLO.sz    = sz;
918   i->Ain.SseLdzLO.reg   = reg;
919   i->Ain.SseLdzLO.addr  = addr;
920   vassert(sz == 4 || sz == 8);
921   return i;
922}
923AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
924   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
925   i->tag              = Ain_Sse32Fx4;
926   i->Ain.Sse32Fx4.op  = op;
927   i->Ain.Sse32Fx4.src = src;
928   i->Ain.Sse32Fx4.dst = dst;
929   vassert(op != Asse_MOV);
930   return i;
931}
932AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
933   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
934   i->tag              = Ain_Sse32FLo;
935   i->Ain.Sse32FLo.op  = op;
936   i->Ain.Sse32FLo.src = src;
937   i->Ain.Sse32FLo.dst = dst;
938   vassert(op != Asse_MOV);
939   return i;
940}
941AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
942   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
943   i->tag              = Ain_Sse64Fx2;
944   i->Ain.Sse64Fx2.op  = op;
945   i->Ain.Sse64Fx2.src = src;
946   i->Ain.Sse64Fx2.dst = dst;
947   vassert(op != Asse_MOV);
948   return i;
949}
950AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
951   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
952   i->tag              = Ain_Sse64FLo;
953   i->Ain.Sse64FLo.op  = op;
954   i->Ain.Sse64FLo.src = src;
955   i->Ain.Sse64FLo.dst = dst;
956   vassert(op != Asse_MOV);
957   return i;
958}
959AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
960   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
961   i->tag             = Ain_SseReRg;
962   i->Ain.SseReRg.op  = op;
963   i->Ain.SseReRg.src = re;
964   i->Ain.SseReRg.dst = rg;
965   return i;
966}
967AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
968   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
969   i->tag              = Ain_SseCMov;
970   i->Ain.SseCMov.cond = cond;
971   i->Ain.SseCMov.src  = src;
972   i->Ain.SseCMov.dst  = dst;
973   vassert(cond != Acc_ALWAYS);
974   return i;
975}
976AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
977   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
978   i->tag               = Ain_SseShuf;
979   i->Ain.SseShuf.order = order;
980   i->Ain.SseShuf.src   = src;
981   i->Ain.SseShuf.dst   = dst;
982   vassert(order >= 0 && order <= 0xFF);
983   return i;
984}
985//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
986//uu                                  HReg reg, AMD64AMode* addr ) {
987//uu    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
988//uu    i->tag                = Ain_AvxLdSt;
989//uu    i->Ain.AvxLdSt.isLoad = isLoad;
990//uu    i->Ain.AvxLdSt.reg    = reg;
991//uu    i->Ain.AvxLdSt.addr   = addr;
992//uu    return i;
993//uu }
994//uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
995//uu    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
996//uu    i->tag             = Ain_AvxReRg;
997//uu    i->Ain.AvxReRg.op  = op;
998//uu    i->Ain.AvxReRg.src = re;
999//uu    i->Ain.AvxReRg.dst = rg;
1000//uu    return i;
1001//uu }
1002AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
1003                                 AMD64AMode* amFailAddr ) {
1004   AMD64Instr* i             = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1005   i->tag                    = Ain_EvCheck;
1006   i->Ain.EvCheck.amCounter  = amCounter;
1007   i->Ain.EvCheck.amFailAddr = amFailAddr;
1008   return i;
1009}
1010AMD64Instr* AMD64Instr_ProfInc ( void ) {
1011   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1012   i->tag        = Ain_ProfInc;
1013   return i;
1014}
1015
1016void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
1017{
1018   vassert(mode64 == True);
1019   switch (i->tag) {
1020      case Ain_Imm64:
1021         vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
1022         ppHRegAMD64(i->Ain.Imm64.dst);
1023         return;
1024      case Ain_Alu64R:
1025         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
1026         ppAMD64RMI(i->Ain.Alu64R.src);
1027         vex_printf(",");
1028         ppHRegAMD64(i->Ain.Alu64R.dst);
1029         return;
1030      case Ain_Alu64M:
1031         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
1032         ppAMD64RI(i->Ain.Alu64M.src);
1033         vex_printf(",");
1034         ppAMD64AMode(i->Ain.Alu64M.dst);
1035         return;
1036      case Ain_Sh64:
1037         vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
1038         if (i->Ain.Sh64.src == 0)
1039            vex_printf("%%cl,");
1040         else
1041            vex_printf("$%d,", (Int)i->Ain.Sh64.src);
1042         ppHRegAMD64(i->Ain.Sh64.dst);
1043         return;
1044      case Ain_Test64:
1045         vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
1046         ppHRegAMD64(i->Ain.Test64.dst);
1047         return;
1048      case Ain_Unary64:
1049         vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
1050         ppHRegAMD64(i->Ain.Unary64.dst);
1051         return;
1052      case Ain_Lea64:
1053         vex_printf("leaq ");
1054         ppAMD64AMode(i->Ain.Lea64.am);
1055         vex_printf(",");
1056         ppHRegAMD64(i->Ain.Lea64.dst);
1057         return;
1058      case Ain_Alu32R:
1059         vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
1060         ppAMD64RMI_lo32(i->Ain.Alu32R.src);
1061         vex_printf(",");
1062         ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
1063         return;
1064      case Ain_MulL:
1065         vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
1066         ppAMD64RM(i->Ain.MulL.src);
1067         return;
1068      case Ain_Div:
1069         vex_printf("%cdiv%s ",
1070                    i->Ain.Div.syned ? 's' : 'u',
1071                    showAMD64ScalarSz(i->Ain.Div.sz));
1072         ppAMD64RM(i->Ain.Div.src);
1073         return;
1074      case Ain_Push:
1075         vex_printf("pushq ");
1076         ppAMD64RMI(i->Ain.Push.src);
1077         return;
1078      case Ain_Call:
1079         vex_printf("call%s[%d,",
1080                    i->Ain.Call.cond==Acc_ALWAYS
1081                       ? "" : showAMD64CondCode(i->Ain.Call.cond),
1082                    i->Ain.Call.regparms );
1083         ppRetLoc(i->Ain.Call.rloc);
1084         vex_printf("] 0x%llx", i->Ain.Call.target);
1085         break;
1086
1087      case Ain_XDirect:
1088         vex_printf("(xDirect) ");
1089         vex_printf("if (%%rflags.%s) { ",
1090                    showAMD64CondCode(i->Ain.XDirect.cond));
1091         vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
1092         vex_printf("movq %%r11,");
1093         ppAMD64AMode(i->Ain.XDirect.amRIP);
1094         vex_printf("; ");
1095         vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
1096                    i->Ain.XDirect.toFastEP ? "fast" : "slow");
1097         return;
1098      case Ain_XIndir:
1099         vex_printf("(xIndir) ");
1100         vex_printf("if (%%rflags.%s) { ",
1101                    showAMD64CondCode(i->Ain.XIndir.cond));
1102         vex_printf("movq ");
1103         ppHRegAMD64(i->Ain.XIndir.dstGA);
1104         vex_printf(",");
1105         ppAMD64AMode(i->Ain.XIndir.amRIP);
1106         vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
1107         return;
1108      case Ain_XAssisted:
1109         vex_printf("(xAssisted) ");
1110         vex_printf("if (%%rflags.%s) { ",
1111                    showAMD64CondCode(i->Ain.XAssisted.cond));
1112         vex_printf("movq ");
1113         ppHRegAMD64(i->Ain.XAssisted.dstGA);
1114         vex_printf(",");
1115         ppAMD64AMode(i->Ain.XAssisted.amRIP);
1116         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
1117                    (Int)i->Ain.XAssisted.jk);
1118         vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
1119         return;
1120
1121      case Ain_CMov64:
1122         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
1123         ppHRegAMD64(i->Ain.CMov64.src);
1124         vex_printf(",");
1125         ppHRegAMD64(i->Ain.CMov64.dst);
1126         return;
1127      case Ain_CLoad:
1128         vex_printf("if (%%rflags.%s) { ",
1129                    showAMD64CondCode(i->Ain.CLoad.cond));
1130         vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
1131         ppAMD64AMode(i->Ain.CLoad.addr);
1132         vex_printf(", ");
1133         (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1134            (i->Ain.CLoad.dst);
1135         vex_printf(" }");
1136         return;
1137      case Ain_CStore:
1138         vex_printf("if (%%rflags.%s) { ",
1139                    showAMD64CondCode(i->Ain.CStore.cond));
1140         vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
1141         (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1142            (i->Ain.CStore.src);
1143         vex_printf(", ");
1144         ppAMD64AMode(i->Ain.CStore.addr);
1145         vex_printf(" }");
1146         return;
1147
1148      case Ain_MovxLQ:
1149         vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
1150         ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
1151         vex_printf(",");
1152         ppHRegAMD64(i->Ain.MovxLQ.dst);
1153         return;
1154      case Ain_LoadEX:
1155         if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
1156            vex_printf("movl ");
1157            ppAMD64AMode(i->Ain.LoadEX.src);
1158            vex_printf(",");
1159            ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
1160         } else {
1161            vex_printf("mov%c%cq ",
1162                       i->Ain.LoadEX.syned ? 's' : 'z',
1163                       i->Ain.LoadEX.szSmall==1
1164                          ? 'b'
1165                          : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
1166            ppAMD64AMode(i->Ain.LoadEX.src);
1167            vex_printf(",");
1168            ppHRegAMD64(i->Ain.LoadEX.dst);
1169         }
1170         return;
1171      case Ain_Store:
1172         vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
1173                              : (i->Ain.Store.sz==2 ? 'w' : 'l'));
1174         ppHRegAMD64(i->Ain.Store.src);
1175         vex_printf(",");
1176         ppAMD64AMode(i->Ain.Store.dst);
1177         return;
1178      case Ain_Set64:
1179         vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
1180         ppHRegAMD64(i->Ain.Set64.dst);
1181         return;
1182      case Ain_Bsfr64:
1183         vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
1184         ppHRegAMD64(i->Ain.Bsfr64.src);
1185         vex_printf(",");
1186         ppHRegAMD64(i->Ain.Bsfr64.dst);
1187         return;
1188      case Ain_MFence:
1189         vex_printf("mfence" );
1190         return;
1191      case Ain_ACAS:
1192         vex_printf("lock cmpxchg%c ",
1193                     i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
1194                     : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
1195         vex_printf("{%%rax->%%rbx},");
1196         ppAMD64AMode(i->Ain.ACAS.addr);
1197         return;
1198      case Ain_DACAS:
1199         vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
1200                    (Int)(2 * i->Ain.DACAS.sz));
1201         ppAMD64AMode(i->Ain.DACAS.addr);
1202         return;
1203      case Ain_A87Free:
1204         vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
1205         break;
1206      case Ain_A87PushPop:
1207         vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
1208                    i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
1209         ppAMD64AMode(i->Ain.A87PushPop.addr);
1210         break;
1211      case Ain_A87FpOp:
1212         vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
1213         break;
1214      case Ain_A87LdCW:
1215         vex_printf("fldcw ");
1216         ppAMD64AMode(i->Ain.A87LdCW.addr);
1217         break;
1218      case Ain_A87StSW:
1219         vex_printf("fstsw ");
1220         ppAMD64AMode(i->Ain.A87StSW.addr);
1221         break;
1222      case Ain_LdMXCSR:
1223         vex_printf("ldmxcsr ");
1224         ppAMD64AMode(i->Ain.LdMXCSR.addr);
1225         break;
1226      case Ain_SseUComIS:
1227         vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
1228         ppHRegAMD64(i->Ain.SseUComIS.srcL);
1229         vex_printf(",");
1230         ppHRegAMD64(i->Ain.SseUComIS.srcR);
1231         vex_printf(" ; pushfq ; popq ");
1232         ppHRegAMD64(i->Ain.SseUComIS.dst);
1233         break;
1234      case Ain_SseSI2SF:
1235         vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
1236         (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1237            (i->Ain.SseSI2SF.src);
1238         vex_printf(",");
1239         ppHRegAMD64(i->Ain.SseSI2SF.dst);
1240         break;
1241      case Ain_SseSF2SI:
1242         vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
1243         ppHRegAMD64(i->Ain.SseSF2SI.src);
1244         vex_printf(",");
1245         (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1246            (i->Ain.SseSF2SI.dst);
1247         break;
1248      case Ain_SseSDSS:
1249         vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
1250         ppHRegAMD64(i->Ain.SseSDSS.src);
1251         vex_printf(",");
1252         ppHRegAMD64(i->Ain.SseSDSS.dst);
1253         break;
1254      case Ain_SseLdSt:
1255         switch (i->Ain.SseLdSt.sz) {
1256            case 4:  vex_printf("movss "); break;
1257            case 8:  vex_printf("movsd "); break;
1258            case 16: vex_printf("movups "); break;
1259            default: vassert(0);
1260         }
1261         if (i->Ain.SseLdSt.isLoad) {
1262            ppAMD64AMode(i->Ain.SseLdSt.addr);
1263            vex_printf(",");
1264            ppHRegAMD64(i->Ain.SseLdSt.reg);
1265         } else {
1266            ppHRegAMD64(i->Ain.SseLdSt.reg);
1267            vex_printf(",");
1268            ppAMD64AMode(i->Ain.SseLdSt.addr);
1269         }
1270         return;
1271      case Ain_SseLdzLO:
1272         vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
1273         ppAMD64AMode(i->Ain.SseLdzLO.addr);
1274         vex_printf(",");
1275         ppHRegAMD64(i->Ain.SseLdzLO.reg);
1276         return;
1277      case Ain_Sse32Fx4:
1278         vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
1279         ppHRegAMD64(i->Ain.Sse32Fx4.src);
1280         vex_printf(",");
1281         ppHRegAMD64(i->Ain.Sse32Fx4.dst);
1282         return;
1283      case Ain_Sse32FLo:
1284         vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
1285         ppHRegAMD64(i->Ain.Sse32FLo.src);
1286         vex_printf(",");
1287         ppHRegAMD64(i->Ain.Sse32FLo.dst);
1288         return;
1289      case Ain_Sse64Fx2:
1290         vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
1291         ppHRegAMD64(i->Ain.Sse64Fx2.src);
1292         vex_printf(",");
1293         ppHRegAMD64(i->Ain.Sse64Fx2.dst);
1294         return;
1295      case Ain_Sse64FLo:
1296         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
1297         ppHRegAMD64(i->Ain.Sse64FLo.src);
1298         vex_printf(",");
1299         ppHRegAMD64(i->Ain.Sse64FLo.dst);
1300         return;
1301      case Ain_SseReRg:
1302         vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1303         ppHRegAMD64(i->Ain.SseReRg.src);
1304         vex_printf(",");
1305         ppHRegAMD64(i->Ain.SseReRg.dst);
1306         return;
1307      case Ain_SseCMov:
1308         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
1309         ppHRegAMD64(i->Ain.SseCMov.src);
1310         vex_printf(",");
1311         ppHRegAMD64(i->Ain.SseCMov.dst);
1312         return;
1313      case Ain_SseShuf:
1314         vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
1315         ppHRegAMD64(i->Ain.SseShuf.src);
1316         vex_printf(",");
1317         ppHRegAMD64(i->Ain.SseShuf.dst);
1318         return;
1319      //uu case Ain_AvxLdSt:
1320      //uu    vex_printf("vmovups ");
1321      //uu    if (i->Ain.AvxLdSt.isLoad) {
1322      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1323      //uu       vex_printf(",");
1324      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1325      //uu    } else {
1326      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
1327      //uu       vex_printf(",");
1328      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
1329      //uu    }
1330      //uu    return;
1331      //uu case Ain_AvxReRg:
1332      //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1333      //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
1334      //uu    vex_printf(",");
1335      //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
1336      //uu    return;
1337      case Ain_EvCheck:
1338         vex_printf("(evCheck) decl ");
1339         ppAMD64AMode(i->Ain.EvCheck.amCounter);
1340         vex_printf("; jns nofail; jmp *");
1341         ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
1342         vex_printf("; nofail:");
1343         return;
1344      case Ain_ProfInc:
1345         vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
1346         return;
1347      default:
1348         vpanic("ppAMD64Instr");
1349   }
1350}
1351
1352/* --------- Helpers for register allocation. --------- */
1353
1354void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
1355{
1356   Bool unary;
1357   vassert(mode64 == True);
1358   initHRegUsage(u);
1359   switch (i->tag) {
1360      case Ain_Imm64:
1361         addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
1362         return;
1363      case Ain_Alu64R:
1364         addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
1365         if (i->Ain.Alu64R.op == Aalu_MOV) {
1366            addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
1367            return;
1368         }
1369         if (i->Ain.Alu64R.op == Aalu_CMP) {
1370            addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
1371            return;
1372         }
1373         addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
1374         return;
1375      case Ain_Alu64M:
1376         addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
1377         addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
1378         return;
1379      case Ain_Sh64:
1380         addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
1381         if (i->Ain.Sh64.src == 0)
1382            addHRegUse(u, HRmRead, hregAMD64_RCX());
1383         return;
1384      case Ain_Test64:
1385         addHRegUse(u, HRmRead, i->Ain.Test64.dst);
1386         return;
1387      case Ain_Unary64:
1388         addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
1389         return;
1390      case Ain_Lea64:
1391         addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
1392         addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
1393         return;
1394      case Ain_Alu32R:
1395         vassert(i->Ain.Alu32R.op != Aalu_MOV);
1396         addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
1397         if (i->Ain.Alu32R.op == Aalu_CMP) {
1398            addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
1399            return;
1400         }
1401         addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
1402         return;
1403      case Ain_MulL:
1404         addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
1405         addHRegUse(u, HRmModify, hregAMD64_RAX());
1406         addHRegUse(u, HRmWrite, hregAMD64_RDX());
1407         return;
1408      case Ain_Div:
1409         addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
1410         addHRegUse(u, HRmModify, hregAMD64_RAX());
1411         addHRegUse(u, HRmModify, hregAMD64_RDX());
1412         return;
1413      case Ain_Push:
1414         addRegUsage_AMD64RMI(u, i->Ain.Push.src);
1415         addHRegUse(u, HRmModify, hregAMD64_RSP());
1416         return;
1417      case Ain_Call:
1418         /* This is a bit subtle. */
1419         /* First off, claim it trashes all the caller-saved regs
1420            which fall within the register allocator's jurisdiction.
1421            These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
1422            and all the xmm registers.
1423         */
1424         addHRegUse(u, HRmWrite, hregAMD64_RAX());
1425         addHRegUse(u, HRmWrite, hregAMD64_RCX());
1426         addHRegUse(u, HRmWrite, hregAMD64_RDX());
1427         addHRegUse(u, HRmWrite, hregAMD64_RSI());
1428         addHRegUse(u, HRmWrite, hregAMD64_RDI());
1429         addHRegUse(u, HRmWrite, hregAMD64_R8());
1430         addHRegUse(u, HRmWrite, hregAMD64_R9());
1431         addHRegUse(u, HRmWrite, hregAMD64_R10());
1432         addHRegUse(u, HRmWrite, hregAMD64_R11());
1433         addHRegUse(u, HRmWrite, hregAMD64_XMM0());
1434         addHRegUse(u, HRmWrite, hregAMD64_XMM1());
1435         addHRegUse(u, HRmWrite, hregAMD64_XMM3());
1436         addHRegUse(u, HRmWrite, hregAMD64_XMM4());
1437         addHRegUse(u, HRmWrite, hregAMD64_XMM5());
1438         addHRegUse(u, HRmWrite, hregAMD64_XMM6());
1439         addHRegUse(u, HRmWrite, hregAMD64_XMM7());
1440         addHRegUse(u, HRmWrite, hregAMD64_XMM8());
1441         addHRegUse(u, HRmWrite, hregAMD64_XMM9());
1442         addHRegUse(u, HRmWrite, hregAMD64_XMM10());
1443         addHRegUse(u, HRmWrite, hregAMD64_XMM11());
1444         addHRegUse(u, HRmWrite, hregAMD64_XMM12());
1445
1446         /* Now we have to state any parameter-carrying registers
1447            which might be read.  This depends on the regparmness. */
1448         switch (i->Ain.Call.regparms) {
1449            case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
1450            case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
1451            case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
1452            case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
1453            case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
1454            case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
1455            case 0: break;
1456            default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
1457         }
1458         /* Finally, there is the issue that the insn trashes a
1459            register because the literal target address has to be
1460            loaded into a register.  Fortunately, r11 is stated in the
1461            ABI as a scratch register, and so seems a suitable victim.  */
1462         addHRegUse(u, HRmWrite, hregAMD64_R11());
1463         /* Upshot of this is that the assembler really must use r11,
1464            and no other, as a destination temporary. */
1465         return;
1466      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
1467         conditionally exit the block.  Hence we only need to list (1)
1468         the registers that they read, and (2) the registers that they
1469         write in the case where the block is not exited.  (2) is
1470         empty, hence only (1) is relevant here. */
1471      case Ain_XDirect:
1472         /* Don't bother to mention the write to %r11, since it is not
1473            available to the allocator. */
1474         addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
1475         return;
1476      case Ain_XIndir:
1477         /* Ditto re %r11 */
1478         addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
1479         addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
1480         return;
1481      case Ain_XAssisted:
1482         /* Ditto re %r11 and %rbp (the baseblock ptr) */
1483         addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
1484         addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
1485         return;
1486      case Ain_CMov64:
1487         addHRegUse(u, HRmRead,   i->Ain.CMov64.src);
1488         addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
1489         return;
1490      case Ain_CLoad:
1491         addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
1492         addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
1493         return;
1494      case Ain_CStore:
1495         addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
1496         addHRegUse(u, HRmRead, i->Ain.CStore.src);
1497         return;
1498      case Ain_MovxLQ:
1499         addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
1500         addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
1501         return;
1502      case Ain_LoadEX:
1503         addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
1504         addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
1505         return;
1506      case Ain_Store:
1507         addHRegUse(u, HRmRead, i->Ain.Store.src);
1508         addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
1509         return;
1510      case Ain_Set64:
1511         addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
1512         return;
1513      case Ain_Bsfr64:
1514         addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
1515         addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
1516         return;
1517      case Ain_MFence:
1518         return;
1519      case Ain_ACAS:
1520         addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
1521         addHRegUse(u, HRmRead, hregAMD64_RBX());
1522         addHRegUse(u, HRmModify, hregAMD64_RAX());
1523         return;
1524      case Ain_DACAS:
1525         addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
1526         addHRegUse(u, HRmRead, hregAMD64_RCX());
1527         addHRegUse(u, HRmRead, hregAMD64_RBX());
1528         addHRegUse(u, HRmModify, hregAMD64_RDX());
1529         addHRegUse(u, HRmModify, hregAMD64_RAX());
1530         return;
1531      case Ain_A87Free:
1532         return;
1533      case Ain_A87PushPop:
1534         addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
1535         return;
1536      case Ain_A87FpOp:
1537         return;
1538      case Ain_A87LdCW:
1539         addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
1540         return;
1541      case Ain_A87StSW:
1542         addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
1543         return;
1544      case Ain_LdMXCSR:
1545         addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
1546         return;
1547      case Ain_SseUComIS:
1548         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
1549         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
1550         addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
1551         return;
1552      case Ain_SseSI2SF:
1553         addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
1554         addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
1555         return;
1556      case Ain_SseSF2SI:
1557         addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
1558         addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
1559         return;
1560      case Ain_SseSDSS:
1561         addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
1562         addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
1563         return;
1564      case Ain_SseLdSt:
1565         addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
1566         addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
1567                       i->Ain.SseLdSt.reg);
1568         return;
1569      case Ain_SseLdzLO:
1570         addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1571         addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1572         return;
1573      case Ain_Sse32Fx4:
1574         vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1575         unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1576                         || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1577                         || i->Ain.Sse32Fx4.op == Asse_SQRTF );
1578         addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1579         addHRegUse(u, unary ? HRmWrite : HRmModify,
1580                       i->Ain.Sse32Fx4.dst);
1581         return;
1582      case Ain_Sse32FLo:
1583         vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1584         unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1585                         || i->Ain.Sse32FLo.op == Asse_RSQRTF
1586                         || i->Ain.Sse32FLo.op == Asse_SQRTF );
1587         addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1588         addHRegUse(u, unary ? HRmWrite : HRmModify,
1589                       i->Ain.Sse32FLo.dst);
1590         return;
1591      case Ain_Sse64Fx2:
1592         vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1593         unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1594                         || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1595                         || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1596         addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1597         addHRegUse(u, unary ? HRmWrite : HRmModify,
1598                       i->Ain.Sse64Fx2.dst);
1599         return;
1600      case Ain_Sse64FLo:
1601         vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1602         unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1603                         || i->Ain.Sse64FLo.op == Asse_RSQRTF
1604                         || i->Ain.Sse64FLo.op == Asse_SQRTF );
1605         addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1606         addHRegUse(u, unary ? HRmWrite : HRmModify,
1607                       i->Ain.Sse64FLo.dst);
1608         return;
1609      case Ain_SseReRg:
1610         if ( (i->Ain.SseReRg.op == Asse_XOR
1611               || i->Ain.SseReRg.op == Asse_CMPEQ32)
1612              && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
1613            /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1614               r,r' as a write of a value to r, and independent of any
1615               previous value in r */
1616            /* (as opposed to a rite of passage :-) */
1617            addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1618         } else {
1619            addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1620            addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1621                             ? HRmWrite : HRmModify,
1622                          i->Ain.SseReRg.dst);
1623         }
1624         return;
1625      case Ain_SseCMov:
1626         addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
1627         addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1628         return;
1629      case Ain_SseShuf:
1630         addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
1631         addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1632         return;
1633      //uu case Ain_AvxLdSt:
1634      //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1635      //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1636      //uu               i->Ain.AvxLdSt.reg);
1637      //uu return;
1638      //uu case Ain_AvxReRg:
1639      //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
1640      //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1641      //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1642      //uu       /* See comments on the case for Ain_SseReRg. */
1643      //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1644      //uu    } else {
1645      //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1646      //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1647      //uu                        ? HRmWrite : HRmModify,
1648      //uu                     i->Ain.AvxReRg.dst);
1649      //uu    }
1650      //uu    return;
1651      case Ain_EvCheck:
1652         /* We expect both amodes only to mention %rbp, so this is in
1653            fact pointless, since %rbp isn't allocatable, but anyway.. */
1654         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1655         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1656         return;
1657      case Ain_ProfInc:
1658         addHRegUse(u, HRmWrite, hregAMD64_R11());
1659         return;
1660      default:
1661         ppAMD64Instr(i, mode64);
1662         vpanic("getRegUsage_AMD64Instr");
1663   }
1664}
1665
1666/* local helper */
1667static inline void mapReg(HRegRemap* m, HReg* r)
1668{
1669   *r = lookupHRegRemap(m, *r);
1670}
1671
1672void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1673{
1674   vassert(mode64 == True);
1675   switch (i->tag) {
1676      case Ain_Imm64:
1677         mapReg(m, &i->Ain.Imm64.dst);
1678         return;
1679      case Ain_Alu64R:
1680         mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1681         mapReg(m, &i->Ain.Alu64R.dst);
1682         return;
1683      case Ain_Alu64M:
1684         mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1685         mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1686         return;
1687      case Ain_Sh64:
1688         mapReg(m, &i->Ain.Sh64.dst);
1689         return;
1690      case Ain_Test64:
1691         mapReg(m, &i->Ain.Test64.dst);
1692         return;
1693      case Ain_Unary64:
1694         mapReg(m, &i->Ain.Unary64.dst);
1695         return;
1696      case Ain_Lea64:
1697         mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1698         mapReg(m, &i->Ain.Lea64.dst);
1699         return;
1700      case Ain_Alu32R:
1701         mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1702         mapReg(m, &i->Ain.Alu32R.dst);
1703         return;
1704      case Ain_MulL:
1705         mapRegs_AMD64RM(m, i->Ain.MulL.src);
1706         return;
1707      case Ain_Div:
1708         mapRegs_AMD64RM(m, i->Ain.Div.src);
1709         return;
1710      case Ain_Push:
1711         mapRegs_AMD64RMI(m, i->Ain.Push.src);
1712         return;
1713      case Ain_Call:
1714         return;
1715      case Ain_XDirect:
1716         mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1717         return;
1718      case Ain_XIndir:
1719         mapReg(m, &i->Ain.XIndir.dstGA);
1720         mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1721         return;
1722      case Ain_XAssisted:
1723         mapReg(m, &i->Ain.XAssisted.dstGA);
1724         mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1725         return;
1726      case Ain_CMov64:
1727         mapReg(m, &i->Ain.CMov64.src);
1728         mapReg(m, &i->Ain.CMov64.dst);
1729         return;
1730      case Ain_CLoad:
1731         mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
1732         mapReg(m, &i->Ain.CLoad.dst);
1733         return;
1734      case Ain_CStore:
1735         mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
1736         mapReg(m, &i->Ain.CStore.src);
1737         return;
1738      case Ain_MovxLQ:
1739         mapReg(m, &i->Ain.MovxLQ.src);
1740         mapReg(m, &i->Ain.MovxLQ.dst);
1741         return;
1742      case Ain_LoadEX:
1743         mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1744         mapReg(m, &i->Ain.LoadEX.dst);
1745         return;
1746      case Ain_Store:
1747         mapReg(m, &i->Ain.Store.src);
1748         mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1749         return;
1750      case Ain_Set64:
1751         mapReg(m, &i->Ain.Set64.dst);
1752         return;
1753      case Ain_Bsfr64:
1754         mapReg(m, &i->Ain.Bsfr64.src);
1755         mapReg(m, &i->Ain.Bsfr64.dst);
1756         return;
1757      case Ain_MFence:
1758         return;
1759      case Ain_ACAS:
1760         mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1761         return;
1762      case Ain_DACAS:
1763         mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1764         return;
1765      case Ain_A87Free:
1766         return;
1767      case Ain_A87PushPop:
1768         mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1769         return;
1770      case Ain_A87FpOp:
1771         return;
1772      case Ain_A87LdCW:
1773         mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1774         return;
1775      case Ain_A87StSW:
1776         mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1777         return;
1778      case Ain_LdMXCSR:
1779         mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1780         return;
1781      case Ain_SseUComIS:
1782         mapReg(m, &i->Ain.SseUComIS.srcL);
1783         mapReg(m, &i->Ain.SseUComIS.srcR);
1784         mapReg(m, &i->Ain.SseUComIS.dst);
1785         return;
1786      case Ain_SseSI2SF:
1787         mapReg(m, &i->Ain.SseSI2SF.src);
1788         mapReg(m, &i->Ain.SseSI2SF.dst);
1789         return;
1790      case Ain_SseSF2SI:
1791         mapReg(m, &i->Ain.SseSF2SI.src);
1792         mapReg(m, &i->Ain.SseSF2SI.dst);
1793         return;
1794      case Ain_SseSDSS:
1795         mapReg(m, &i->Ain.SseSDSS.src);
1796         mapReg(m, &i->Ain.SseSDSS.dst);
1797         return;
1798      case Ain_SseLdSt:
1799         mapReg(m, &i->Ain.SseLdSt.reg);
1800         mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1801         break;
1802      case Ain_SseLdzLO:
1803         mapReg(m, &i->Ain.SseLdzLO.reg);
1804         mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
1805         break;
1806      case Ain_Sse32Fx4:
1807         mapReg(m, &i->Ain.Sse32Fx4.src);
1808         mapReg(m, &i->Ain.Sse32Fx4.dst);
1809         return;
1810      case Ain_Sse32FLo:
1811         mapReg(m, &i->Ain.Sse32FLo.src);
1812         mapReg(m, &i->Ain.Sse32FLo.dst);
1813         return;
1814      case Ain_Sse64Fx2:
1815         mapReg(m, &i->Ain.Sse64Fx2.src);
1816         mapReg(m, &i->Ain.Sse64Fx2.dst);
1817         return;
1818      case Ain_Sse64FLo:
1819         mapReg(m, &i->Ain.Sse64FLo.src);
1820         mapReg(m, &i->Ain.Sse64FLo.dst);
1821         return;
1822      case Ain_SseReRg:
1823         mapReg(m, &i->Ain.SseReRg.src);
1824         mapReg(m, &i->Ain.SseReRg.dst);
1825         return;
1826      case Ain_SseCMov:
1827         mapReg(m, &i->Ain.SseCMov.src);
1828         mapReg(m, &i->Ain.SseCMov.dst);
1829         return;
1830      case Ain_SseShuf:
1831         mapReg(m, &i->Ain.SseShuf.src);
1832         mapReg(m, &i->Ain.SseShuf.dst);
1833         return;
1834      //uu case Ain_AvxLdSt:
1835      //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
1836      //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1837      //uu    break;
1838      //uu case Ain_AvxReRg:
1839      //uu    mapReg(m, &i->Ain.AvxReRg.src);
1840      //uu    mapReg(m, &i->Ain.AvxReRg.dst);
1841      //uu    return;
1842      case Ain_EvCheck:
1843         /* We expect both amodes only to mention %rbp, so this is in
1844            fact pointless, since %rbp isn't allocatable, but anyway.. */
1845         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
1846         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
1847         return;
1848      case Ain_ProfInc:
1849         /* hardwires r11 -- nothing to modify. */
1850         return;
1851      default:
1852         ppAMD64Instr(i, mode64);
1853         vpanic("mapRegs_AMD64Instr");
1854   }
1855}
1856
1857/* Figure out if i represents a reg-reg move, and if so assign the
1858   source and destination to *src and *dst.  If in doubt say No.  Used
1859   by the register allocator to do move coalescing.
1860*/
1861Bool isMove_AMD64Instr ( const AMD64Instr* i, HReg* src, HReg* dst )
1862{
1863   switch (i->tag) {
1864      case Ain_Alu64R:
1865         /* Moves between integer regs */
1866         if (i->Ain.Alu64R.op != Aalu_MOV)
1867            return False;
1868         if (i->Ain.Alu64R.src->tag != Armi_Reg)
1869            return False;
1870         *src = i->Ain.Alu64R.src->Armi.Reg.reg;
1871         *dst = i->Ain.Alu64R.dst;
1872         return True;
1873      case Ain_SseReRg:
1874         /* Moves between SSE regs */
1875         if (i->Ain.SseReRg.op != Asse_MOV)
1876            return False;
1877         *src = i->Ain.SseReRg.src;
1878         *dst = i->Ain.SseReRg.dst;
1879         return True;
1880      //uu case Ain_AvxReRg:
1881      //uu    /* Moves between AVX regs */
1882      //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
1883      //uu       return False;
1884      //uu    *src = i->Ain.AvxReRg.src;
1885      //uu    *dst = i->Ain.AvxReRg.dst;
1886      //uu    return True;
1887      default:
1888         return False;
1889   }
1890   /*NOTREACHED*/
1891}
1892
1893
1894/* Generate amd64 spill/reload instructions under the direction of the
1895   register allocator.  Note it's critical these don't write the
1896   condition codes. */
1897
1898void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1899                      HReg rreg, Int offsetB, Bool mode64 )
1900{
1901   AMD64AMode* am;
1902   vassert(offsetB >= 0);
1903   vassert(!hregIsVirtual(rreg));
1904   vassert(mode64 == True);
1905   *i1 = *i2 = NULL;
1906   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1907   switch (hregClass(rreg)) {
1908      case HRcInt64:
1909         *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
1910         return;
1911      case HRcVec128:
1912         *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
1913         return;
1914      default:
1915         ppHRegClass(hregClass(rreg));
1916         vpanic("genSpill_AMD64: unimplemented regclass");
1917   }
1918}
1919
1920void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1921                       HReg rreg, Int offsetB, Bool mode64 )
1922{
1923   AMD64AMode* am;
1924   vassert(offsetB >= 0);
1925   vassert(!hregIsVirtual(rreg));
1926   vassert(mode64 == True);
1927   *i1 = *i2 = NULL;
1928   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1929   switch (hregClass(rreg)) {
1930      case HRcInt64:
1931         *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
1932         return;
1933      case HRcVec128:
1934         *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
1935         return;
1936      default:
1937         ppHRegClass(hregClass(rreg));
1938         vpanic("genReload_AMD64: unimplemented regclass");
1939   }
1940}
1941
1942
1943/* --------- The amd64 assembler (bleh.) --------- */
1944
1945/* Produce the low three bits of an integer register number. */
1946inline static UInt iregEnc210 ( HReg r )
1947{
1948   UInt n;
1949   vassert(hregClass(r) == HRcInt64);
1950   vassert(!hregIsVirtual(r));
1951   n = hregEncoding(r);
1952   vassert(n <= 15);
1953   return n & 7;
1954}
1955
1956/* Produce bit 3 of an integer register number. */
1957inline static UInt iregEnc3 ( HReg r )
1958{
1959   UInt n;
1960   vassert(hregClass(r) == HRcInt64);
1961   vassert(!hregIsVirtual(r));
1962   n = hregEncoding(r);
1963   vassert(n <= 15);
1964   return (n >> 3) & 1;
1965}
1966
1967/* Produce a complete 4-bit integer register number. */
1968inline static UInt iregEnc3210 ( HReg r )
1969{
1970   UInt n;
1971   vassert(hregClass(r) == HRcInt64);
1972   vassert(!hregIsVirtual(r));
1973   n = hregEncoding(r);
1974   vassert(n <= 15);
1975   return n;
1976}
1977
1978/* Produce a complete 4-bit integer register number. */
1979inline static UInt vregEnc3210 ( HReg r )
1980{
1981   UInt n;
1982   vassert(hregClass(r) == HRcVec128);
1983   vassert(!hregIsVirtual(r));
1984   n = hregEncoding(r);
1985   vassert(n <= 15);
1986   return n;
1987}
1988
1989inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
1990{
1991   vassert(mod < 4);
1992   vassert((reg|regmem) < 8);
1993   return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
1994}
1995
1996inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
1997{
1998   vassert(shift < 4);
1999   vassert((regindex|regbase) < 8);
2000   return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
2001}
2002
2003static UChar* emit32 ( UChar* p, UInt w32 )
2004{
2005   *p++ = toUChar((w32)       & 0x000000FF);
2006   *p++ = toUChar((w32 >>  8) & 0x000000FF);
2007   *p++ = toUChar((w32 >> 16) & 0x000000FF);
2008   *p++ = toUChar((w32 >> 24) & 0x000000FF);
2009   return p;
2010}
2011
2012static UChar* emit64 ( UChar* p, ULong w64 )
2013{
2014   p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
2015   p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
2016   return p;
2017}
2018
2019/* Does a sign-extend of the lowest 8 bits give
2020   the original number? */
2021static Bool fits8bits ( UInt w32 )
2022{
2023   Int i32 = (Int)w32;
2024   return toBool(i32 == ((Int)(w32 << 24) >> 24));
2025}
2026/* Can the lower 32 bits be signedly widened to produce the whole
2027   64-bit value?  In other words, are the top 33 bits either all 0 or
2028   all 1 ? */
2029static Bool fitsIn32Bits ( ULong x )
2030{
2031   Long y1;
2032   y1 = x << 32;
2033   y1 >>=/*s*/ 32;
2034   return toBool(x == y1);
2035}
2036
2037
2038/* Forming mod-reg-rm bytes and scale-index-base bytes.
2039
2040     greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
2041                       =  00 greg ereg
2042
2043     greg,  d8(ereg)   |  ereg is neither of: RSP R12
2044                       =  01 greg ereg, d8
2045
2046     greg,  d32(ereg)  |  ereg is neither of: RSP R12
2047                       =  10 greg ereg, d32
2048
2049     greg,  d8(ereg)   |  ereg is either: RSP R12
2050                       =  01 greg 100, 0x24, d8
2051                       (lowest bit of rex distinguishes R12/RSP)
2052
2053     greg,  d32(ereg)  |  ereg is either: RSP R12
2054                       =  10 greg 100, 0x24, d32
2055                       (lowest bit of rex distinguishes R12/RSP)
2056
2057     -----------------------------------------------
2058
2059     greg,  d8(base,index,scale)
2060               |  index != RSP
2061               =  01 greg 100, scale index base, d8
2062
2063     greg,  d32(base,index,scale)
2064               |  index != RSP
2065               =  10 greg 100, scale index base, d32
2066*/
2067static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2068{
2069   UInt gregEnc210 = gregEnc3210 & 7;
2070   if (am->tag == Aam_IR) {
2071      if (am->Aam.IR.imm == 0
2072          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2073          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
2074          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2075          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
2076         ) {
2077         *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
2078         return p;
2079      }
2080      if (fits8bits(am->Aam.IR.imm)
2081          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2082          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2083         ) {
2084         *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
2085         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2086         return p;
2087      }
2088      if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2089          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2090         ) {
2091         *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
2092         p = emit32(p, am->Aam.IR.imm);
2093         return p;
2094      }
2095      if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2096           || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
2097          && fits8bits(am->Aam.IR.imm)) {
2098 	 *p++ = mkModRegRM(1, gregEnc210, 4);
2099         *p++ = 0x24;
2100         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2101         return p;
2102      }
2103      if (/* (sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2104	      || wait for test case for RSP case */
2105          sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
2106 	 *p++ = mkModRegRM(2, gregEnc210, 4);
2107         *p++ = 0x24;
2108         p = emit32(p, am->Aam.IR.imm);
2109         return p;
2110      }
2111      ppAMD64AMode(am);
2112      vpanic("doAMode_M: can't emit amode IR");
2113      /*NOTREACHED*/
2114   }
2115   if (am->tag == Aam_IRRS) {
2116      if (fits8bits(am->Aam.IRRS.imm)
2117          && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2118         *p++ = mkModRegRM(1, gregEnc210, 4);
2119         *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2120                                          iregEnc210(am->Aam.IRRS.base));
2121         *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2122         return p;
2123      }
2124      if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2125         *p++ = mkModRegRM(2, gregEnc210, 4);
2126         *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2127                                          iregEnc210(am->Aam.IRRS.base));
2128         p = emit32(p, am->Aam.IRRS.imm);
2129         return p;
2130      }
2131      ppAMD64AMode(am);
2132      vpanic("doAMode_M: can't emit amode IRRS");
2133      /*NOTREACHED*/
2134   }
2135   vpanic("doAMode_M: unknown amode");
2136   /*NOTREACHED*/
2137}
2138
2139static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2140{
2141   return doAMode_M__wrk(p, iregEnc3210(greg), am);
2142}
2143
2144static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2145{
2146   vassert(gregEnc3210 < 16);
2147   return doAMode_M__wrk(p, gregEnc3210, am);
2148}
2149
2150
2151/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2152inline
2153static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2154{
2155   *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
2156   return p;
2157}
2158
2159static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2160{
2161   return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
2162}
2163
2164static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
2165{
2166   vassert(gregEnc3210 < 16);
2167   return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
2168}
2169
2170static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
2171{
2172   vassert(eregEnc3210 < 16);
2173   return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
2174}
2175
2176static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2177{
2178   vassert( (gregEnc3210|eregEnc3210) < 16);
2179   return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
2180}
2181
2182
2183/* Clear the W bit on a REX byte, thereby changing the operand size
2184   back to whatever that instruction's default operand size is. */
2185static inline UChar clearWBit ( UChar rex )
2186{
2187   return rex & ~(1<<3);
2188}
2189
2190
2191/* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
2192inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
2193{
2194   if (am->tag == Aam_IR) {
2195      UChar W = 1;  /* we want 64-bit mode */
2196      UChar R = (gregEnc3210 >> 3) & 1;
2197      UChar X = 0; /* not relevant */
2198      UChar B = iregEnc3(am->Aam.IR.reg);
2199      return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2200   }
2201   if (am->tag == Aam_IRRS) {
2202      UChar W = 1;  /* we want 64-bit mode */
2203      UChar R = (gregEnc3210 >> 3) & 1;
2204      UChar X = iregEnc3(am->Aam.IRRS.index);
2205      UChar B = iregEnc3(am->Aam.IRRS.base);
2206      return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2207   }
2208   vassert(0);
2209   return 0; /*NOTREACHED*/
2210}
2211
2212static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2213{
2214   return rexAMode_M__wrk(iregEnc3210(greg), am);
2215}
2216
2217static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
2218{
2219   vassert(gregEnc3210 < 16);
2220   return rexAMode_M__wrk(gregEnc3210, am);
2221}
2222
2223
2224/* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
2225inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
2226{
2227   UChar W = 1;  /* we want 64-bit mode */
2228   UChar R = (gregEnc3210 >> 3) & 1;
2229   UChar X = 0; /* not relevant */
2230   UChar B = (eregEnc3210 >> 3) & 1;
2231   return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2232}
2233
2234static UChar rexAMode_R ( HReg greg, HReg ereg )
2235{
2236   return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
2237}
2238
2239static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
2240{
2241   vassert(gregEnc3210 < 16);
2242   return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
2243}
2244
2245static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
2246{
2247   vassert(eregEnc3210 < 16);
2248   return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
2249}
2250
2251static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
2252{
2253   vassert((gregEnc3210|eregEnc3210) < 16);
2254   return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
2255}
2256
2257
2258//uu /* May 2012: this VEX prefix stuff is currently unused, but has
2259//uu    verified correct (I reckon).  Certainly it has been known to
2260//uu    produce correct VEX prefixes during testing. */
2261//uu
2262//uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
2263//uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
2264//uu    in verbatim.  There's no range checking on the bits. */
2265//uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2266//uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
2267//uu                             UInt L, UInt pp )
2268//uu {
2269//uu    UChar byte0 = 0;
2270//uu    UChar byte1 = 0;
2271//uu    UChar byte2 = 0;
2272//uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2273//uu       /* 2 byte encoding is possible. */
2274//uu       byte0 = 0xC5;
2275//uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2276//uu               | (L << 2) | pp;
2277//uu    } else {
2278//uu       /* 3 byte encoding is needed. */
2279//uu       byte0 = 0xC4;
2280//uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2281//uu               | ((rexB ^ 1) << 5) | mmmmm;
2282//uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2283//uu    }
2284//uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2285//uu }
2286//uu
2287//uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
2288//uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2289//uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
2290//uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2291//uu    vvvv=1111 (unused 3rd reg). */
2292//uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2293//uu {
2294//uu    UChar L       = 1; /* size = 256 */
2295//uu    UChar pp      = 0; /* no SIMD prefix */
2296//uu    UChar mmmmm   = 1; /* 0F */
2297//uu    UChar notVvvv = 0; /* unused */
2298//uu    UChar rexW    = 0;
2299//uu    UChar rexR    = 0;
2300//uu    UChar rexX    = 0;
2301//uu    UChar rexB    = 0;
2302//uu    /* Same logic as in rexAMode_M. */
2303//uu    if (am->tag == Aam_IR) {
2304//uu       rexR = iregEnc3(greg);
2305//uu       rexX = 0; /* not relevant */
2306//uu       rexB = iregEnc3(am->Aam.IR.reg);
2307//uu    }
2308//uu    else if (am->tag == Aam_IRRS) {
2309//uu       rexR = iregEnc3(greg);
2310//uu       rexX = iregEnc3(am->Aam.IRRS.index);
2311//uu       rexB = iregEnc3(am->Aam.IRRS.base);
2312//uu    } else {
2313//uu       vassert(0);
2314//uu    }
2315//uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2316//uu }
2317//uu
2318//uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2319//uu {
2320//uu    switch (vex & 0xFF) {
2321//uu       case 0xC5:
2322//uu          *p++ = 0xC5;
2323//uu          *p++ = (vex >> 8) & 0xFF;
2324//uu          vassert(0 == (vex >> 16));
2325//uu          break;
2326//uu       case 0xC4:
2327//uu          *p++ = 0xC4;
2328//uu          *p++ = (vex >> 8) & 0xFF;
2329//uu          *p++ = (vex >> 16) & 0xFF;
2330//uu          vassert(0 == (vex >> 24));
2331//uu          break;
2332//uu       default:
2333//uu          vassert(0);
2334//uu    }
2335//uu    return p;
2336//uu }
2337
2338
2339/* Emit ffree %st(N) */
2340static UChar* do_ffree_st ( UChar* p, Int n )
2341{
2342   vassert(n >= 0 && n <= 7);
2343   *p++ = 0xDD;
2344   *p++ = toUChar(0xC0 + n);
2345   return p;
2346}
2347
2348/* Emit an instruction into buf and return the number of bytes used.
2349   Note that buf is not the insn's final place, and therefore it is
2350   imperative to emit position-independent code.  If the emitted
2351   instruction was a profiler inc, set *is_profInc to True, else
2352   leave it unchanged. */
2353
2354Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2355                      UChar* buf, Int nbuf, const AMD64Instr* i,
2356                      Bool mode64, VexEndness endness_host,
2357                      const void* disp_cp_chain_me_to_slowEP,
2358                      const void* disp_cp_chain_me_to_fastEP,
2359                      const void* disp_cp_xindir,
2360                      const void* disp_cp_xassisted )
2361{
2362   UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2363   UInt   xtra;
2364   UInt   reg;
2365   UChar  rex;
2366   UChar* p = &buf[0];
2367   UChar* ptmp;
2368   Int    j;
2369   vassert(nbuf >= 32);
2370   vassert(mode64 == True);
2371
2372   /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2373
2374   switch (i->tag) {
2375
2376   case Ain_Imm64:
2377      if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2378         /* Use the short form (load into 32 bit reg, + default
2379            widening rule) for constants under 1 million.  We could
2380            use this form for the range 0 to 0x7FFFFFFF inclusive, but
2381            limit it to a smaller range for verifiability purposes. */
2382         if (1 & iregEnc3(i->Ain.Imm64.dst))
2383            *p++ = 0x41;
2384         *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
2385         p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2386      } else {
2387         *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
2388         *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
2389         p = emit64(p, i->Ain.Imm64.imm64);
2390      }
2391      goto done;
2392
2393   case Ain_Alu64R:
2394      /* Deal specially with MOV */
2395      if (i->Ain.Alu64R.op == Aalu_MOV) {
2396         switch (i->Ain.Alu64R.src->tag) {
2397            case Armi_Imm:
2398               if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2399                  /* Actually we could use this form for constants in
2400                     the range 0 through 0x7FFFFFFF inclusive, but
2401                     limit it to a small range for verifiability
2402                     purposes. */
2403                  /* Generate "movl $imm32, 32-bit-register" and let
2404                     the default zero-extend rule cause the upper half
2405                     of the dst to be zeroed out too.  This saves 1
2406                     and sometimes 2 bytes compared to the more
2407                     obvious encoding in the 'else' branch. */
2408                  if (1 & iregEnc3(i->Ain.Alu64R.dst))
2409                     *p++ = 0x41;
2410                  *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
2411                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2412               } else {
2413                  *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
2414                  *p++ = 0xC7;
2415                  *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
2416                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2417               }
2418               goto done;
2419            case Armi_Reg:
2420               *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2421                                  i->Ain.Alu64R.dst );
2422               *p++ = 0x89;
2423               p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2424                                i->Ain.Alu64R.dst);
2425               goto done;
2426            case Armi_Mem:
2427               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2428                                 i->Ain.Alu64R.src->Armi.Mem.am);
2429               *p++ = 0x8B;
2430               p = doAMode_M(p, i->Ain.Alu64R.dst,
2431                                i->Ain.Alu64R.src->Armi.Mem.am);
2432               goto done;
2433            default:
2434               goto bad;
2435         }
2436      }
2437      /* MUL */
2438      if (i->Ain.Alu64R.op == Aalu_MUL) {
2439         switch (i->Ain.Alu64R.src->tag) {
2440            case Armi_Reg:
2441               *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2442                                  i->Ain.Alu64R.src->Armi.Reg.reg);
2443               *p++ = 0x0F;
2444               *p++ = 0xAF;
2445               p = doAMode_R(p, i->Ain.Alu64R.dst,
2446                                i->Ain.Alu64R.src->Armi.Reg.reg);
2447               goto done;
2448            case Armi_Mem:
2449               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2450                                 i->Ain.Alu64R.src->Armi.Mem.am);
2451               *p++ = 0x0F;
2452               *p++ = 0xAF;
2453               p = doAMode_M(p, i->Ain.Alu64R.dst,
2454                                i->Ain.Alu64R.src->Armi.Mem.am);
2455               goto done;
2456            case Armi_Imm:
2457               if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2458                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2459                  *p++ = 0x6B;
2460                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2461                  *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2462               } else {
2463                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2464                  *p++ = 0x69;
2465                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2466                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2467               }
2468               goto done;
2469            default:
2470               goto bad;
2471         }
2472      }
2473      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2474      opc = opc_rr = subopc_imm = opc_imma = 0;
2475      switch (i->Ain.Alu64R.op) {
2476         case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2477                        subopc_imm = 2; opc_imma = 0x15; break;
2478         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2479                        subopc_imm = 0; opc_imma = 0x05; break;
2480         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2481                        subopc_imm = 5; opc_imma = 0x2D; break;
2482         case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2483                        subopc_imm = 3; opc_imma = 0x1D; break;
2484         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2485                        subopc_imm = 4; opc_imma = 0x25; break;
2486         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2487                        subopc_imm = 6; opc_imma = 0x35; break;
2488         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2489                        subopc_imm = 1; opc_imma = 0x0D; break;
2490         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2491                        subopc_imm = 7; opc_imma = 0x3D; break;
2492         default: goto bad;
2493      }
2494      switch (i->Ain.Alu64R.src->tag) {
2495         case Armi_Imm:
2496            if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
2497                && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2498               goto bad; /* FIXME: awaiting test case */
2499               *p++ = toUChar(opc_imma);
2500               p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2501            } else
2502            if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2503               *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
2504               *p++ = 0x83;
2505               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2506               *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2507            } else {
2508               *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
2509               *p++ = 0x81;
2510               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2511               p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2512            }
2513            goto done;
2514         case Armi_Reg:
2515            *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2516                               i->Ain.Alu64R.dst);
2517            *p++ = toUChar(opc_rr);
2518            p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2519                             i->Ain.Alu64R.dst);
2520            goto done;
2521         case Armi_Mem:
2522            *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2523                               i->Ain.Alu64R.src->Armi.Mem.am);
2524            *p++ = toUChar(opc);
2525            p = doAMode_M(p, i->Ain.Alu64R.dst,
2526                             i->Ain.Alu64R.src->Armi.Mem.am);
2527            goto done;
2528         default:
2529            goto bad;
2530      }
2531      break;
2532
2533   case Ain_Alu64M:
2534      /* Deal specially with MOV */
2535      if (i->Ain.Alu64M.op == Aalu_MOV) {
2536         switch (i->Ain.Alu64M.src->tag) {
2537            case Ari_Reg:
2538               *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2539                                 i->Ain.Alu64M.dst);
2540               *p++ = 0x89;
2541               p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2542                                i->Ain.Alu64M.dst);
2543               goto done;
2544            case Ari_Imm:
2545               *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
2546               *p++ = 0xC7;
2547               p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
2548               p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2549               goto done;
2550            default:
2551               goto bad;
2552         }
2553      }
2554      break;
2555
2556   case Ain_Sh64:
2557      opc_cl = opc_imm = subopc = 0;
2558      switch (i->Ain.Sh64.op) {
2559         case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2560         case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2561         case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2562         default: goto bad;
2563      }
2564      if (i->Ain.Sh64.src == 0) {
2565         *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2566         *p++ = toUChar(opc_cl);
2567         p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2568         goto done;
2569      } else {
2570         *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2571         *p++ = toUChar(opc_imm);
2572         p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2573         *p++ = (UChar)(i->Ain.Sh64.src);
2574         goto done;
2575      }
2576      break;
2577
2578   case Ain_Test64:
2579      /* testq sign-extend($imm32), %reg */
2580      *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
2581      *p++ = 0xF7;
2582      p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
2583      p = emit32(p, i->Ain.Test64.imm32);
2584      goto done;
2585
2586   case Ain_Unary64:
2587      if (i->Ain.Unary64.op == Aun_NOT) {
2588         *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2589         *p++ = 0xF7;
2590         p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
2591         goto done;
2592      }
2593      if (i->Ain.Unary64.op == Aun_NEG) {
2594         *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2595         *p++ = 0xF7;
2596         p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
2597         goto done;
2598      }
2599      break;
2600
2601   case Ain_Lea64:
2602      *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2603      *p++ = 0x8D;
2604      p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2605      goto done;
2606
2607   case Ain_Alu32R:
2608      /* ADD/SUB/AND/OR/XOR/CMP */
2609      opc = opc_rr = subopc_imm = opc_imma = 0;
2610      switch (i->Ain.Alu32R.op) {
2611         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2612                        subopc_imm = 0; opc_imma = 0x05; break;
2613         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2614                        subopc_imm = 5; opc_imma = 0x2D; break;
2615         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2616                        subopc_imm = 4; opc_imma = 0x25; break;
2617         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2618                        subopc_imm = 6; opc_imma = 0x35; break;
2619         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2620                        subopc_imm = 1; opc_imma = 0x0D; break;
2621         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2622                        subopc_imm = 7; opc_imma = 0x3D; break;
2623         default: goto bad;
2624      }
2625      switch (i->Ain.Alu32R.src->tag) {
2626         case Armi_Imm:
2627            if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
2628                && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2629               goto bad; /* FIXME: awaiting test case */
2630               *p++ = toUChar(opc_imma);
2631               p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2632            } else
2633            if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2634               rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
2635               if (rex != 0x40) *p++ = rex;
2636               *p++ = 0x83;
2637               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2638               *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2639            } else {
2640               rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
2641               if (rex != 0x40) *p++ = rex;
2642               *p++ = 0x81;
2643               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2644               p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2645            }
2646            goto done;
2647         case Armi_Reg:
2648            rex  = clearWBit(
2649                   rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2650                               i->Ain.Alu32R.dst) );
2651            if (rex != 0x40) *p++ = rex;
2652            *p++ = toUChar(opc_rr);
2653            p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2654                             i->Ain.Alu32R.dst);
2655            goto done;
2656         case Armi_Mem:
2657            rex  = clearWBit(
2658                   rexAMode_M( i->Ain.Alu32R.dst,
2659                               i->Ain.Alu32R.src->Armi.Mem.am) );
2660            if (rex != 0x40) *p++ = rex;
2661            *p++ = toUChar(opc);
2662            p = doAMode_M(p, i->Ain.Alu32R.dst,
2663                             i->Ain.Alu32R.src->Armi.Mem.am);
2664            goto done;
2665         default:
2666            goto bad;
2667      }
2668      break;
2669
2670   case Ain_MulL:
2671      subopc = i->Ain.MulL.syned ? 5 : 4;
2672      switch (i->Ain.MulL.src->tag)  {
2673         case Arm_Mem:
2674            *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
2675            *p++ = 0xF7;
2676            p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
2677            goto done;
2678         case Arm_Reg:
2679            *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
2680            *p++ = 0xF7;
2681            p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
2682            goto done;
2683         default:
2684            goto bad;
2685      }
2686      break;
2687
2688   case Ain_Div:
2689      subopc = i->Ain.Div.syned ? 7 : 6;
2690      if (i->Ain.Div.sz == 4) {
2691         switch (i->Ain.Div.src->tag)  {
2692            case Arm_Mem:
2693               goto bad;
2694               /*FIXME*/
2695               *p++ = 0xF7;
2696               p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2697               goto done;
2698            case Arm_Reg:
2699               *p++ = clearWBit(
2700                      rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
2701               *p++ = 0xF7;
2702               p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2703               goto done;
2704            default:
2705               goto bad;
2706         }
2707      }
2708      if (i->Ain.Div.sz == 8) {
2709         switch (i->Ain.Div.src->tag)  {
2710            case Arm_Mem:
2711               *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
2712               *p++ = 0xF7;
2713               p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2714               goto done;
2715            case Arm_Reg:
2716               *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
2717               *p++ = 0xF7;
2718               p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2719               goto done;
2720            default:
2721               goto bad;
2722         }
2723      }
2724      break;
2725
2726   case Ain_Push:
2727      switch (i->Ain.Push.src->tag) {
2728         case Armi_Mem:
2729            *p++ = clearWBit(
2730                   rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
2731            *p++ = 0xFF;
2732            p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
2733            goto done;
2734         case Armi_Imm:
2735            *p++ = 0x68;
2736            p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
2737            goto done;
2738         case Armi_Reg:
2739            *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
2740            *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
2741            goto done;
2742        default:
2743            goto bad;
2744      }
2745
2746   case Ain_Call: {
2747      /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
2748         above, %r11 is used as an address temporary. */
2749      /* If we don't need to do any fixup actions in the case that the
2750         call doesn't happen, just do the simple thing and emit
2751         straight-line code.  This is usually the case. */
2752      if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
2753          || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
2754         /* jump over the following two insns if the condition does
2755            not hold */
2756         Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
2757         if (i->Ain.Call.cond != Acc_ALWAYS) {
2758            *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2759            *p++ = shortImm ? 10 : 13;
2760            /* 10 or 13 bytes in the next two insns */
2761         }
2762         if (shortImm) {
2763            /* 7 bytes: movl sign-extend(imm32), %r11 */
2764            *p++ = 0x49;
2765            *p++ = 0xC7;
2766            *p++ = 0xC3;
2767            p = emit32(p, (UInt)i->Ain.Call.target);
2768         } else {
2769            /* 10 bytes: movabsq $target, %r11 */
2770            *p++ = 0x49;
2771            *p++ = 0xBB;
2772            p = emit64(p, i->Ain.Call.target);
2773         }
2774         /* 3 bytes: call *%r11 */
2775         *p++ = 0x41;
2776         *p++ = 0xFF;
2777         *p++ = 0xD3;
2778      } else {
2779         Int delta;
2780         /* Complex case.  We have to generate an if-then-else diamond. */
2781         // before:
2782         //   j{!cond} else:
2783         //   movabsq $target, %r11
2784         //   call* %r11
2785         // preElse:
2786         //   jmp after:
2787         // else:
2788         //   movabsq $0x5555555555555555, %rax  // possibly
2789         //   movq %rax, %rdx                    // possibly
2790         // after:
2791
2792         // before:
2793         UChar* pBefore = p;
2794
2795         //   j{!cond} else:
2796         *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2797         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2798
2799         //   movabsq $target, %r11
2800         *p++ = 0x49;
2801         *p++ = 0xBB;
2802         p = emit64(p, i->Ain.Call.target);
2803
2804         //   call* %r11
2805         *p++ = 0x41;
2806         *p++ = 0xFF;
2807         *p++ = 0xD3;
2808
2809         // preElse:
2810         UChar* pPreElse = p;
2811
2812         //   jmp after:
2813         *p++ = 0xEB;
2814         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2815
2816         // else:
2817         UChar* pElse = p;
2818
2819         /* Do the 'else' actions */
2820         switch (i->Ain.Call.rloc.pri) {
2821            case RLPri_Int:
2822               // movabsq $0x5555555555555555, %rax
2823               *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
2824               break;
2825            case RLPri_2Int:
2826               vassert(0); //ATC
2827               // movabsq $0x5555555555555555, %rax
2828               *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
2829               // movq %rax, %rdx
2830               *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
2831            case RLPri_None: case RLPri_INVALID: default:
2832               vassert(0);
2833         }
2834
2835         // after:
2836         UChar* pAfter = p;
2837
2838         // Fix up the branch offsets.  The +2s in the offset
2839         // calculations are there because x86 requires conditional
2840         // branches to have their offset stated relative to the
2841         // instruction immediately following the branch insn.  And in
2842         // both cases the branch insns are 2 bytes long.
2843
2844         // First, the "j{!cond} else:" at pBefore.
2845         delta = (Int)(Long)(pElse - (pBefore + 2));
2846         vassert(delta >= 0 && delta < 100/*arbitrary*/);
2847         *(pBefore+1) = (UChar)delta;
2848
2849         // And secondly, the "jmp after:" at pPreElse.
2850         delta = (Int)(Long)(pAfter - (pPreElse + 2));
2851         vassert(delta >= 0 && delta < 100/*arbitrary*/);
2852         *(pPreElse+1) = (UChar)delta;
2853      }
2854      goto done;
2855   }
2856
2857   case Ain_XDirect: {
2858      /* NB: what goes on here has to be very closely coordinated with the
2859         chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
2860      /* We're generating chain-me requests here, so we need to be
2861         sure this is actually allowed -- no-redir translations can't
2862         use chain-me's.  Hence: */
2863      vassert(disp_cp_chain_me_to_slowEP != NULL);
2864      vassert(disp_cp_chain_me_to_fastEP != NULL);
2865
2866      HReg r11 = hregAMD64_R11();
2867
2868      /* Use ptmp for backpatching conditional jumps. */
2869      ptmp = NULL;
2870
2871      /* First off, if this is conditional, create a conditional
2872         jump over the rest of it. */
2873      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2874         /* jmp fwds if !condition */
2875         *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
2876         ptmp = p; /* fill in this bit later */
2877         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2878      }
2879
2880      /* Update the guest RIP. */
2881      if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
2882         /* use a shorter encoding */
2883         /* movl sign-extend(dstGA), %r11 */
2884         *p++ = 0x49;
2885         *p++ = 0xC7;
2886         *p++ = 0xC3;
2887         p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
2888      } else {
2889         /* movabsq $dstGA, %r11 */
2890         *p++ = 0x49;
2891         *p++ = 0xBB;
2892         p = emit64(p, i->Ain.XDirect.dstGA);
2893      }
2894
2895      /* movq %r11, amRIP */
2896      *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
2897      *p++ = 0x89;
2898      p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
2899
2900      /* --- FIRST PATCHABLE BYTE follows --- */
2901      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
2902         to) backs up the return address, so as to find the address of
2903         the first patchable byte.  So: don't change the length of the
2904         two instructions below. */
2905      /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
2906      *p++ = 0x49;
2907      *p++ = 0xBB;
2908      const void* disp_cp_chain_me
2909               = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
2910                                         : disp_cp_chain_me_to_slowEP;
2911      p = emit64(p, (Addr)disp_cp_chain_me);
2912      /* call *%r11 */
2913      *p++ = 0x41;
2914      *p++ = 0xFF;
2915      *p++ = 0xD3;
2916      /* --- END of PATCHABLE BYTES --- */
2917
2918      /* Fix up the conditional jump, if there was one. */
2919      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2920         Int delta = p - ptmp;
2921         vassert(delta > 0 && delta < 40);
2922         *ptmp = toUChar(delta-1);
2923      }
2924      goto done;
2925   }
2926
2927   case Ain_XIndir: {
2928      /* We're generating transfers that could lead indirectly to a
2929         chain-me, so we need to be sure this is actually allowed --
2930         no-redir translations are not allowed to reach normal
2931         translations without going through the scheduler.  That means
2932         no XDirects or XIndirs out from no-redir translations.
2933         Hence: */
2934      vassert(disp_cp_xindir != NULL);
2935
2936      /* Use ptmp for backpatching conditional jumps. */
2937      ptmp = NULL;
2938
2939      /* First off, if this is conditional, create a conditional
2940         jump over the rest of it. */
2941      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2942         /* jmp fwds if !condition */
2943         *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
2944         ptmp = p; /* fill in this bit later */
2945         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2946      }
2947
2948      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2949      *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2950      *p++ = 0x89;
2951      p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2952
2953      /* get $disp_cp_xindir into %r11 */
2954      if (fitsIn32Bits((Addr)disp_cp_xindir)) {
2955         /* use a shorter encoding */
2956         /* movl sign-extend(disp_cp_xindir), %r11 */
2957         *p++ = 0x49;
2958         *p++ = 0xC7;
2959         *p++ = 0xC3;
2960         p = emit32(p, (UInt)(Addr)disp_cp_xindir);
2961      } else {
2962         /* movabsq $disp_cp_xindir, %r11 */
2963         *p++ = 0x49;
2964         *p++ = 0xBB;
2965         p = emit64(p, (Addr)disp_cp_xindir);
2966      }
2967
2968      /* jmp *%r11 */
2969      *p++ = 0x41;
2970      *p++ = 0xFF;
2971      *p++ = 0xE3;
2972
2973      /* Fix up the conditional jump, if there was one. */
2974      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2975         Int delta = p - ptmp;
2976         vassert(delta > 0 && delta < 40);
2977         *ptmp = toUChar(delta-1);
2978      }
2979      goto done;
2980   }
2981
2982   case Ain_XAssisted: {
2983      /* Use ptmp for backpatching conditional jumps. */
2984      ptmp = NULL;
2985
2986      /* First off, if this is conditional, create a conditional
2987         jump over the rest of it. */
2988      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
2989         /* jmp fwds if !condition */
2990         *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
2991         ptmp = p; /* fill in this bit later */
2992         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2993      }
2994
2995      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2996      *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2997      *p++ = 0x89;
2998      p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2999      /* movl $magic_number, %ebp.  Since these numbers are all small positive
3000         integers, we can get away with "movl $N, %ebp" rather than
3001         the longer "movq $N, %rbp". */
3002      UInt trcval = 0;
3003      switch (i->Ain.XAssisted.jk) {
3004         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
3005         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
3006         case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
3007         case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
3008         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
3009         case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
3010         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
3011         case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
3012         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
3013         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
3014         case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
3015         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
3016         /* We don't expect to see the following being assisted. */
3017         case Ijk_Ret:
3018         case Ijk_Call:
3019         /* fallthrough */
3020         default:
3021            ppIRJumpKind(i->Ain.XAssisted.jk);
3022            vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
3023      }
3024      vassert(trcval != 0);
3025      *p++ = 0xBD;
3026      p = emit32(p, trcval);
3027      /* movabsq $disp_assisted, %r11 */
3028      *p++ = 0x49;
3029      *p++ = 0xBB;
3030      p = emit64(p, (Addr)disp_cp_xassisted);
3031      /* jmp *%r11 */
3032      *p++ = 0x41;
3033      *p++ = 0xFF;
3034      *p++ = 0xE3;
3035
3036      /* Fix up the conditional jump, if there was one. */
3037      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3038         Int delta = p - ptmp;
3039         vassert(delta > 0 && delta < 40);
3040         *ptmp = toUChar(delta-1);
3041      }
3042      goto done;
3043   }
3044
3045   case Ain_CMov64:
3046      vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
3047      *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
3048      *p++ = 0x0F;
3049      *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
3050      p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
3051      goto done;
3052
3053   case Ain_CLoad: {
3054      vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
3055
3056      /* Only 32- or 64-bit variants are allowed. */
3057      vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
3058
3059      /* Use ptmp for backpatching conditional jumps. */
3060      ptmp = NULL;
3061
3062      /* jmp fwds if !condition */
3063      *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
3064      ptmp = p; /* fill in this bit later */
3065      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3066
3067      /* Now the load.  Either a normal 64 bit load or a normal 32 bit
3068         load, which, by the default zero-extension rule, zeroes out
3069         the upper half of the destination, as required. */
3070      rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3071      *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
3072      *p++ = 0x8B;
3073      p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3074
3075      /* Fix up the conditional branch */
3076      Int delta = p - ptmp;
3077      vassert(delta > 0 && delta < 40);
3078      *ptmp = toUChar(delta-1);
3079      goto done;
3080   }
3081
3082   case Ain_CStore: {
3083      /* AFAICS this is identical to Ain_CStore except that the opcode
3084         is 0x89 instead of 0x8B. */
3085      vassert(i->Ain.CStore.cond != Acc_ALWAYS);
3086
3087      /* Only 32- or 64-bit variants are allowed. */
3088      vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
3089
3090      /* Use ptmp for backpatching conditional jumps. */
3091      ptmp = NULL;
3092
3093      /* jmp fwds if !condition */
3094      *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
3095      ptmp = p; /* fill in this bit later */
3096      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3097
3098      /* Now the store. */
3099      rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
3100      *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
3101      *p++ = 0x89;
3102      p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
3103
3104      /* Fix up the conditional branch */
3105      Int delta = p - ptmp;
3106      vassert(delta > 0 && delta < 40);
3107      *ptmp = toUChar(delta-1);
3108      goto done;
3109   }
3110
3111   case Ain_MovxLQ:
3112      /* No, _don't_ ask me why the sense of the args has to be
3113         different in the S vs Z case.  I don't know. */
3114      if (i->Ain.MovxLQ.syned) {
3115         /* Need REX.W = 1 here, but rexAMode_R does that for us. */
3116         *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3117         *p++ = 0x63;
3118         p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3119      } else {
3120         /* Produce a 32-bit reg-reg move, since the implicit
3121            zero-extend does what we want. */
3122         *p++ = clearWBit (
3123                   rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
3124         *p++ = 0x89;
3125         p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
3126      }
3127      goto done;
3128
3129   case Ain_LoadEX:
3130      if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
3131         /* movzbq */
3132         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3133         *p++ = 0x0F;
3134         *p++ = 0xB6;
3135         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3136         goto done;
3137      }
3138      if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
3139         /* movzwq */
3140         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3141         *p++ = 0x0F;
3142         *p++ = 0xB7;
3143         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3144         goto done;
3145      }
3146      if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
3147         /* movzlq */
3148         /* This isn't really an existing AMD64 instruction per se.
3149            Rather, we have to do a 32-bit load.  Because a 32-bit
3150            write implicitly clears the upper 32 bits of the target
3151            register, we get what we want. */
3152         *p++ = clearWBit(
3153                rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
3154         *p++ = 0x8B;
3155         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3156         goto done;
3157      }
3158      break;
3159
3160   case Ain_Set64:
3161      /* Make the destination register be 1 or 0, depending on whether
3162         the relevant condition holds.  Complication: the top 56 bits
3163         of the destination should be forced to zero, but doing 'xorq
3164         %r,%r' kills the flag(s) we are about to read.  Sigh.  So
3165         start off my moving $0 into the dest. */
3166      reg = iregEnc3210(i->Ain.Set64.dst);
3167      vassert(reg < 16);
3168
3169      /* movq $0, %dst */
3170      *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
3171      *p++ = 0xC7;
3172      *p++ = toUChar(0xC0 + (reg & 7));
3173      p = emit32(p, 0);
3174
3175      /* setb lo8(%dst) */
3176      /* note, 8-bit register rex trickyness.  Be careful here. */
3177      *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
3178      *p++ = 0x0F;
3179      *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
3180      *p++ = toUChar(0xC0 + (reg & 7));
3181      goto done;
3182
3183   case Ain_Bsfr64:
3184      *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3185      *p++ = 0x0F;
3186      if (i->Ain.Bsfr64.isFwds) {
3187         *p++ = 0xBC;
3188      } else {
3189         *p++ = 0xBD;
3190      }
3191      p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3192      goto done;
3193
3194   case Ain_MFence:
3195      /* mfence */
3196      *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
3197      goto done;
3198
3199   case Ain_ACAS:
3200      /* lock */
3201      *p++ = 0xF0;
3202      if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
3203      /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
3204         in %rbx.  The new-value register is hardwired to be %rbx
3205         since dealing with byte integer registers is too much hassle,
3206         so we force the register operand to %rbx (could equally be
3207         %rcx or %rdx). */
3208      rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3209      if (i->Ain.ACAS.sz != 8)
3210         rex = clearWBit(rex);
3211
3212      *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3213      *p++ = 0x0F;
3214      if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3215      p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
3216      goto done;
3217
3218   case Ain_DACAS:
3219      /* lock */
3220      *p++ = 0xF0;
3221      /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
3222         value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
3223         aren't encoded in the insn. */
3224      rex = rexAMode_M_enc(1, i->Ain.ACAS.addr );
3225      if (i->Ain.ACAS.sz != 8)
3226         rex = clearWBit(rex);
3227      *p++ = rex;
3228      *p++ = 0x0F;
3229      *p++ = 0xC7;
3230      p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
3231      goto done;
3232
3233   case Ain_A87Free:
3234      vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3235      for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3236         p = do_ffree_st(p, 7-j);
3237      }
3238      goto done;
3239
3240   case Ain_A87PushPop:
3241      vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3242      if (i->Ain.A87PushPop.isPush) {
3243         /* Load from memory into %st(0): flds/fldl amode */
3244         *p++ = clearWBit(
3245                   rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
3246         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3247	 p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
3248      } else {
3249         /* Dump %st(0) to memory: fstps/fstpl amode */
3250         *p++ = clearWBit(
3251                   rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
3252         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3253         p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
3254         goto done;
3255      }
3256      goto done;
3257
3258   case Ain_A87FpOp:
3259      switch (i->Ain.A87FpOp.op) {
3260         case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
3261         case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
3262         case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
3263         case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
3264         case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
3265         case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
3266         case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
3267         case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
3268         case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3269         case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
3270         case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
3271         case Afp_TAN:
3272            /* fptan pushes 1.0 on the FP stack, except when the
3273               argument is out of range.  Hence we have to do the
3274               instruction, then inspect C2 to see if there is an out
3275               of range condition.  If there is, we skip the fincstp
3276               that is used by the in-range case to get rid of this
3277               extra 1.0 value. */
3278            *p++ = 0xD9; *p++ = 0xF2; // fptan
3279            *p++ = 0x50;              // pushq %rax
3280            *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
3281            *p++ = 0x66; *p++ = 0xA9;
3282            *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
3283            *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
3284            *p++ = 0xD9; *p++ = 0xF7; // fincstp
3285            *p++ = 0x58;              // after_fincstp: popq %rax
3286            break;
3287         default:
3288            goto bad;
3289      }
3290      goto done;
3291
3292   case Ain_A87LdCW:
3293      *p++ = clearWBit(
3294                rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
3295      *p++ = 0xD9;
3296      p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
3297      goto done;
3298
3299   case Ain_A87StSW:
3300      *p++ = clearWBit(
3301                rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
3302      *p++ = 0xDD;
3303      p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
3304      goto done;
3305
3306   case Ain_Store:
3307      if (i->Ain.Store.sz == 2) {
3308         /* This just goes to show the crazyness of the instruction
3309            set encoding.  We have to insert two prefix bytes, but be
3310            careful to avoid a conflict in what the size should be, by
3311            ensuring that REX.W = 0. */
3312         *p++ = 0x66; /* override to 16-bits */
3313	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3314         *p++ = 0x89;
3315         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3316         goto done;
3317      }
3318      if (i->Ain.Store.sz == 4) {
3319	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3320         *p++ = 0x89;
3321         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3322         goto done;
3323      }
3324      if (i->Ain.Store.sz == 1) {
3325         /* This is one place where it would be wrong to skip emitting
3326            a rex byte of 0x40, since the mere presence of rex changes
3327            the meaning of the byte register access.  Be careful. */
3328	 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3329         *p++ = 0x88;
3330         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3331         goto done;
3332      }
3333      break;
3334
3335   case Ain_LdMXCSR:
3336      *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
3337      *p++ = 0x0F;
3338      *p++ = 0xAE;
3339      p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
3340      goto done;
3341
3342   case Ain_SseUComIS:
3343      /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
3344      /* ucomi[sd] %srcL, %srcR */
3345      if (i->Ain.SseUComIS.sz == 8) {
3346         *p++ = 0x66;
3347      } else {
3348         goto bad;
3349         vassert(i->Ain.SseUComIS.sz == 4);
3350      }
3351      *p++ = clearWBit (
3352             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
3353                                 vregEnc3210(i->Ain.SseUComIS.srcR) ));
3354      *p++ = 0x0F;
3355      *p++ = 0x2E;
3356      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
3357                               vregEnc3210(i->Ain.SseUComIS.srcR) );
3358      /* pushfq */
3359      *p++ = 0x9C;
3360      /* popq %dst */
3361      *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
3362      *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
3363      goto done;
3364
3365   case Ain_SseSI2SF:
3366      /* cvssi2s[sd] %src, %dst */
3367      rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
3368                                i->Ain.SseSI2SF.src );
3369      *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3370      *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3371      *p++ = 0x0F;
3372      *p++ = 0x2A;
3373      p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
3374                                i->Ain.SseSI2SF.src );
3375      goto done;
3376
3377   case Ain_SseSF2SI:
3378      /* cvss[sd]2si %src, %dst */
3379      rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
3380                                vregEnc3210(i->Ain.SseSF2SI.src) );
3381      *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3382      *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3383      *p++ = 0x0F;
3384      *p++ = 0x2D;
3385      p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
3386                                vregEnc3210(i->Ain.SseSF2SI.src) );
3387      goto done;
3388
3389   case Ain_SseSDSS:
3390      /* cvtsd2ss/cvtss2sd %src, %dst */
3391      *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3392      *p++ = clearWBit(
3393              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
3394                                  vregEnc3210(i->Ain.SseSDSS.src) ));
3395      *p++ = 0x0F;
3396      *p++ = 0x5A;
3397      p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
3398                                vregEnc3210(i->Ain.SseSDSS.src) );
3399      goto done;
3400
3401   case Ain_SseLdSt:
3402      if (i->Ain.SseLdSt.sz == 8) {
3403         *p++ = 0xF2;
3404      } else
3405      if (i->Ain.SseLdSt.sz == 4) {
3406         *p++ = 0xF3;
3407      } else
3408      if (i->Ain.SseLdSt.sz != 16) {
3409         vassert(0);
3410      }
3411      *p++ = clearWBit(
3412             rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
3413                            i->Ain.SseLdSt.addr));
3414      *p++ = 0x0F;
3415      *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3416      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
3417                           i->Ain.SseLdSt.addr);
3418      goto done;
3419
3420   case Ain_SseLdzLO:
3421      vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3422      /* movs[sd] amode, %xmm-dst */
3423      *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3424      *p++ = clearWBit(
3425             rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
3426                            i->Ain.SseLdzLO.addr));
3427      *p++ = 0x0F;
3428      *p++ = 0x10;
3429      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
3430                           i->Ain.SseLdzLO.addr);
3431      goto done;
3432
3433   case Ain_Sse32Fx4:
3434      xtra = 0;
3435      *p++ = clearWBit(
3436             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
3437                                 vregEnc3210(i->Ain.Sse32Fx4.src) ));
3438      *p++ = 0x0F;
3439      switch (i->Ain.Sse32Fx4.op) {
3440         case Asse_ADDF:   *p++ = 0x58; break;
3441         case Asse_DIVF:   *p++ = 0x5E; break;
3442         case Asse_MAXF:   *p++ = 0x5F; break;
3443         case Asse_MINF:   *p++ = 0x5D; break;
3444         case Asse_MULF:   *p++ = 0x59; break;
3445         case Asse_RCPF:   *p++ = 0x53; break;
3446         case Asse_RSQRTF: *p++ = 0x52; break;
3447         case Asse_SQRTF:  *p++ = 0x51; break;
3448         case Asse_SUBF:   *p++ = 0x5C; break;
3449         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3450         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3451         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3452         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3453         default: goto bad;
3454      }
3455      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32Fx4.dst),
3456                               vregEnc3210(i->Ain.Sse32Fx4.src) );
3457      if (xtra & 0x100)
3458         *p++ = toUChar(xtra & 0xFF);
3459      goto done;
3460
3461   case Ain_Sse64Fx2:
3462      xtra = 0;
3463      *p++ = 0x66;
3464      *p++ = clearWBit(
3465             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
3466                                 vregEnc3210(i->Ain.Sse64Fx2.src) ));
3467      *p++ = 0x0F;
3468      switch (i->Ain.Sse64Fx2.op) {
3469         case Asse_ADDF:   *p++ = 0x58; break;
3470         case Asse_DIVF:   *p++ = 0x5E; break;
3471         case Asse_MAXF:   *p++ = 0x5F; break;
3472         case Asse_MINF:   *p++ = 0x5D; break;
3473         case Asse_MULF:   *p++ = 0x59; break;
3474         case Asse_SQRTF:  *p++ = 0x51; break;
3475         case Asse_SUBF:   *p++ = 0x5C; break;
3476         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3477         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3478         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3479         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3480         default: goto bad;
3481      }
3482      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
3483                               vregEnc3210(i->Ain.Sse64Fx2.src) );
3484      if (xtra & 0x100)
3485         *p++ = toUChar(xtra & 0xFF);
3486      goto done;
3487
3488   case Ain_Sse32FLo:
3489      xtra = 0;
3490      *p++ = 0xF3;
3491      *p++ = clearWBit(
3492             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
3493                                 vregEnc3210(i->Ain.Sse32FLo.src) ));
3494      *p++ = 0x0F;
3495      switch (i->Ain.Sse32FLo.op) {
3496         case Asse_ADDF:   *p++ = 0x58; break;
3497         case Asse_DIVF:   *p++ = 0x5E; break;
3498         case Asse_MAXF:   *p++ = 0x5F; break;
3499         case Asse_MINF:   *p++ = 0x5D; break;
3500         case Asse_MULF:   *p++ = 0x59; break;
3501         case Asse_RCPF:   *p++ = 0x53; break;
3502         case Asse_RSQRTF: *p++ = 0x52; break;
3503         case Asse_SQRTF:  *p++ = 0x51; break;
3504         case Asse_SUBF:   *p++ = 0x5C; break;
3505         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3506         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3507         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3508         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3509         default: goto bad;
3510      }
3511      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
3512                               vregEnc3210(i->Ain.Sse32FLo.src) );
3513      if (xtra & 0x100)
3514         *p++ = toUChar(xtra & 0xFF);
3515      goto done;
3516
3517   case Ain_Sse64FLo:
3518      xtra = 0;
3519      *p++ = 0xF2;
3520      *p++ = clearWBit(
3521             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
3522                                 vregEnc3210(i->Ain.Sse64FLo.src) ));
3523      *p++ = 0x0F;
3524      switch (i->Ain.Sse64FLo.op) {
3525         case Asse_ADDF:   *p++ = 0x58; break;
3526         case Asse_DIVF:   *p++ = 0x5E; break;
3527         case Asse_MAXF:   *p++ = 0x5F; break;
3528         case Asse_MINF:   *p++ = 0x5D; break;
3529         case Asse_MULF:   *p++ = 0x59; break;
3530         case Asse_SQRTF:  *p++ = 0x51; break;
3531         case Asse_SUBF:   *p++ = 0x5C; break;
3532         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3533         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3534         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3535         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3536         default: goto bad;
3537      }
3538      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
3539                               vregEnc3210(i->Ain.Sse64FLo.src) );
3540      if (xtra & 0x100)
3541         *p++ = toUChar(xtra & 0xFF);
3542      goto done;
3543
3544   case Ain_SseReRg:
3545#     define XX(_n) *p++ = (_n)
3546
3547      rex = clearWBit(
3548            rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
3549                                vregEnc3210(i->Ain.SseReRg.src) ));
3550
3551      switch (i->Ain.SseReRg.op) {
3552         case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3553         case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
3554         case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
3555         case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
3556         case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
3557         case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3558         case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3559         case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3560         case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3561         case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3562         case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3563         case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3564         case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3565         case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3566         case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3567         case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3568         case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3569         case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3570         case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3571         case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3572         case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3573         case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3574         case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3575         case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3576         case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3577         case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3578         case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3579         case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3580         case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3581         case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3582         case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3583         case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3584         case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
3585         case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
3586         case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
3587         case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
3588         case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
3589         case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
3590         case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
3591         case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
3592         case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
3593         case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
3594         case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
3595         case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
3596         case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
3597         case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
3598         case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
3599         case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
3600         case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
3601         case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
3602         case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
3603         case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
3604         case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
3605         case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
3606         case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
3607         default: goto bad;
3608      }
3609      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
3610                               vregEnc3210(i->Ain.SseReRg.src) );
3611#     undef XX
3612      goto done;
3613
3614   case Ain_SseCMov:
3615      /* jmp fwds if !condition */
3616      *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
3617      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3618      ptmp = p;
3619
3620      /* movaps %src, %dst */
3621      *p++ = clearWBit(
3622             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
3623                                 vregEnc3210(i->Ain.SseCMov.src) ));
3624      *p++ = 0x0F;
3625      *p++ = 0x28;
3626      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
3627                               vregEnc3210(i->Ain.SseCMov.src) );
3628
3629      /* Fill in the jump offset. */
3630      *(ptmp-1) = toUChar(p - ptmp);
3631      goto done;
3632
3633   case Ain_SseShuf:
3634      *p++ = 0x66;
3635      *p++ = clearWBit(
3636             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
3637                                 vregEnc3210(i->Ain.SseShuf.src) ));
3638      *p++ = 0x0F;
3639      *p++ = 0x70;
3640      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
3641                               vregEnc3210(i->Ain.SseShuf.src) );
3642      *p++ = (UChar)(i->Ain.SseShuf.order);
3643      goto done;
3644
3645   //uu case Ain_AvxLdSt: {
3646   //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
3647   //uu                           i->Ain.AvxLdSt.addr );
3648   //uu    p = emitVexPrefix(p, vex);
3649   //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
3650   //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
3651   //uu      goto done;
3652   //uu }
3653
3654   case Ain_EvCheck: {
3655      /* We generate:
3656            (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
3657            (2 bytes)  jns  nofail     expected taken
3658            (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
3659            nofail:
3660      */
3661      /* This is heavily asserted re instruction lengths.  It needs to
3662         be.  If we get given unexpected forms of .amCounter or
3663         .amFailAddr -- basically, anything that's not of the form
3664         uimm7(%rbp) -- they are likely to fail. */
3665      /* Note also that after the decl we must be very careful not to
3666         read the carry flag, else we get a partial flags stall.
3667         js/jns avoids that, though. */
3668      UChar* p0 = p;
3669      /* ---  decl 8(%rbp) --- */
3670      /* Need to compute the REX byte for the decl in order to prove
3671         that we don't need it, since this is a 32-bit inc and all
3672         registers involved in the amode are < r8.  "1" because
3673         there's no register in this encoding; instead the register
3674         field is used as a sub opcode.  The encoding for "decl r/m32"
3675         is FF /1, hence the "1". */
3676      rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
3677      if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
3678      *p++ = 0xFF;
3679      p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
3680      vassert(p - p0 == 3);
3681      /* --- jns nofail --- */
3682      *p++ = 0x79;
3683      *p++ = 0x03; /* need to check this 0x03 after the next insn */
3684      vassert(p - p0 == 5);
3685      /* --- jmp* 0(%rbp) --- */
3686      /* Once again, verify we don't need REX.  The encoding is FF /4.
3687         We don't need REX.W since by default FF /4 in 64-bit mode
3688         implies a 64 bit load. */
3689      rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
3690      if (rex != 0x40) goto bad;
3691      *p++ = 0xFF;
3692      p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
3693      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3694      /* And crosscheck .. */
3695      vassert(evCheckSzB_AMD64() == 8);
3696      goto done;
3697   }
3698
3699   case Ain_ProfInc: {
3700      /* We generate   movabsq $0, %r11
3701                       incq (%r11)
3702         in the expectation that a later call to LibVEX_patchProfCtr
3703         will be used to fill in the immediate field once the right
3704         value is known.
3705         49 BB 00 00 00 00 00 00 00 00
3706         49 FF 03
3707      */
3708      *p++ = 0x49; *p++ = 0xBB;
3709      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3710      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3711      *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
3712      /* Tell the caller .. */
3713      vassert(!(*is_profInc));
3714      *is_profInc = True;
3715      goto done;
3716   }
3717
3718   default:
3719      goto bad;
3720   }
3721
3722  bad:
3723   ppAMD64Instr(i, mode64);
3724   vpanic("emit_AMD64Instr");
3725   /*NOTREACHED*/
3726
3727  done:
3728   vassert(p - &buf[0] <= 32);
3729   return p - &buf[0];
3730}
3731
3732
3733/* How big is an event check?  See case for Ain_EvCheck in
3734   emit_AMD64Instr just above.  That crosschecks what this returns, so
3735   we can tell if we're inconsistent. */
3736Int evCheckSzB_AMD64 (void)
3737{
3738   return 8;
3739}
3740
3741
3742/* NB: what goes on here has to be very closely coordinated with the
3743   emitInstr case for XDirect, above. */
3744VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
3745                                   void* place_to_chain,
3746                                   const void* disp_cp_chain_me_EXPECTED,
3747                                   const void* place_to_jump_to )
3748{
3749   vassert(endness_host == VexEndnessLE);
3750
3751   /* What we're expecting to see is:
3752        movabsq $disp_cp_chain_me_EXPECTED, %r11
3753        call *%r11
3754      viz
3755        49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
3756        41 FF D3
3757   */
3758   UChar* p = (UChar*)place_to_chain;
3759   vassert(p[0] == 0x49);
3760   vassert(p[1] == 0xBB);
3761   vassert(*(Addr*)(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
3762   vassert(p[10] == 0x41);
3763   vassert(p[11] == 0xFF);
3764   vassert(p[12] == 0xD3);
3765   /* And what we want to change it to is either:
3766        (general case):
3767          movabsq $place_to_jump_to, %r11
3768          jmpq *%r11
3769        viz
3770          49 BB <8 bytes value == place_to_jump_to>
3771          41 FF E3
3772        So it's the same length (convenient, huh) and we don't
3773        need to change all the bits.
3774      ---OR---
3775        in the case where the displacement falls within 32 bits
3776          jmpq disp32   where disp32 is relative to the next insn
3777          ud2; ud2; ud2; ud2
3778        viz
3779          E9 <4 bytes == disp32>
3780          0F 0B 0F 0B 0F 0B 0F 0B
3781
3782      In both cases the replacement has the same length as the original.
3783      To remain sane & verifiable,
3784      (1) limit the displacement for the short form to
3785          (say) +/- one billion, so as to avoid wraparound
3786          off-by-ones
3787      (2) even if the short form is applicable, once every (say)
3788          1024 times use the long form anyway, so as to maintain
3789          verifiability
3790   */
3791   /* This is the delta we need to put into a JMP d32 insn.  It's
3792      relative to the start of the next insn, hence the -5.  */
3793   Long delta   = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
3794   Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
3795
3796   static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
3797   if (shortOK) {
3798      shortCTR++; // thread safety bleh
3799      if (0 == (shortCTR & 0x3FF)) {
3800         shortOK = False;
3801         if (0)
3802            vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
3803                       "using long jmp\n", shortCTR);
3804      }
3805   }
3806
3807   /* And make the modifications. */
3808   if (shortOK) {
3809      p[0]  = 0xE9;
3810      p[1]  = (delta >> 0) & 0xFF;
3811      p[2]  = (delta >> 8) & 0xFF;
3812      p[3]  = (delta >> 16) & 0xFF;
3813      p[4]  = (delta >> 24) & 0xFF;
3814      p[5]  = 0x0F; p[6]  = 0x0B;
3815      p[7]  = 0x0F; p[8]  = 0x0B;
3816      p[9]  = 0x0F; p[10] = 0x0B;
3817      p[11] = 0x0F; p[12] = 0x0B;
3818      /* sanity check on the delta -- top 32 are all 0 or all 1 */
3819      delta >>= 32;
3820      vassert(delta == 0LL || delta == -1LL);
3821   } else {
3822      /* Minimal modifications from the starting sequence. */
3823     *(Addr*)(&p[2]) = (Addr)place_to_jump_to;
3824      p[12] = 0xE3;
3825   }
3826   VexInvalRange vir = { (HWord)place_to_chain, 13 };
3827   return vir;
3828}
3829
3830
3831/* NB: what goes on here has to be very closely coordinated with the
3832   emitInstr case for XDirect, above. */
3833VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
3834                                     void* place_to_unchain,
3835                                     const void* place_to_jump_to_EXPECTED,
3836                                     const void* disp_cp_chain_me )
3837{
3838   vassert(endness_host == VexEndnessLE);
3839
3840   /* What we're expecting to see is either:
3841        (general case)
3842          movabsq $place_to_jump_to_EXPECTED, %r11
3843          jmpq *%r11
3844        viz
3845          49 BB <8 bytes value == place_to_jump_to_EXPECTED>
3846          41 FF E3
3847      ---OR---
3848        in the case where the displacement falls within 32 bits
3849          jmpq d32
3850          ud2; ud2; ud2; ud2
3851        viz
3852          E9 <4 bytes == disp32>
3853          0F 0B 0F 0B 0F 0B 0F 0B
3854   */
3855   UChar* p     = (UChar*)place_to_unchain;
3856   Bool   valid = False;
3857   if (p[0] == 0x49 && p[1] == 0xBB
3858       && *(Addr*)(&p[2]) == (Addr)place_to_jump_to_EXPECTED
3859       && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
3860      /* it's the long form */
3861      valid = True;
3862   }
3863   else
3864   if (p[0] == 0xE9
3865       && p[5]  == 0x0F && p[6]  == 0x0B
3866       && p[7]  == 0x0F && p[8]  == 0x0B
3867       && p[9]  == 0x0F && p[10] == 0x0B
3868       && p[11] == 0x0F && p[12] == 0x0B) {
3869      /* It's the short form.  Check the offset is right. */
3870      Int  s32 = *(Int*)(&p[1]);
3871      Long s64 = (Long)s32;
3872      if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
3873         valid = True;
3874         if (0)
3875            vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
3876      }
3877   }
3878   vassert(valid);
3879   /* And what we want to change it to is:
3880        movabsq $disp_cp_chain_me, %r11
3881        call *%r11
3882      viz
3883        49 BB <8 bytes value == disp_cp_chain_me>
3884        41 FF D3
3885      So it's the same length (convenient, huh).
3886   */
3887   p[0] = 0x49;
3888   p[1] = 0xBB;
3889   *(Addr*)(&p[2]) = (Addr)disp_cp_chain_me;
3890   p[10] = 0x41;
3891   p[11] = 0xFF;
3892   p[12] = 0xD3;
3893   VexInvalRange vir = { (HWord)place_to_unchain, 13 };
3894   return vir;
3895}
3896
3897
3898/* Patch the counter address into a profile inc point, as previously
3899   created by the Ain_ProfInc case for emit_AMD64Instr. */
3900VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
3901                                   void*  place_to_patch,
3902                                   const ULong* location_of_counter )
3903{
3904   vassert(endness_host == VexEndnessLE);
3905   vassert(sizeof(ULong*) == 8);
3906   UChar* p = (UChar*)place_to_patch;
3907   vassert(p[0] == 0x49);
3908   vassert(p[1] == 0xBB);
3909   vassert(p[2] == 0x00);
3910   vassert(p[3] == 0x00);
3911   vassert(p[4] == 0x00);
3912   vassert(p[5] == 0x00);
3913   vassert(p[6] == 0x00);
3914   vassert(p[7] == 0x00);
3915   vassert(p[8] == 0x00);
3916   vassert(p[9] == 0x00);
3917   vassert(p[10] == 0x49);
3918   vassert(p[11] == 0xFF);
3919   vassert(p[12] == 0x03);
3920   ULong imm64 = (ULong)(Addr)location_of_counter;
3921   p[2] = imm64 & 0xFF; imm64 >>= 8;
3922   p[3] = imm64 & 0xFF; imm64 >>= 8;
3923   p[4] = imm64 & 0xFF; imm64 >>= 8;
3924   p[5] = imm64 & 0xFF; imm64 >>= 8;
3925   p[6] = imm64 & 0xFF; imm64 >>= 8;
3926   p[7] = imm64 & 0xFF; imm64 >>= 8;
3927   p[8] = imm64 & 0xFF; imm64 >>= 8;
3928   p[9] = imm64 & 0xFF; imm64 >>= 8;
3929   VexInvalRange vir = { (HWord)place_to_patch, 13 };
3930   return vir;
3931}
3932
3933
3934/*---------------------------------------------------------------*/
3935/*--- end                                   host_amd64_defs.c ---*/
3936/*---------------------------------------------------------------*/
3937