
/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_defs.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex.h"
#include "libvex_trc_values.h"

#include "main_util.h"
#include "host_generic_regs.h"
#include "host_amd64_defs.h"


/* --------- Registers. --------- */

const RRegUniverse* getRRegUniverse_AMD64 ( void )
{
   /* The real-register universe is a big constant, so we just want to
      initialise it once. */
   static RRegUniverse rRegUniverse_AMD64;
   static Bool         rRegUniverse_AMD64_initted = False;

   /* Handy shorthand, nothing more */
   RRegUniverse* ru = &rRegUniverse_AMD64;

   /* This isn't thread-safe.  Sigh. */
   if (LIKELY(rRegUniverse_AMD64_initted))
      return ru;

   RRegUniverse__init(ru);

   /* Add the registers.  The initial segment of this array must be
      those available for allocation by reg-alloc, and those that
      follow are not available for allocation. */
   ru->regs[ru->size++] = hregAMD64_RSI();
   ru->regs[ru->size++] = hregAMD64_RDI();
   ru->regs[ru->size++] = hregAMD64_R8();
   ru->regs[ru->size++] = hregAMD64_R9();
   ru->regs[ru->size++] = hregAMD64_R12();
   ru->regs[ru->size++] = hregAMD64_R13();
   ru->regs[ru->size++] = hregAMD64_R14();
   ru->regs[ru->size++] = hregAMD64_R15();
   ru->regs[ru->size++] = hregAMD64_RBX();
   ru->regs[ru->size++] = hregAMD64_XMM3();
   ru->regs[ru->size++] = hregAMD64_XMM4();
   ru->regs[ru->size++] = hregAMD64_XMM5();
   ru->regs[ru->size++] = hregAMD64_XMM6();
   ru->regs[ru->size++] = hregAMD64_XMM7();
   ru->regs[ru->size++] = hregAMD64_XMM8();
   ru->regs[ru->size++] = hregAMD64_XMM9();
   ru->regs[ru->size++] = hregAMD64_XMM10();
   ru->regs[ru->size++] = hregAMD64_XMM11();
   ru->regs[ru->size++] = hregAMD64_XMM12();
   ru->regs[ru->size++] = hregAMD64_R10();
   ru->allocable = ru->size;
   /* And other regs, not available to the allocator. */
   ru->regs[ru->size++] = hregAMD64_RAX();
   ru->regs[ru->size++] = hregAMD64_RCX();
   ru->regs[ru->size++] = hregAMD64_RDX();
   ru->regs[ru->size++] = hregAMD64_RSP();
   ru->regs[ru->size++] = hregAMD64_RBP();
   ru->regs[ru->size++] = hregAMD64_R11();
   ru->regs[ru->size++] = hregAMD64_XMM0();
   ru->regs[ru->size++] = hregAMD64_XMM1();

   rRegUniverse_AMD64_initted = True;

   RRegUniverse__check_is_sane(ru);
   return ru;
}
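
/* Illustrative usage sketch (editorial, not from the original comments):
   callers obtain the universe once and treat regs[0 .. allocable-1] as
   the pool the register allocator may hand out, while
   regs[allocable .. size-1] are fixed-purpose registers that can still
   appear in generated instructions.

      const RRegUniverse* ru = getRRegUniverse_AMD64();
      for (UInt k = 0; k < ru->allocable; k++) {
         HReg r = ru->regs[k];   // a real register available to reg-alloc
         // ... feed r to the allocator ...
      }
*/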


void ppHRegAMD64 ( HReg reg )
{
   Int r;
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%s", ireg64_names[r]);
         return;
      case HRcVec128:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%%xmm%d", r);
         return;
      default:
         vpanic("ppHRegAMD64");
   }
}

static void ppHRegAMD64_lo32 ( HReg reg )
{
   Int r;
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
         "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      vex_printf("d");
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%s", ireg32_names[r]);
         return;
      default:
         vpanic("ppHRegAMD64_lo32: invalid regclass");
   }
}


/* --------- Condition codes, Intel encoding. --------- */

const HChar* showAMD64CondCode ( AMD64CondCode cond )
{
   switch (cond) {
      case Acc_O:      return "o";
      case Acc_NO:     return "no";
      case Acc_B:      return "b";
      case Acc_NB:     return "nb";
      case Acc_Z:      return "z";
      case Acc_NZ:     return "nz";
      case Acc_BE:     return "be";
      case Acc_NBE:    return "nbe";
      case Acc_S:      return "s";
      case Acc_NS:     return "ns";
      case Acc_P:      return "p";
      case Acc_NP:     return "np";
      case Acc_L:      return "l";
      case Acc_NL:     return "nl";
      case Acc_LE:     return "le";
      case Acc_NLE:    return "nle";
      case Acc_ALWAYS: return "ALWAYS";
      default: vpanic("ppAMD64CondCode");
   }
}


/* --------- AMD64AMode: memory address expressions. --------- */

AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag        = Aam_IR;
   am->Aam.IR.imm = imm32;
   am->Aam.IR.reg = reg;
   return am;
}
AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag = Aam_IRRS;
   am->Aam.IRRS.imm   = imm32;
   am->Aam.IRRS.base  = base;
   am->Aam.IRRS.index = indEx;
   am->Aam.IRRS.shift = shift;
   vassert(shift >= 0 && shift <= 3);
   return am;
}
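
/* Illustrative examples (matching the pretty-printer below):
   AMD64AMode_IR(0x10, hregAMD64_RBP()) denotes the address %rbp + 0x10
   and is printed as "0x10(%rbp)"; AMD64AMode_IRRS(0x4, rBase, rIndex, 3)
   denotes base + index*8 + 0x4 and, with base=%rax and index=%rbx, is
   printed as "0x4(%rax,%rbx,8)". */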

void ppAMD64AMode ( AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         if (am->Aam.IR.imm == 0)
            vex_printf("(");
         else
            vex_printf("0x%x(", am->Aam.IR.imm);
         ppHRegAMD64(am->Aam.IR.reg);
         vex_printf(")");
         return;
      case Aam_IRRS:
         vex_printf("0x%x(", am->Aam.IRRS.imm);
         ppHRegAMD64(am->Aam.IRRS.base);
         vex_printf(",");
         ppHRegAMD64(am->Aam.IRRS.index);
         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
         return;
      default:
         vpanic("ppAMD64AMode");
   }
}

static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         addHRegUse(u, HRmRead, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
         return;
      default:
         vpanic("addRegUsage_AMD64AMode");
   }
}

static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
         return;
      default:
         vpanic("mapRegs_AMD64AMode");
   }
}

/* --------- Operand, which can be reg, immediate or memory. --------- */

AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
   AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag            = Armi_Imm;
   op->Armi.Imm.imm32 = imm32;
   return op;
}
AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
   AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag          = Armi_Reg;
   op->Armi.Reg.reg = reg;
   return op;
}
AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
   AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag         = Armi_Mem;
   op->Armi.Mem.am = am;
   return op;
}

static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
   switch (op->tag) {
      case Armi_Imm:
         vex_printf("$0x%x", op->Armi.Imm.imm32);
         return;
      case Armi_Reg:
         if (lo32)
            ppHRegAMD64_lo32(op->Armi.Reg.reg);
         else
            ppHRegAMD64(op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         ppAMD64AMode(op->Armi.Mem.am);
         return;
      default:
         vpanic("ppAMD64RMI");
   }
}
void ppAMD64RMI ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, False/*!lo32*/);
}
void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, True/*lo32*/);
}

/* An AMD64RMI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
         return;
      default:
         vpanic("addRegUsage_AMD64RMI");
   }
}

static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
         return;
      default:
         vpanic("mapRegs_AMD64RMI");
   }
}


/* --------- Operand, which can be reg or immediate only. --------- */

AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
   AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag           = Ari_Imm;
   op->Ari.Imm.imm32 = imm32;
   return op;
}
AMD64RI* AMD64RI_Reg ( HReg reg ) {
   AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag         = Ari_Reg;
   op->Ari.Reg.reg = reg;
   return op;
}

void ppAMD64RI ( AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         vex_printf("$0x%x", op->Ari.Imm.imm32);
         return;
      case Ari_Reg:
         ppHRegAMD64(op->Ari.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RI");
   }
}

/* An AMD64RI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RI");
   }
}

static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RI");
   }
}


/* --------- Operand, which can be reg or memory only. --------- */

AMD64RM* AMD64RM_Reg ( HReg reg ) {
   AMD64RM* op     = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag         = Arm_Reg;
   op->Arm.Reg.reg = reg;
   return op;
}
AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
   AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag        = Arm_Mem;
   op->Arm.Mem.am = am;
   return op;
}

void ppAMD64RM ( AMD64RM* op ) {
   switch (op->tag) {
      case Arm_Mem:
         ppAMD64AMode(op->Arm.Mem.am);
         return;
      case Arm_Reg:
         ppHRegAMD64(op->Arm.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RM");
   }
}

/* Because an AMD64RM can be both a source or destination operand, we
   have to supply a mode -- pertaining to the operand as a whole --
   indicating how it's being used. */
static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
   switch (op->tag) {
      case Arm_Mem:
         /* Memory is read, written or modified.  So we just want to
            know the regs read by the amode. */
         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         /* reg is read, written or modified.  Add it in the
            appropriate way. */
         addHRegUse(u, mode, op->Arm.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RM");
   }
}

static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
{
   switch (op->tag) {
      case Arm_Mem:
         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RM");
   }
}


/* --------- Instructions. --------- */

static const HChar* showAMD64ScalarSz ( Int sz ) {
   switch (sz) {
      case 2: return "w";
      case 4: return "l";
      case 8: return "q";
      default: vpanic("showAMD64ScalarSz");
   }
}

const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
   switch (op) {
      case Aun_NOT: return "not";
      case Aun_NEG: return "neg";
      default: vpanic("showAMD64UnaryOp");
   }
}

const HChar* showAMD64AluOp ( AMD64AluOp op ) {
   switch (op) {
      case Aalu_MOV:  return "mov";
      case Aalu_CMP:  return "cmp";
      case Aalu_ADD:  return "add";
      case Aalu_SUB:  return "sub";
      case Aalu_ADC:  return "adc";
      case Aalu_SBB:  return "sbb";
      case Aalu_AND:  return "and";
      case Aalu_OR:   return "or";
      case Aalu_XOR:  return "xor";
      case Aalu_MUL:  return "imul";
      default: vpanic("showAMD64AluOp");
   }
}

const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
   switch (op) {
      case Ash_SHL: return "shl";
      case Ash_SHR: return "shr";
      case Ash_SAR: return "sar";
      default: vpanic("showAMD64ShiftOp");
   }
}

const HChar* showA87FpOp ( A87FpOp op ) {
   switch (op) {
      case Afp_SCALE:  return "scale";
      case Afp_ATAN:   return "atan";
      case Afp_YL2X:   return "yl2x";
      case Afp_YL2XP1: return "yl2xp1";
      case Afp_PREM:   return "prem";
      case Afp_PREM1:  return "prem1";
      case Afp_SQRT:   return "sqrt";
      case Afp_SIN:    return "sin";
      case Afp_COS:    return "cos";
      case Afp_TAN:    return "tan";
      case Afp_ROUND:  return "round";
      case Afp_2XM1:   return "2xm1";
      default: vpanic("showA87FpOp");
   }
}

const HChar* showAMD64SseOp ( AMD64SseOp op ) {
   switch (op) {
      case Asse_MOV:      return "movups";
      case Asse_ADDF:     return "add";
      case Asse_SUBF:     return "sub";
      case Asse_MULF:     return "mul";
      case Asse_DIVF:     return "div";
      case Asse_MAXF:     return "max";
      case Asse_MINF:     return "min";
      case Asse_CMPEQF:   return "cmpFeq";
      case Asse_CMPLTF:   return "cmpFlt";
      case Asse_CMPLEF:   return "cmpFle";
      case Asse_CMPUNF:   return "cmpFun";
      case Asse_RCPF:     return "rcp";
      case Asse_RSQRTF:   return "rsqrt";
      case Asse_SQRTF:    return "sqrt";
      case Asse_AND:      return "and";
      case Asse_OR:       return "or";
      case Asse_XOR:      return "xor";
      case Asse_ANDN:     return "andn";
      case Asse_ADD8:     return "paddb";
      case Asse_ADD16:    return "paddw";
      case Asse_ADD32:    return "paddd";
      case Asse_ADD64:    return "paddq";
      case Asse_QADD8U:   return "paddusb";
      case Asse_QADD16U:  return "paddusw";
      case Asse_QADD8S:   return "paddsb";
      case Asse_QADD16S:  return "paddsw";
      case Asse_SUB8:     return "psubb";
      case Asse_SUB16:    return "psubw";
      case Asse_SUB32:    return "psubd";
      case Asse_SUB64:    return "psubq";
      case Asse_QSUB8U:   return "psubusb";
      case Asse_QSUB16U:  return "psubusw";
      case Asse_QSUB8S:   return "psubsb";
      case Asse_QSUB16S:  return "psubsw";
      case Asse_MUL16:    return "pmullw";
      case Asse_MULHI16U: return "pmulhuw";
      case Asse_MULHI16S: return "pmulhw";
      case Asse_AVG8U:    return "pavgb";
      case Asse_AVG16U:   return "pavgw";
      case Asse_MAX16S:   return "pmaxw";
      case Asse_MAX8U:    return "pmaxub";
      case Asse_MIN16S:   return "pminw";
      case Asse_MIN8U:    return "pminub";
      case Asse_CMPEQ8:   return "pcmpeqb";
      case Asse_CMPEQ16:  return "pcmpeqw";
      case Asse_CMPEQ32:  return "pcmpeqd";
      case Asse_CMPGT8S:  return "pcmpgtb";
      case Asse_CMPGT16S: return "pcmpgtw";
      case Asse_CMPGT32S: return "pcmpgtd";
      case Asse_SHL16:    return "psllw";
      case Asse_SHL32:    return "pslld";
      case Asse_SHL64:    return "psllq";
      case Asse_SHR16:    return "psrlw";
      case Asse_SHR32:    return "psrld";
      case Asse_SHR64:    return "psrlq";
      case Asse_SAR16:    return "psraw";
      case Asse_SAR32:    return "psrad";
      case Asse_PACKSSD:  return "packssdw";
      case Asse_PACKSSW:  return "packsswb";
      case Asse_PACKUSW:  return "packuswb";
      case Asse_UNPCKHB:  return "punpckhb";
      case Asse_UNPCKHW:  return "punpckhw";
      case Asse_UNPCKHD:  return "punpckhd";
      case Asse_UNPCKHQ:  return "punpckhq";
      case Asse_UNPCKLB:  return "punpcklb";
      case Asse_UNPCKLW:  return "punpcklw";
      case Asse_UNPCKLD:  return "punpckld";
      case Asse_UNPCKLQ:  return "punpcklq";
      default: vpanic("showAMD64SseOp");
   }
}

AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_Imm64;
   i->Ain.Imm64.imm64 = imm64;
   i->Ain.Imm64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Alu64R;
   i->Ain.Alu64R.op  = op;
   i->Ain.Alu64R.src = src;
   i->Ain.Alu64R.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Alu64M;
   i->Ain.Alu64M.op  = op;
   i->Ain.Alu64M.src = src;
   i->Ain.Alu64M.dst = dst;
   vassert(op != Aalu_MUL);
   return i;
}
AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
   AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag          = Ain_Sh64;
   i->Ain.Sh64.op  = op;
   i->Ain.Sh64.src = src;
   i->Ain.Sh64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Test64;
   i->Ain.Test64.imm32 = imm32;
   i->Ain.Test64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_Unary64;
   i->Ain.Unary64.op  = op;
   i->Ain.Unary64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_Lea64;
   i->Ain.Lea64.am    = am;
   i->Ain.Lea64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Alu32R;
   i->Ain.Alu32R.op  = op;
   i->Ain.Alu32R.src = src;
   i->Ain.Alu32R.dst = dst;
   switch (op) {
      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
      default: vassert(0);
   }
   return i;
}
AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_MulL;
   i->Ain.MulL.syned = syned;
   i->Ain.MulL.src   = src;
   return i;
}
AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Div;
   i->Ain.Div.syned  = syned;
   i->Ain.Div.sz     = sz;
   i->Ain.Div.src    = src;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
   AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag          = Ain_Push;
   i->Ain.Push.src = src;
   return i;
}
AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
                              RetLoc rloc ) {
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_Call;
   i->Ain.Call.cond     = cond;
   i->Ain.Call.target   = target;
   i->Ain.Call.regparms = regparms;
   i->Ain.Call.rloc     = rloc;
   vassert(regparms >= 0 && regparms <= 6);
   vassert(is_sane_RetLoc(rloc));
   return i;
}
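
/* Note (editorial, inferred from getRegUsage_AMD64Instr below): regparms
   gives the number of integer argument registers the callee will read;
   a value of N marks the first N of %rdi, %rsi, %rdx, %rcx, %r8, %r9 --
   the SysV AMD64 integer-argument order -- as read by the call. */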

AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
                                 AMD64CondCode cond, Bool toFastEP ) {
   AMD64Instr* i           = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                  = Ain_XDirect;
   i->Ain.XDirect.dstGA    = dstGA;
   i->Ain.XDirect.amRIP    = amRIP;
   i->Ain.XDirect.cond     = cond;
   i->Ain.XDirect.toFastEP = toFastEP;
   return i;
}
AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
                                AMD64CondCode cond ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_XIndir;
   i->Ain.XIndir.dstGA = dstGA;
   i->Ain.XIndir.amRIP = amRIP;
   i->Ain.XIndir.cond  = cond;
   return i;
}
AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
                                   AMD64CondCode cond, IRJumpKind jk ) {
   AMD64Instr* i          = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                 = Ain_XAssisted;
   i->Ain.XAssisted.dstGA = dstGA;
   i->Ain.XAssisted.amRIP = amRIP;
   i->Ain.XAssisted.cond  = cond;
   i->Ain.XAssisted.jk    = jk;
   return i;
}

AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_CMov64;
   i->Ain.CMov64.cond = cond;
   i->Ain.CMov64.src  = src;
   i->Ain.CMov64.dst  = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
                               AMD64AMode* addr, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_CLoad;
   i->Ain.CLoad.cond = cond;
   i->Ain.CLoad.szB  = szB;
   i->Ain.CLoad.addr = addr;
   i->Ain.CLoad.dst  = dst;
   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
   return i;
}
AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
                                HReg src, AMD64AMode* addr ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_CStore;
   i->Ain.CStore.cond = cond;
   i->Ain.CStore.szB  = szB;
   i->Ain.CStore.src  = src;
   i->Ain.CStore.addr = addr;
   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
   return i;
}
AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_MovxLQ;
   i->Ain.MovxLQ.syned = syned;
   i->Ain.MovxLQ.src   = src;
   i->Ain.MovxLQ.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
                                AMD64AMode* src, HReg dst ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_LoadEX;
   i->Ain.LoadEX.szSmall = szSmall;
   i->Ain.LoadEX.syned   = syned;
   i->Ain.LoadEX.src     = src;
   i->Ain.LoadEX.dst     = dst;
   vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
   return i;
}
AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
   AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag           = Ain_Store;
   i->Ain.Store.sz  = sz;
   i->Ain.Store.src = src;
   i->Ain.Store.dst = dst;
   vassert(sz == 1 || sz == 2 || sz == 4);
   return i;
}
AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Set64;
   i->Ain.Set64.cond = cond;
   i->Ain.Set64.dst  = dst;
   return i;
}
AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_Bsfr64;
   i->Ain.Bsfr64.isFwds = isFwds;
   i->Ain.Bsfr64.src    = src;
   i->Ain.Bsfr64.dst    = dst;
   return i;
}
AMD64Instr* AMD64Instr_MFence ( void ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag        = Ain_MFence;
   return i;
}
AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag           = Ain_ACAS;
   i->Ain.ACAS.addr = addr;
   i->Ain.ACAS.sz   = sz;
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return i;
}
AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_DACAS;
   i->Ain.DACAS.addr = addr;
   i->Ain.DACAS.sz   = sz;
   vassert(sz == 8 || sz == 4);
   return i;
}
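
/* Note (editorial, reflecting ppAMD64Instr and getRegUsage_AMD64Instr
   below): Ain_ACAS is printed as "lock cmpxchg{b,w,l,q}" and uses %rax
   (expected old value, also receives the observed value) and %rbx (new
   value); Ain_DACAS is the double-width "lock cmpxchg8b/16b", using
   %rdx:%rax as the expected pair and %rcx:%rbx as the new pair. */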

AMD64Instr* AMD64Instr_A87Free ( Int nregs )
{
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_A87Free;
   i->Ain.A87Free.nregs = nregs;
   vassert(nregs >= 1 && nregs <= 7);
   return i;
}
AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
{
   AMD64Instr* i            = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                   = Ain_A87PushPop;
   i->Ain.A87PushPop.addr   = addr;
   i->Ain.A87PushPop.isPush = isPush;
   i->Ain.A87PushPop.szB    = szB;
   vassert(szB == 8 || szB == 4);
   return i;
}
AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
{
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_A87FpOp;
   i->Ain.A87FpOp.op = op;
   return i;
}
AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
{
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_A87LdCW;
   i->Ain.A87LdCW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
{
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_A87StSW;
   i->Ain.A87StSW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_LdMXCSR;
   i->Ain.LdMXCSR.addr   = addr;
   return i;
}
AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseUComIS;
   i->Ain.SseUComIS.sz   = toUChar(sz);
   i->Ain.SseUComIS.srcL = srcL;
   i->Ain.SseUComIS.srcR = srcR;
   i->Ain.SseUComIS.dst  = dst;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_SseSI2SF;
   i->Ain.SseSI2SF.szS = toUChar(szS);
   i->Ain.SseSI2SF.szD = toUChar(szD);
   i->Ain.SseSI2SF.src = src;
   i->Ain.SseSI2SF.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_SseSF2SI;
   i->Ain.SseSF2SI.szS = toUChar(szS);
   i->Ain.SseSF2SI.szD = toUChar(szD);
   i->Ain.SseSF2SI.src = src;
   i->Ain.SseSF2SI.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
{
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseSDSS;
   i->Ain.SseSDSS.from64 = from64;
   i->Ain.SseSDSS.src    = src;
   i->Ain.SseSDSS.dst    = dst;
   return i;
}
AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
                                 HReg reg, AMD64AMode* addr ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseLdSt;
   i->Ain.SseLdSt.isLoad = isLoad;
   i->Ain.SseLdSt.sz     = toUChar(sz);
   i->Ain.SseLdSt.reg    = reg;
   i->Ain.SseLdSt.addr   = addr;
   vassert(sz == 4 || sz == 8 || sz == 16);
   return i;
}
AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
                                   HReg src, AMD64AMode* addr )
{
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseCStore;
   i->Ain.SseCStore.cond = cond;
   i->Ain.SseCStore.src  = src;
   i->Ain.SseCStore.addr = addr;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
                                  AMD64AMode* addr, HReg dst )
{
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_SseCLoad;
   i->Ain.SseCLoad.cond = cond;
   i->Ain.SseCLoad.addr = addr;
   i->Ain.SseCLoad.dst  = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
{
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseLdzLO;
   i->Ain.SseLdzLO.sz    = sz;
   i->Ain.SseLdzLO.reg   = reg;
   i->Ain.SseLdzLO.addr  = addr;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse32Fx4;
   i->Ain.Sse32Fx4.op  = op;
   i->Ain.Sse32Fx4.src = src;
   i->Ain.Sse32Fx4.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse32FLo;
   i->Ain.Sse32FLo.op  = op;
   i->Ain.Sse32FLo.src = src;
   i->Ain.Sse32FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse64Fx2;
   i->Ain.Sse64Fx2.op  = op;
   i->Ain.Sse64Fx2.src = src;
   i->Ain.Sse64Fx2.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse64FLo;
   i->Ain.Sse64FLo.op  = op;
   i->Ain.Sse64FLo.src = src;
   i->Ain.Sse64FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_SseReRg;
   i->Ain.SseReRg.op  = op;
   i->Ain.SseReRg.src = re;
   i->Ain.SseReRg.dst = rg;
   return i;
}
AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_SseCMov;
   i->Ain.SseCMov.cond = cond;
   i->Ain.SseCMov.src  = src;
   i->Ain.SseCMov.dst  = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_SseShuf;
   i->Ain.SseShuf.order = order;
   i->Ain.SseShuf.src   = src;
   i->Ain.SseShuf.dst   = dst;
   vassert(order >= 0 && order <= 0xFF);
   return i;
}
//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
//uu                                  HReg reg, AMD64AMode* addr ) {
//uu    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
//uu    i->tag                = Ain_AvxLdSt;
//uu    i->Ain.AvxLdSt.isLoad = isLoad;
//uu    i->Ain.AvxLdSt.reg    = reg;
//uu    i->Ain.AvxLdSt.addr   = addr;
//uu    return i;
//uu }
//uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
//uu    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
//uu    i->tag             = Ain_AvxReRg;
//uu    i->Ain.AvxReRg.op  = op;
//uu    i->Ain.AvxReRg.src = re;
//uu    i->Ain.AvxReRg.dst = rg;
//uu    return i;
//uu }
AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                 AMD64AMode* amFailAddr ) {
   AMD64Instr* i             = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                    = Ain_EvCheck;
   i->Ain.EvCheck.amCounter  = amCounter;
   i->Ain.EvCheck.amFailAddr = amFailAddr;
   return i;
}
AMD64Instr* AMD64Instr_ProfInc ( void ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag        = Ain_ProfInc;
   return i;
}
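
/* Illustrative use of the constructors above (editorial sketch, not
   upstream documentation): an instruction-selection routine might build

      addInstr(env, AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(0x10), dst));
      addInstr(env, AMD64Instr_Test64(1, dst));

   which ppAMD64Instr below renders roughly as "addq $0x10,<dst>" and
   "testq $1,<dst>".  'addInstr' and 'env' belong to the instruction
   selector (host_amd64_isel.c), not to this file. */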

void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
{
   vassert(mode64 == True);
   switch (i->tag) {
      case Ain_Imm64:
         vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
         ppHRegAMD64(i->Ain.Imm64.dst);
         return;
      case Ain_Alu64R:
         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
         ppAMD64RMI(i->Ain.Alu64R.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Alu64R.dst);
         return;
      case Ain_Alu64M:
         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
         ppAMD64RI(i->Ain.Alu64M.src);
         vex_printf(",");
         ppAMD64AMode(i->Ain.Alu64M.dst);
         return;
      case Ain_Sh64:
         vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
         if (i->Ain.Sh64.src == 0)
            vex_printf("%%cl,");
         else
            vex_printf("$%d,", (Int)i->Ain.Sh64.src);
         ppHRegAMD64(i->Ain.Sh64.dst);
         return;
      case Ain_Test64:
         vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
         ppHRegAMD64(i->Ain.Test64.dst);
         return;
      case Ain_Unary64:
         vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
         ppHRegAMD64(i->Ain.Unary64.dst);
         return;
      case Ain_Lea64:
         vex_printf("leaq ");
         ppAMD64AMode(i->Ain.Lea64.am);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Lea64.dst);
         return;
      case Ain_Alu32R:
         vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
         ppAMD64RMI_lo32(i->Ain.Alu32R.src);
         vex_printf(",");
         ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
         return;
      case Ain_MulL:
         vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
         ppAMD64RM(i->Ain.MulL.src);
         return;
      case Ain_Div:
         vex_printf("%cdiv%s ",
                    i->Ain.Div.syned ? 's' : 'u',
                    showAMD64ScalarSz(i->Ain.Div.sz));
         ppAMD64RM(i->Ain.Div.src);
         return;
      case Ain_Push:
         vex_printf("pushq ");
         ppAMD64RMI(i->Ain.Push.src);
         return;
      case Ain_Call:
         vex_printf("call%s[%d,",
                    i->Ain.Call.cond==Acc_ALWAYS
                       ? "" : showAMD64CondCode(i->Ain.Call.cond),
                    i->Ain.Call.regparms );
         ppRetLoc(i->Ain.Call.rloc);
         vex_printf("] 0x%llx", i->Ain.Call.target);
         break;

      case Ain_XDirect:
         vex_printf("(xDirect) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XDirect.cond));
         vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
         vex_printf("movq %%r11,");
         ppAMD64AMode(i->Ain.XDirect.amRIP);
         vex_printf("; ");
         vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
                    i->Ain.XDirect.toFastEP ? "fast" : "slow");
         return;
      case Ain_XIndir:
         vex_printf("(xIndir) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XIndir.cond));
         vex_printf("movq ");
         ppHRegAMD64(i->Ain.XIndir.dstGA);
         vex_printf(",");
         ppAMD64AMode(i->Ain.XIndir.amRIP);
         vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
         return;
      case Ain_XAssisted:
         vex_printf("(xAssisted) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XAssisted.cond));
         vex_printf("movq ");
         ppHRegAMD64(i->Ain.XAssisted.dstGA);
         vex_printf(",");
         ppAMD64AMode(i->Ain.XAssisted.amRIP);
         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
                    (Int)i->Ain.XAssisted.jk);
         vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
         return;

      case Ain_CMov64:
         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
         ppHRegAMD64(i->Ain.CMov64.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.CMov64.dst);
         return;
      case Ain_CLoad:
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.CLoad.cond));
         vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
         ppAMD64AMode(i->Ain.CLoad.addr);
         vex_printf(", ");
         (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.CLoad.dst);
         vex_printf(" }");
         return;
      case Ain_CStore:
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.CStore.cond));
         vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
         (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.CStore.src);
         vex_printf(", ");
         ppAMD64AMode(i->Ain.CStore.addr);
         vex_printf(" }");
         return;

      case Ain_MovxLQ:
         vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
         ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.MovxLQ.dst);
         return;
      case Ain_LoadEX:
         if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
            vex_printf("movl ");
            ppAMD64AMode(i->Ain.LoadEX.src);
            vex_printf(",");
            ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
         } else {
            vex_printf("mov%c%cq ",
                       i->Ain.LoadEX.syned ? 's' : 'z',
                       i->Ain.LoadEX.szSmall==1
                          ? 'b'
                          : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
            ppAMD64AMode(i->Ain.LoadEX.src);
            vex_printf(",");
            ppHRegAMD64(i->Ain.LoadEX.dst);
         }
         return;
      case Ain_Store:
         vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
                              : (i->Ain.Store.sz==2 ? 'w' : 'l'));
         ppHRegAMD64(i->Ain.Store.src);
         vex_printf(",");
         ppAMD64AMode(i->Ain.Store.dst);
         return;
      case Ain_Set64:
         vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
         ppHRegAMD64(i->Ain.Set64.dst);
         return;
      case Ain_Bsfr64:
         vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
         ppHRegAMD64(i->Ain.Bsfr64.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Bsfr64.dst);
         return;
      case Ain_MFence:
         vex_printf("mfence" );
         return;
      case Ain_ACAS:
         vex_printf("lock cmpxchg%c ",
                     i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
                     : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
         vex_printf("{%%rax->%%rbx},");
         ppAMD64AMode(i->Ain.ACAS.addr);
         return;
      case Ain_DACAS:
         vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
                    (Int)(2 * i->Ain.DACAS.sz));
         ppAMD64AMode(i->Ain.DACAS.addr);
         return;
      case Ain_A87Free:
         vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
         break;
      case Ain_A87PushPop:
         vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
                    i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
         ppAMD64AMode(i->Ain.A87PushPop.addr);
         break;
      case Ain_A87FpOp:
         vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
         break;
      case Ain_A87LdCW:
         vex_printf("fldcw ");
         ppAMD64AMode(i->Ain.A87LdCW.addr);
         break;
      case Ain_A87StSW:
         vex_printf("fstsw ");
         ppAMD64AMode(i->Ain.A87StSW.addr);
         break;
      case Ain_LdMXCSR:
         vex_printf("ldmxcsr ");
         ppAMD64AMode(i->Ain.LdMXCSR.addr);
         break;
      case Ain_SseUComIS:
         vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
         ppHRegAMD64(i->Ain.SseUComIS.srcL);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseUComIS.srcR);
         vex_printf(" ; pushfq ; popq ");
         ppHRegAMD64(i->Ain.SseUComIS.dst);
         break;
      case Ain_SseSI2SF:
         vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
         (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.SseSI2SF.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseSI2SF.dst);
         break;
      case Ain_SseSF2SI:
         vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
         ppHRegAMD64(i->Ain.SseSF2SI.src);
         vex_printf(",");
         (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.SseSF2SI.dst);
         break;
      case Ain_SseSDSS:
         vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
         ppHRegAMD64(i->Ain.SseSDSS.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseSDSS.dst);
         break;
      case Ain_SseLdSt:
         switch (i->Ain.SseLdSt.sz) {
            case 4:  vex_printf("movss "); break;
            case 8:  vex_printf("movsd "); break;
            case 16: vex_printf("movups "); break;
            default: vassert(0);
         }
         if (i->Ain.SseLdSt.isLoad) {
            ppAMD64AMode(i->Ain.SseLdSt.addr);
            vex_printf(",");
            ppHRegAMD64(i->Ain.SseLdSt.reg);
         } else {
            ppHRegAMD64(i->Ain.SseLdSt.reg);
            vex_printf(",");
            ppAMD64AMode(i->Ain.SseLdSt.addr);
         }
         return;
      case Ain_SseCStore:
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.SseCStore.cond));
         vex_printf("movups ");
         ppHRegAMD64(i->Ain.SseCStore.src);
         vex_printf(", ");
         ppAMD64AMode(i->Ain.SseCStore.addr);
         vex_printf(" }");
         return;
      case Ain_SseCLoad:
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.SseCLoad.cond));
         vex_printf("movups ");
         ppAMD64AMode(i->Ain.SseCLoad.addr);
         vex_printf(", ");
         ppHRegAMD64(i->Ain.SseCLoad.dst);
         vex_printf(" }");
         return;
      case Ain_SseLdzLO:
         vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
         ppAMD64AMode(i->Ain.SseLdzLO.addr);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseLdzLO.reg);
         return;
      case Ain_Sse32Fx4:
         vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
         ppHRegAMD64(i->Ain.Sse32Fx4.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse32Fx4.dst);
         return;
      case Ain_Sse32FLo:
         vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
         ppHRegAMD64(i->Ain.Sse32FLo.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse32FLo.dst);
         return;
      case Ain_Sse64Fx2:
         vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
         ppHRegAMD64(i->Ain.Sse64Fx2.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse64Fx2.dst);
         return;
      case Ain_Sse64FLo:
         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
         ppHRegAMD64(i->Ain.Sse64FLo.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse64FLo.dst);
         return;
      case Ain_SseReRg:
         vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
         ppHRegAMD64(i->Ain.SseReRg.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseReRg.dst);
         return;
      case Ain_SseCMov:
         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
         ppHRegAMD64(i->Ain.SseCMov.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseCMov.dst);
         return;
      case Ain_SseShuf:
         vex_printf("pshufd $0x%x,", (UInt)i->Ain.SseShuf.order);
         ppHRegAMD64(i->Ain.SseShuf.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseShuf.dst);
         return;
      //uu case Ain_AvxLdSt:
      //uu    vex_printf("vmovups ");
      //uu    if (i->Ain.AvxLdSt.isLoad) {
      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
      //uu       vex_printf(",");
      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
      //uu    } else {
      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
      //uu       vex_printf(",");
      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
      //uu    }
      //uu    return;
      //uu case Ain_AvxReRg:
      //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
      //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
      //uu    vex_printf(",");
      //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
      //uu    return;
      case Ain_EvCheck:
         vex_printf("(evCheck) decl ");
         ppAMD64AMode(i->Ain.EvCheck.amCounter);
         vex_printf("; jns nofail; jmp *");
         ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
         vex_printf("; nofail:");
         return;
      case Ain_ProfInc:
         vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
         return;
      default:
         vpanic("ppAMD64Instr");
   }
}

/* --------- Helpers for register allocation. --------- */

void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
{
   Bool unary;
   vassert(mode64 == True);
   initHRegUsage(u);
   switch (i->tag) {
      case Ain_Imm64:
         addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
         return;
      case Ain_Alu64R:
         addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
         if (i->Ain.Alu64R.op == Aalu_MOV) {
            addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
            return;
         }
         if (i->Ain.Alu64R.op == Aalu_CMP) {
            addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
            return;
         }
         addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
         return;
      case Ain_Alu64M:
         addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
         addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
         return;
      case Ain_Sh64:
         addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
         if (i->Ain.Sh64.src == 0)
            addHRegUse(u, HRmRead, hregAMD64_RCX());
         return;
      case Ain_Test64:
         addHRegUse(u, HRmRead, i->Ain.Test64.dst);
         return;
      case Ain_Unary64:
         addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
         return;
      case Ain_Lea64:
         addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
         addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
         return;
      case Ain_Alu32R:
         vassert(i->Ain.Alu32R.op != Aalu_MOV);
         addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
         if (i->Ain.Alu32R.op == Aalu_CMP) {
            addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
            return;
         }
         addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
         return;
      case Ain_MulL:
         addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
         addHRegUse(u, HRmModify, hregAMD64_RAX());
         addHRegUse(u, HRmWrite, hregAMD64_RDX());
         return;
      case Ain_Div:
         addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
         addHRegUse(u, HRmModify, hregAMD64_RAX());
         addHRegUse(u, HRmModify, hregAMD64_RDX());
         return;
      case Ain_Push:
         addRegUsage_AMD64RMI(u, i->Ain.Push.src);
         addHRegUse(u, HRmModify, hregAMD64_RSP());
         return;
      case Ain_Call:
         /* This is a bit subtle. */
         /* First off, claim it trashes all the caller-saved regs
            which fall within the register allocator's jurisdiction.
            These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
            and all the xmm registers.
         */
         addHRegUse(u, HRmWrite, hregAMD64_RAX());
         addHRegUse(u, HRmWrite, hregAMD64_RCX());
         addHRegUse(u, HRmWrite, hregAMD64_RDX());
         addHRegUse(u, HRmWrite, hregAMD64_RSI());
         addHRegUse(u, HRmWrite, hregAMD64_RDI());
         addHRegUse(u, HRmWrite, hregAMD64_R8());
         addHRegUse(u, HRmWrite, hregAMD64_R9());
         addHRegUse(u, HRmWrite, hregAMD64_R10());
         addHRegUse(u, HRmWrite, hregAMD64_R11());
         addHRegUse(u, HRmWrite, hregAMD64_XMM0());
         addHRegUse(u, HRmWrite, hregAMD64_XMM1());
         addHRegUse(u, HRmWrite, hregAMD64_XMM3());
         addHRegUse(u, HRmWrite, hregAMD64_XMM4());
         addHRegUse(u, HRmWrite, hregAMD64_XMM5());
         addHRegUse(u, HRmWrite, hregAMD64_XMM6());
         addHRegUse(u, HRmWrite, hregAMD64_XMM7());
         addHRegUse(u, HRmWrite, hregAMD64_XMM8());
         addHRegUse(u, HRmWrite, hregAMD64_XMM9());
         addHRegUse(u, HRmWrite, hregAMD64_XMM10());
         addHRegUse(u, HRmWrite, hregAMD64_XMM11());
         addHRegUse(u, HRmWrite, hregAMD64_XMM12());

         /* Now we have to state any parameter-carrying registers
            which might be read.  This depends on the regparmness. */
         switch (i->Ain.Call.regparms) {
            case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
            case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
            case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
            case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
            case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
            case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
            case 0: break;
            default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
         }
         /* Finally, there is the issue that the insn trashes a
            register because the literal target address has to be
            loaded into a register.  Fortunately, r11 is stated in the
            ABI as a scratch register, and so seems a suitable victim.  */
         addHRegUse(u, HRmWrite, hregAMD64_R11());
         /* Upshot of this is that the assembler really must use r11,
            and no other, as a destination temporary. */
         return;
      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
         conditionally exit the block.  Hence we only need to list (1)
         the registers that they read, and (2) the registers that they
         write in the case where the block is not exited.  (2) is
         empty, hence only (1) is relevant here. */
      case Ain_XDirect:
         /* Don't bother to mention the write to %r11, since it is not
            available to the allocator. */
         addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
         return;
      case Ain_XIndir:
         /* Ditto re %r11 */
         addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
         addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
         return;
      case Ain_XAssisted:
         /* Ditto re %r11 and %rbp (the baseblock ptr) */
         addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
         addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
         return;
      case Ain_CMov64:
         addHRegUse(u, HRmRead,   i->Ain.CMov64.src);
         addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
         return;
      case Ain_CLoad:
         addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
         addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
         return;
      case Ain_CStore:
         addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
         addHRegUse(u, HRmRead, i->Ain.CStore.src);
         return;
      case Ain_MovxLQ:
         addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
         addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
         return;
      case Ain_LoadEX:
         addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
         addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
         return;
      case Ain_Store:
         addHRegUse(u, HRmRead, i->Ain.Store.src);
         addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
         return;
      case Ain_Set64:
         addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
         return;
      case Ain_Bsfr64:
         addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
         addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
         return;
      case Ain_MFence:
         return;
      case Ain_ACAS:
         addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
         addHRegUse(u, HRmRead, hregAMD64_RBX());
         addHRegUse(u, HRmModify, hregAMD64_RAX());
         return;
      case Ain_DACAS:
         addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
         addHRegUse(u, HRmRead, hregAMD64_RCX());
         addHRegUse(u, HRmRead, hregAMD64_RBX());
         addHRegUse(u, HRmModify, hregAMD64_RDX());
         addHRegUse(u, HRmModify, hregAMD64_RAX());
         return;
      case Ain_A87Free:
         return;
      case Ain_A87PushPop:
         addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
         return;
      case Ain_A87FpOp:
         return;
      case Ain_A87LdCW:
         addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
         return;
      case Ain_A87StSW:
         addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
         return;
      case Ain_LdMXCSR:
         addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
         return;
      case Ain_SseUComIS:
         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
         addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
         addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
         return;
      case Ain_SseSI2SF:
         addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
         addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
         return;
      case Ain_SseSF2SI:
         addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
         addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
         return;
      case Ain_SseSDSS:
         addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
         addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
         return;
      case Ain_SseLdSt:
         addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
         addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
                       i->Ain.SseLdSt.reg);
         return;
      case Ain_SseCStore:
         addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
         addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
1612         return;
1613      case Ain_SseCLoad:
1614         addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
1615         addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
1616         return;
1617      case Ain_SseLdzLO:
1618         addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1619         addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1620         return;
1621      case Ain_Sse32Fx4:
1622         vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1623         unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1624                         || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1625                         || i->Ain.Sse32Fx4.op == Asse_SQRTF );
1626         addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1627         addHRegUse(u, unary ? HRmWrite : HRmModify,
1628                       i->Ain.Sse32Fx4.dst);
1629         return;
1630      case Ain_Sse32FLo:
1631         vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1632         unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1633                         || i->Ain.Sse32FLo.op == Asse_RSQRTF
1634                         || i->Ain.Sse32FLo.op == Asse_SQRTF );
1635         addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1636         addHRegUse(u, unary ? HRmWrite : HRmModify,
1637                       i->Ain.Sse32FLo.dst);
1638         return;
1639      case Ain_Sse64Fx2:
1640         vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1641         unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1642                         || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1643                         || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1644         addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1645         addHRegUse(u, unary ? HRmWrite : HRmModify,
1646                       i->Ain.Sse64Fx2.dst);
1647         return;
1648      case Ain_Sse64FLo:
1649         vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1650         unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1651                         || i->Ain.Sse64FLo.op == Asse_RSQRTF
1652                         || i->Ain.Sse64FLo.op == Asse_SQRTF );
1653         addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1654         addHRegUse(u, unary ? HRmWrite : HRmModify,
1655                       i->Ain.Sse64FLo.dst);
1656         return;
1657      case Ain_SseReRg:
1658         if ( (i->Ain.SseReRg.op == Asse_XOR
1659               || i->Ain.SseReRg.op == Asse_CMPEQ32)
1660              && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
1661            /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1662               r,r' as a write of a value to r, and independent of any
1663               previous value in r */
1664            /* (as opposed to a rite of passage :-) */
1665            addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1666         } else {
1667            addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1668            addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1669                             ? HRmWrite : HRmModify,
1670                          i->Ain.SseReRg.dst);
1671         }
1672         return;
1673      case Ain_SseCMov:
1674         addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
1675         addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1676         return;
1677      case Ain_SseShuf:
1678         addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
1679         addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1680         return;
1681      //uu case Ain_AvxLdSt:
1682      //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1683      //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1684      //uu               i->Ain.AvxLdSt.reg);
1685      //uu return;
1686      //uu case Ain_AvxReRg:
1687      //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
1688      //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1689      //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1690      //uu       /* See comments on the case for Ain_SseReRg. */
1691      //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1692      //uu    } else {
1693      //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1694      //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1695      //uu                        ? HRmWrite : HRmModify,
1696      //uu                     i->Ain.AvxReRg.dst);
1697      //uu    }
1698      //uu    return;
1699      case Ain_EvCheck:
1700         /* We expect both amodes only to mention %rbp, so this is in
1701            fact pointless, since %rbp isn't allocatable, but anyway.. */
1702         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1703         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1704         return;
1705      case Ain_ProfInc:
1706         addHRegUse(u, HRmWrite, hregAMD64_R11());
1707         return;
1708      default:
1709         ppAMD64Instr(i, mode64);
1710         vpanic("getRegUsage_AMD64Instr");
1711   }
1712}
1713
1714/* local helper */
1715static inline void mapReg(HRegRemap* m, HReg* r)
1716{
1717   *r = lookupHRegRemap(m, *r);
1718}
1719
1720void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1721{
1722   vassert(mode64 == True);
1723   switch (i->tag) {
1724      case Ain_Imm64:
1725         mapReg(m, &i->Ain.Imm64.dst);
1726         return;
1727      case Ain_Alu64R:
1728         mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1729         mapReg(m, &i->Ain.Alu64R.dst);
1730         return;
1731      case Ain_Alu64M:
1732         mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1733         mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1734         return;
1735      case Ain_Sh64:
1736         mapReg(m, &i->Ain.Sh64.dst);
1737         return;
1738      case Ain_Test64:
1739         mapReg(m, &i->Ain.Test64.dst);
1740         return;
1741      case Ain_Unary64:
1742         mapReg(m, &i->Ain.Unary64.dst);
1743         return;
1744      case Ain_Lea64:
1745         mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1746         mapReg(m, &i->Ain.Lea64.dst);
1747         return;
1748      case Ain_Alu32R:
1749         mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1750         mapReg(m, &i->Ain.Alu32R.dst);
1751         return;
1752      case Ain_MulL:
1753         mapRegs_AMD64RM(m, i->Ain.MulL.src);
1754         return;
1755      case Ain_Div:
1756         mapRegs_AMD64RM(m, i->Ain.Div.src);
1757         return;
1758      case Ain_Push:
1759         mapRegs_AMD64RMI(m, i->Ain.Push.src);
1760         return;
1761      case Ain_Call:
1762         return;
1763      case Ain_XDirect:
1764         mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1765         return;
1766      case Ain_XIndir:
1767         mapReg(m, &i->Ain.XIndir.dstGA);
1768         mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1769         return;
1770      case Ain_XAssisted:
1771         mapReg(m, &i->Ain.XAssisted.dstGA);
1772         mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1773         return;
1774      case Ain_CMov64:
1775         mapReg(m, &i->Ain.CMov64.src);
1776         mapReg(m, &i->Ain.CMov64.dst);
1777         return;
1778      case Ain_CLoad:
1779         mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
1780         mapReg(m, &i->Ain.CLoad.dst);
1781         return;
1782      case Ain_CStore:
1783         mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
1784         mapReg(m, &i->Ain.CStore.src);
1785         return;
1786      case Ain_MovxLQ:
1787         mapReg(m, &i->Ain.MovxLQ.src);
1788         mapReg(m, &i->Ain.MovxLQ.dst);
1789         return;
1790      case Ain_LoadEX:
1791         mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1792         mapReg(m, &i->Ain.LoadEX.dst);
1793         return;
1794      case Ain_Store:
1795         mapReg(m, &i->Ain.Store.src);
1796         mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1797         return;
1798      case Ain_Set64:
1799         mapReg(m, &i->Ain.Set64.dst);
1800         return;
1801      case Ain_Bsfr64:
1802         mapReg(m, &i->Ain.Bsfr64.src);
1803         mapReg(m, &i->Ain.Bsfr64.dst);
1804         return;
1805      case Ain_MFence:
1806         return;
1807      case Ain_ACAS:
1808         mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1809         return;
1810      case Ain_DACAS:
1811         mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1812         return;
1813      case Ain_A87Free:
1814         return;
1815      case Ain_A87PushPop:
1816         mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1817         return;
1818      case Ain_A87FpOp:
1819         return;
1820      case Ain_A87LdCW:
1821         mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1822         return;
1823      case Ain_A87StSW:
1824         mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1825         return;
1826      case Ain_LdMXCSR:
1827         mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1828         return;
1829      case Ain_SseUComIS:
1830         mapReg(m, &i->Ain.SseUComIS.srcL);
1831         mapReg(m, &i->Ain.SseUComIS.srcR);
1832         mapReg(m, &i->Ain.SseUComIS.dst);
1833         return;
1834      case Ain_SseSI2SF:
1835         mapReg(m, &i->Ain.SseSI2SF.src);
1836         mapReg(m, &i->Ain.SseSI2SF.dst);
1837         return;
1838      case Ain_SseSF2SI:
1839         mapReg(m, &i->Ain.SseSF2SI.src);
1840         mapReg(m, &i->Ain.SseSF2SI.dst);
1841         return;
1842      case Ain_SseSDSS:
1843         mapReg(m, &i->Ain.SseSDSS.src);
1844         mapReg(m, &i->Ain.SseSDSS.dst);
1845         return;
1846      case Ain_SseLdSt:
1847         mapReg(m, &i->Ain.SseLdSt.reg);
1848         mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1849         return;
1850      case Ain_SseCStore:
1851         mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
1852         mapReg(m, &i->Ain.SseCStore.src);
1853         return;
1854      case Ain_SseCLoad:
1855         mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
1856         mapReg(m, &i->Ain.SseCLoad.dst);
1857         return;
1858      case Ain_SseLdzLO:
1859         mapReg(m, &i->Ain.SseLdzLO.reg);
1860         mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
1861         return;
1862      case Ain_Sse32Fx4:
1863         mapReg(m, &i->Ain.Sse32Fx4.src);
1864         mapReg(m, &i->Ain.Sse32Fx4.dst);
1865         return;
1866      case Ain_Sse32FLo:
1867         mapReg(m, &i->Ain.Sse32FLo.src);
1868         mapReg(m, &i->Ain.Sse32FLo.dst);
1869         return;
1870      case Ain_Sse64Fx2:
1871         mapReg(m, &i->Ain.Sse64Fx2.src);
1872         mapReg(m, &i->Ain.Sse64Fx2.dst);
1873         return;
1874      case Ain_Sse64FLo:
1875         mapReg(m, &i->Ain.Sse64FLo.src);
1876         mapReg(m, &i->Ain.Sse64FLo.dst);
1877         return;
1878      case Ain_SseReRg:
1879         mapReg(m, &i->Ain.SseReRg.src);
1880         mapReg(m, &i->Ain.SseReRg.dst);
1881         return;
1882      case Ain_SseCMov:
1883         mapReg(m, &i->Ain.SseCMov.src);
1884         mapReg(m, &i->Ain.SseCMov.dst);
1885         return;
1886      case Ain_SseShuf:
1887         mapReg(m, &i->Ain.SseShuf.src);
1888         mapReg(m, &i->Ain.SseShuf.dst);
1889         return;
1890      //uu case Ain_AvxLdSt:
1891      //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
1892      //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1893      //uu    break;
1894      //uu case Ain_AvxReRg:
1895      //uu    mapReg(m, &i->Ain.AvxReRg.src);
1896      //uu    mapReg(m, &i->Ain.AvxReRg.dst);
1897      //uu    return;
1898      case Ain_EvCheck:
1899         /* We expect both amodes only to mention %rbp, so this is in
1900            fact pointless, since %rbp isn't allocatable, but anyway.. */
1901         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
1902         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
1903         return;
1904      case Ain_ProfInc:
1905         /* hardwires r11 -- nothing to modify. */
1906         return;
1907      default:
1908         ppAMD64Instr(i, mode64);
1909         vpanic("mapRegs_AMD64Instr");
1910   }
1911}
1912
1913/* Figure out if i represents a reg-reg move, and if so assign the
1914   source and destination to *src and *dst.  If in doubt say No.  Used
1915   by the register allocator to do move coalescing.
1916*/
1917Bool isMove_AMD64Instr ( const AMD64Instr* i, HReg* src, HReg* dst )
1918{
1919   switch (i->tag) {
1920      case Ain_Alu64R:
1921         /* Moves between integer regs */
1922         if (i->Ain.Alu64R.op != Aalu_MOV)
1923            return False;
1924         if (i->Ain.Alu64R.src->tag != Armi_Reg)
1925            return False;
1926         *src = i->Ain.Alu64R.src->Armi.Reg.reg;
1927         *dst = i->Ain.Alu64R.dst;
1928         return True;
1929      case Ain_SseReRg:
1930         /* Moves between SSE regs */
1931         if (i->Ain.SseReRg.op != Asse_MOV)
1932            return False;
1933         *src = i->Ain.SseReRg.src;
1934         *dst = i->Ain.SseReRg.dst;
1935         return True;
1936      //uu case Ain_AvxReRg:
1937      //uu    /* Moves between AVX regs */
1938      //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
1939      //uu       return False;
1940      //uu    *src = i->Ain.AvxReRg.src;
1941      //uu    *dst = i->Ain.AvxReRg.dst;
1942      //uu    return True;
1943      default:
1944         return False;
1945   }
1946   /*NOTREACHED*/
1947}
1948
1949
1950/* Generate amd64 spill/reload instructions under the direction of the
1951   register allocator.  Note it's critical these don't write the
1952   condition codes. */
1953
1954void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1955                      HReg rreg, Int offsetB, Bool mode64 )
1956{
1957   AMD64AMode* am;
1958   vassert(offsetB >= 0);
1959   vassert(!hregIsVirtual(rreg));
1960   vassert(mode64 == True);
1961   *i1 = *i2 = NULL;
1962   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1963   switch (hregClass(rreg)) {
1964      case HRcInt64:
1965         *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
1966         return;
1967      case HRcVec128:
1968         *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
1969         return;
1970      default:
1971         ppHRegClass(hregClass(rreg));
1972         vpanic("genSpill_AMD64: unimplemented regclass");
1973   }
1974}
1975
1976void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1977                       HReg rreg, Int offsetB, Bool mode64 )
1978{
1979   AMD64AMode* am;
1980   vassert(offsetB >= 0);
1981   vassert(!hregIsVirtual(rreg));
1982   vassert(mode64 == True);
1983   *i1 = *i2 = NULL;
1984   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1985   switch (hregClass(rreg)) {
1986      case HRcInt64:
1987         *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
1988         return;
1989      case HRcVec128:
1990         *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
1991         return;
1992      default:
1993         ppHRegClass(hregClass(rreg));
1994         vpanic("genReload_AMD64: unimplemented regclass");
1995   }
1996}
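/* In effect, for an integer rreg spilled at (say) offset 0x30 these come out
   as "movq %rreg, 0x30(%rbp)" and "movq 0x30(%rbp), %rreg" respectively, and
   for a vector rreg as a 16-byte SSE store/load at the same %rbp-relative
   address.  (The offset is purely illustrative.) */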
1997
1998AMD64Instr* directReload_AMD64( AMD64Instr* i, HReg vreg, Short spill_off )
1999{
2000   vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
2001
2002   /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
2003      Convert to: src=RMI_Mem, dst=Reg
2004   */
2005   if (i->tag == Ain_Alu64R
2006       && (i->Ain.Alu64R.op == Aalu_MOV || i->Ain.Alu64R.op == Aalu_OR
2007           || i->Ain.Alu64R.op == Aalu_XOR)
2008       && i->Ain.Alu64R.src->tag == Armi_Reg
2009       && sameHReg(i->Ain.Alu64R.src->Armi.Reg.reg, vreg)) {
2010      vassert(! sameHReg(i->Ain.Alu64R.dst, vreg));
2011      return AMD64Instr_Alu64R(
2012                i->Ain.Alu64R.op,
2013                AMD64RMI_Mem( AMD64AMode_IR( spill_off, hregAMD64_RBP())),
2014                i->Ain.Alu64R.dst
2015             );
2016   }
2017
2018   /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
2019      Convert to: src=RI_Imm, dst=Mem
2020   */
2021   if (i->tag == Ain_Alu64R
2022       && (i->Ain.Alu64R.op == Aalu_CMP)
2023       && i->Ain.Alu64R.src->tag == Armi_Imm
2024       && sameHReg(i->Ain.Alu64R.dst, vreg)) {
2025      return AMD64Instr_Alu64M(
2026                i->Ain.Alu64R.op,
2027                AMD64RI_Imm( i->Ain.Alu64R.src->Armi.Imm.imm32 ),
2028                AMD64AMode_IR( spill_off, hregAMD64_RBP())
2029             );
2030   }
2031
2032   return NULL;
2033}
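/* Illustrative effect of the two rules above, assuming vreg is spilled at
   off(%rbp) (register names and the offset are arbitrary):

      orq  %vreg, %dst    -->  orq  off(%rbp), %dst
      cmpq $42, %vreg     -->  cmpq $42, off(%rbp)

   That is, the reload of vreg is folded straight into the instruction that
   uses it, avoiding a separate reload. */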
2034
2035
2036/* --------- The amd64 assembler (bleh.) --------- */
2037
2038/* Produce the low three bits of an integer register number. */
2039inline static UInt iregEnc210 ( HReg r )
2040{
2041   UInt n;
2042   vassert(hregClass(r) == HRcInt64);
2043   vassert(!hregIsVirtual(r));
2044   n = hregEncoding(r);
2045   vassert(n <= 15);
2046   return n & 7;
2047}
2048
2049/* Produce bit 3 of an integer register number. */
2050inline static UInt iregEnc3 ( HReg r )
2051{
2052   UInt n;
2053   vassert(hregClass(r) == HRcInt64);
2054   vassert(!hregIsVirtual(r));
2055   n = hregEncoding(r);
2056   vassert(n <= 15);
2057   return (n >> 3) & 1;
2058}
2059
2060/* Produce a complete 4-bit integer register number. */
2061inline static UInt iregEnc3210 ( HReg r )
2062{
2063   UInt n;
2064   vassert(hregClass(r) == HRcInt64);
2065   vassert(!hregIsVirtual(r));
2066   n = hregEncoding(r);
2067   vassert(n <= 15);
2068   return n;
2069}
2070
2071/* Produce a complete 4-bit integer register number. */
2072inline static UInt vregEnc3210 ( HReg r )
2073{
2074   UInt n;
2075   vassert(hregClass(r) == HRcVec128);
2076   vassert(!hregIsVirtual(r));
2077   n = hregEncoding(r);
2078   vassert(n <= 15);
2079   return n;
2080}
2081
2082inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
2083{
2084   vassert(mod < 4);
2085   vassert((reg|regmem) < 8);
2086   return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
2087}
2088
2089inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
2090{
2091   vassert(shift < 4);
2092   vassert((regindex|regbase) < 8);
2093   return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
2094}
2095
2096static UChar* emit32 ( UChar* p, UInt w32 )
2097{
2098   *p++ = toUChar((w32)       & 0x000000FF);
2099   *p++ = toUChar((w32 >>  8) & 0x000000FF);
2100   *p++ = toUChar((w32 >> 16) & 0x000000FF);
2101   *p++ = toUChar((w32 >> 24) & 0x000000FF);
2102   return p;
2103}
2104
2105static UChar* emit64 ( UChar* p, ULong w64 )
2106{
2107   p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
2108   p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
2109   return p;
2110}
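/* Emission is little-endian: for example, emit32(p, 0x12345678) writes the
   bytes 78 56 34 12, and emit64 writes the low 32-bit half first. */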
2111
2112/* Does a sign-extend of the lowest 8 bits give
2113   the original number? */
2114static Bool fits8bits ( UInt w32 )
2115{
2116   Int i32 = (Int)w32;
2117   return toBool(i32 == ((Int)(w32 << 24) >> 24));
2118}
2119/* Can the lower 32 bits be signedly widened to produce the whole
2120   64-bit value?  In other words, are the top 33 bits either all 0 or
2121   all 1 ? */
2122static Bool fitsIn32Bits ( ULong x )
2123{
2124   Long y1;
2125   y1 = x << 32;
2126   y1 >>=/*s*/ 32;
2127   return toBool(x == y1);
2128}
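/* Some worked examples, checkable against the definitions above:
     fits8bits(0x7F)  == True    (sign-extending 0x7F gives 0x7F)
     fits8bits(0x80)  == False   (sign-extending 0x80 gives 0xFFFFFF80)
     fitsIn32Bits(0xFFFFFFFF80000000ULL) == True
     fitsIn32Bits(0x0000000080000000ULL) == False
*/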
2129
2130
2131/* Forming mod-reg-rm bytes and scale-index-base bytes.
2132
2133     greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
2134                       =  00 greg ereg
2135
2136     greg,  d8(ereg)   |  ereg is neither of: RSP R12
2137                       =  01 greg ereg, d8
2138
2139     greg,  d32(ereg)  |  ereg is neither of: RSP R12
2140                       =  10 greg ereg, d32
2141
2142     greg,  d8(ereg)   |  ereg is either: RSP R12
2143                       =  01 greg 100, 0x24, d8
2144                       (lowest bit of rex distinguishes R12/RSP)
2145
2146     greg,  d32(ereg)  |  ereg is either: RSP R12
2147                       =  10 greg 100, 0x24, d32
2148                       (lowest bit of rex distinguishes R12/RSP)
2149
2150     -----------------------------------------------
2151
2152     greg,  d8(base,index,scale)
2153               |  index != RSP
2154               =  01 greg 100, scale index base, d8
2155
2156     greg,  d32(base,index,scale)
2157               |  index != RSP
2158               =  10 greg 100, scale index base, d32
2159*/
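/* Worked example of the d8(ereg) form above (illustrative only):
   "movq %rax, 8(%rbp)" assembles to 48 89 45 08 -- a REX.W prefix, the 0x89
   opcode, then mod=01 reg=rax(000) rm=rbp(101) plus an 8-bit displacement,
   since %rbp is neither RSP nor R12 and 8 fits in 8 bits. */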
2160static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2161{
2162   UInt gregEnc210 = gregEnc3210 & 7;
2163   if (am->tag == Aam_IR) {
2164      if (am->Aam.IR.imm == 0
2165          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2166          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
2167          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2168          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
2169         ) {
2170         *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
2171         return p;
2172      }
2173      if (fits8bits(am->Aam.IR.imm)
2174          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2175          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2176         ) {
2177         *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
2178         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2179         return p;
2180      }
2181      if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2182          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2183         ) {
2184         *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
2185         p = emit32(p, am->Aam.IR.imm);
2186         return p;
2187      }
2188      if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2189           || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
2190          && fits8bits(am->Aam.IR.imm)) {
2191         *p++ = mkModRegRM(1, gregEnc210, 4);
2192         *p++ = 0x24;
2193         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2194         return p;
2195      }
2196      if (/* (sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2197	      || wait for test case for RSP case */
2198          sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
2199         *p++ = mkModRegRM(2, gregEnc210, 4);
2200         *p++ = 0x24;
2201         p = emit32(p, am->Aam.IR.imm);
2202         return p;
2203      }
2204      ppAMD64AMode(am);
2205      vpanic("doAMode_M: can't emit amode IR");
2206      /*NOTREACHED*/
2207   }
2208   if (am->tag == Aam_IRRS) {
2209      if (fits8bits(am->Aam.IRRS.imm)
2210          && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2211         *p++ = mkModRegRM(1, gregEnc210, 4);
2212         *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2213                                          iregEnc210(am->Aam.IRRS.base));
2214         *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2215         return p;
2216      }
2217      if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2218         *p++ = mkModRegRM(2, gregEnc210, 4);
2219         *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2220                                          iregEnc210(am->Aam.IRRS.base));
2221         p = emit32(p, am->Aam.IRRS.imm);
2222         return p;
2223      }
2224      ppAMD64AMode(am);
2225      vpanic("doAMode_M: can't emit amode IRRS");
2226      /*NOTREACHED*/
2227   }
2228   vpanic("doAMode_M: unknown amode");
2229   /*NOTREACHED*/
2230}
2231
2232static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2233{
2234   return doAMode_M__wrk(p, iregEnc3210(greg), am);
2235}
2236
2237static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2238{
2239   vassert(gregEnc3210 < 16);
2240   return doAMode_M__wrk(p, gregEnc3210, am);
2241}
2242
2243
2244/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2245inline
2246static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2247{
2248   *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
2249   return p;
2250}
2251
2252static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2253{
2254   return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
2255}
2256
2257static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
2258{
2259   vassert(gregEnc3210 < 16);
2260   return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
2261}
2262
2263static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
2264{
2265   vassert(eregEnc3210 < 16);
2266   return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
2267}
2268
2269static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2270{
2271   vassert( (gregEnc3210|eregEnc3210) < 16);
2272   return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
2273}
2274
2275
2276/* Clear the W bit on a REX byte, thereby changing the operand size
2277   back to whatever that instruction's default operand size is. */
2278static inline UChar clearWBit ( UChar rex )
2279{
2280   return rex & ~(1<<3);
2281}
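/* For example, clearWBit(0x48) == 0x40.  The 32-bit-operand cases below
   (e.g. Ain_Alu32R) rely on this, and skip emitting the prefix entirely when
   it degenerates to the redundant value 0x40. */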
2282
2283
2284/* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
2285inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
2286{
2287   if (am->tag == Aam_IR) {
2288      UChar W = 1;  /* we want 64-bit mode */
2289      UChar R = (gregEnc3210 >> 3) & 1;
2290      UChar X = 0; /* not relevant */
2291      UChar B = iregEnc3(am->Aam.IR.reg);
2292      return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2293   }
2294   if (am->tag == Aam_IRRS) {
2295      UChar W = 1;  /* we want 64-bit mode */
2296      UChar R = (gregEnc3210 >> 3) & 1;
2297      UChar X = iregEnc3(am->Aam.IRRS.index);
2298      UChar B = iregEnc3(am->Aam.IRRS.base);
2299      return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2300   }
2301   vassert(0);
2302   return 0; /*NOTREACHED*/
2303}
2304
2305static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2306{
2307   return rexAMode_M__wrk(iregEnc3210(greg), am);
2308}
2309
2310static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
2311{
2312   vassert(gregEnc3210 < 16);
2313   return rexAMode_M__wrk(gregEnc3210, am);
2314}
2315
2316
2317/* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
2318inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
2319{
2320   UChar W = 1;  /* we want 64-bit mode */
2321   UChar R = (gregEnc3210 >> 3) & 1;
2322   UChar X = 0; /* not relevant */
2323   UChar B = (eregEnc3210 >> 3) & 1;
2324   return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2325}
2326
2327static UChar rexAMode_R ( HReg greg, HReg ereg )
2328{
2329   return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
2330}
2331
2332static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
2333{
2334   vassert(gregEnc3210 < 16);
2335   return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
2336}
2337
2338static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
2339{
2340   vassert(eregEnc3210 < 16);
2341   return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
2342}
2343
2344static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
2345{
2346   vassert((gregEnc3210|eregEnc3210) < 16);
2347   return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
2348}
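/* Sketch of how these combine (not tied to any particular caller): for
   "addq %r8, %rax" the (greg,ereg) pair is (%r8,%rax), so rexAMode_R gives
   0x4C (W=1 R=1 X=0 B=0), and with the 0x01 opcode and ModRM byte 0xC0 the
   whole insn is 4C 01 C0. */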
2349
2350
2351//uu /* May 2012: this VEX prefix stuff is currently unused, but has
2352//uu    been verified correct (I reckon).  Certainly it has been known to
2353//uu    produce correct VEX prefixes during testing. */
2354//uu
2355//uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
2356//uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
2357//uu    in verbatim.  There's no range checking on the bits. */
2358//uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2359//uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
2360//uu                             UInt L, UInt pp )
2361//uu {
2362//uu    UChar byte0 = 0;
2363//uu    UChar byte1 = 0;
2364//uu    UChar byte2 = 0;
2365//uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2366//uu       /* 2 byte encoding is possible. */
2367//uu       byte0 = 0xC5;
2368//uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2369//uu               | (L << 2) | pp;
2370//uu    } else {
2371//uu       /* 3 byte encoding is needed. */
2372//uu       byte0 = 0xC4;
2373//uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2374//uu               | ((rexB ^ 1) << 5) | mmmmm;
2375//uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2376//uu    }
2377//uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2378//uu }
2379//uu
2380//uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
2381//uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2382//uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
2383//uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2384//uu    vvvv=1111 (unused 3rd reg). */
2385//uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2386//uu {
2387//uu    UChar L       = 1; /* size = 256 */
2388//uu    UChar pp      = 0; /* no SIMD prefix */
2389//uu    UChar mmmmm   = 1; /* 0F */
2390//uu    UChar notVvvv = 0; /* unused */
2391//uu    UChar rexW    = 0;
2392//uu    UChar rexR    = 0;
2393//uu    UChar rexX    = 0;
2394//uu    UChar rexB    = 0;
2395//uu    /* Same logic as in rexAMode_M. */
2396//uu    if (am->tag == Aam_IR) {
2397//uu       rexR = iregEnc3(greg);
2398//uu       rexX = 0; /* not relevant */
2399//uu       rexB = iregEnc3(am->Aam.IR.reg);
2400//uu    }
2401//uu    else if (am->tag == Aam_IRRS) {
2402//uu       rexR = iregEnc3(greg);
2403//uu       rexX = iregEnc3(am->Aam.IRRS.index);
2404//uu       rexB = iregEnc3(am->Aam.IRRS.base);
2405//uu    } else {
2406//uu       vassert(0);
2407//uu    }
2408//uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2409//uu }
2410//uu
2411//uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2412//uu {
2413//uu    switch (vex & 0xFF) {
2414//uu       case 0xC5:
2415//uu          *p++ = 0xC5;
2416//uu          *p++ = (vex >> 8) & 0xFF;
2417//uu          vassert(0 == (vex >> 16));
2418//uu          break;
2419//uu       case 0xC4:
2420//uu          *p++ = 0xC4;
2421//uu          *p++ = (vex >> 8) & 0xFF;
2422//uu          *p++ = (vex >> 16) & 0xFF;
2423//uu          vassert(0 == (vex >> 24));
2424//uu          break;
2425//uu       default:
2426//uu          vassert(0);
2427//uu    }
2428//uu    return p;
2429//uu }
2430
2431
2432/* Emit ffree %st(N) */
2433static UChar* do_ffree_st ( UChar* p, Int n )
2434{
2435   vassert(n >= 0 && n <= 7);
2436   *p++ = 0xDD;
2437   *p++ = toUChar(0xC0 + n);
2438   return p;
2439}
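/* For instance, do_ffree_st(p, 3) emits DD C3, i.e. "ffree %st(3)". */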
2440
2441/* Emit an instruction into buf and return the number of bytes used.
2442   Note that buf is not the insn's final place, and therefore it is
2443   imperative to emit position-independent code.  If the emitted
2444   instruction was a profiler inc, set *is_profInc to True, else
2445   leave it unchanged. */
2446
2447Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2448                      UChar* buf, Int nbuf, const AMD64Instr* i,
2449                      Bool mode64, VexEndness endness_host,
2450                      const void* disp_cp_chain_me_to_slowEP,
2451                      const void* disp_cp_chain_me_to_fastEP,
2452                      const void* disp_cp_xindir,
2453                      const void* disp_cp_xassisted )
2454{
2455   UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2456   UInt   xtra;
2457   UInt   reg;
2458   UChar  rex;
2459   UChar* p = &buf[0];
2460   UChar* ptmp;
2461   Int    j;
2462   vassert(nbuf >= 64);
2463   vassert(mode64 == True);
2464
2465   /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2466
2467   switch (i->tag) {
2468
2469   case Ain_Imm64:
2470      if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2471         /* Use the short form (load into 32 bit reg, + default
2472            widening rule) for constants under 1 million.  We could
2473            use this form for the range 0 to 0x7FFFFFFF inclusive, but
2474            limit it to a smaller range for verifiability purposes. */
2475         if (1 & iregEnc3(i->Ain.Imm64.dst))
2476            *p++ = 0x41;
2477         *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
2478         p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2479      } else {
2480         *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
2481         *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
2482         p = emit64(p, i->Ain.Imm64.imm64);
2483      }
2484      goto done;
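      /* E.g. with the short form above, a constant such as 0x1234 destined
         for %rdx comes out as BA 34 12 00 00 ("movl $0x1234, %edx"), relying
         on the implicit zero-extension of 32-bit writes to clear the upper
         half. */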
2485
2486   case Ain_Alu64R:
2487      /* Deal specially with MOV */
2488      if (i->Ain.Alu64R.op == Aalu_MOV) {
2489         switch (i->Ain.Alu64R.src->tag) {
2490            case Armi_Imm:
2491               if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2492                  /* Actually we could use this form for constants in
2493                     the range 0 through 0x7FFFFFFF inclusive, but
2494                     limit it to a small range for verifiability
2495                     purposes. */
2496                  /* Generate "movl $imm32, 32-bit-register" and let
2497                     the default zero-extend rule cause the upper half
2498                     of the dst to be zeroed out too.  This saves 1
2499                     and sometimes 2 bytes compared to the more
2500                     obvious encoding in the 'else' branch. */
2501                  if (1 & iregEnc3(i->Ain.Alu64R.dst))
2502                     *p++ = 0x41;
2503                  *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
2504                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2505               } else {
2506                  *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
2507                  *p++ = 0xC7;
2508                  *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
2509                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2510               }
2511               goto done;
2512            case Armi_Reg:
2513               *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2514                                  i->Ain.Alu64R.dst );
2515               *p++ = 0x89;
2516               p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2517                                i->Ain.Alu64R.dst);
2518               goto done;
2519            case Armi_Mem:
2520               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2521                                 i->Ain.Alu64R.src->Armi.Mem.am);
2522               *p++ = 0x8B;
2523               p = doAMode_M(p, i->Ain.Alu64R.dst,
2524                                i->Ain.Alu64R.src->Armi.Mem.am);
2525               goto done;
2526            default:
2527               goto bad;
2528         }
2529      }
2530      /* MUL */
2531      if (i->Ain.Alu64R.op == Aalu_MUL) {
2532         switch (i->Ain.Alu64R.src->tag) {
2533            case Armi_Reg:
2534               *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2535                                  i->Ain.Alu64R.src->Armi.Reg.reg);
2536               *p++ = 0x0F;
2537               *p++ = 0xAF;
2538               p = doAMode_R(p, i->Ain.Alu64R.dst,
2539                                i->Ain.Alu64R.src->Armi.Reg.reg);
2540               goto done;
2541            case Armi_Mem:
2542               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2543                                 i->Ain.Alu64R.src->Armi.Mem.am);
2544               *p++ = 0x0F;
2545               *p++ = 0xAF;
2546               p = doAMode_M(p, i->Ain.Alu64R.dst,
2547                                i->Ain.Alu64R.src->Armi.Mem.am);
2548               goto done;
2549            case Armi_Imm:
2550               if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2551                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2552                  *p++ = 0x6B;
2553                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2554                  *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2555               } else {
2556                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2557                  *p++ = 0x69;
2558                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2559                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2560               }
2561               goto done;
2562            default:
2563               goto bad;
2564         }
2565      }
2566      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2567      opc = opc_rr = subopc_imm = opc_imma = 0;
2568      switch (i->Ain.Alu64R.op) {
2569         case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2570                        subopc_imm = 2; opc_imma = 0x15; break;
2571         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2572                        subopc_imm = 0; opc_imma = 0x05; break;
2573         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2574                        subopc_imm = 5; opc_imma = 0x2D; break;
2575         case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2576                        subopc_imm = 3; opc_imma = 0x1D; break;
2577         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2578                        subopc_imm = 4; opc_imma = 0x25; break;
2579         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2580                        subopc_imm = 6; opc_imma = 0x35; break;
2581         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2582                        subopc_imm = 1; opc_imma = 0x0D; break;
2583         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2584                        subopc_imm = 7; opc_imma = 0x3D; break;
2585         default: goto bad;
2586      }
2587      switch (i->Ain.Alu64R.src->tag) {
2588         case Armi_Imm:
2589            if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
2590                && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2591               goto bad; /* FIXME: awaiting test case */
2592               *p++ = toUChar(opc_imma);
2593               p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2594            } else
2595            if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2596               *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
2597               *p++ = 0x83;
2598               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2599               *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2600            } else {
2601               *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
2602               *p++ = 0x81;
2603               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2604               p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2605            }
2606            goto done;
2607         case Armi_Reg:
2608            *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2609                               i->Ain.Alu64R.dst);
2610            *p++ = toUChar(opc_rr);
2611            p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2612                             i->Ain.Alu64R.dst);
2613            goto done;
2614         case Armi_Mem:
2615            *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2616                               i->Ain.Alu64R.src->Armi.Mem.am);
2617            *p++ = toUChar(opc);
2618            p = doAMode_M(p, i->Ain.Alu64R.dst,
2619                             i->Ain.Alu64R.src->Armi.Mem.am);
2620            goto done;
2621         default:
2622            goto bad;
2623      }
2624      break;
2625
2626   case Ain_Alu64M:
2627      /* Deal specially with MOV */
2628      if (i->Ain.Alu64M.op == Aalu_MOV) {
2629         switch (i->Ain.Alu64M.src->tag) {
2630            case Ari_Reg:
2631               *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2632                                 i->Ain.Alu64M.dst);
2633               *p++ = 0x89;
2634               p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2635                                i->Ain.Alu64M.dst);
2636               goto done;
2637            case Ari_Imm:
2638               *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
2639               *p++ = 0xC7;
2640               p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
2641               p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2642               goto done;
2643            default:
2644               goto bad;
2645         }
2646      }
2647      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
2648         allowed here. (This is derived from the x86 version of same). */
2649      opc = subopc_imm = opc_imma = 0;
2650      switch (i->Ain.Alu64M.op) {
2651         case Aalu_CMP: opc = 0x39; subopc_imm = 7; break;
2652         default: goto bad;
2653      }
2654      switch (i->Ain.Alu64M.src->tag) {
2655         /*
2656         case Xri_Reg:
2657            *p++ = toUChar(opc);
2658            p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2659                             i->Xin.Alu32M.dst);
2660            goto done;
2661         */
2662         case Ari_Imm:
2663            if (fits8bits(i->Ain.Alu64M.src->Ari.Imm.imm32)) {
2664               *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2665               *p++ = 0x83;
2666               p    = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2667               *p++ = toUChar(0xFF & i->Ain.Alu64M.src->Ari.Imm.imm32);
2668               goto done;
2669            } else {
2670               *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2671               *p++ = 0x81;
2672               p    = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2673               p    = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2674               goto done;
2675            }
2676         default:
2677            goto bad;
2678      }
2679
2680      break;
2681
2682   case Ain_Sh64:
2683      opc_cl = opc_imm = subopc = 0;
2684      switch (i->Ain.Sh64.op) {
2685         case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2686         case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2687         case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2688         default: goto bad;
2689      }
2690      if (i->Ain.Sh64.src == 0) {
2691         *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2692         *p++ = toUChar(opc_cl);
2693         p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2694         goto done;
2695      } else {
2696         *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2697         *p++ = toUChar(opc_imm);
2698         p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2699         *p++ = (UChar)(i->Ain.Sh64.src);
2700         goto done;
2701      }
2702      break;
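      /* A concrete instance of the immediate form (illustrative only):
         "shlq $3, %rdi" is Ash_SHL with src == 3 and assembles to
         48 C1 E7 03 -- REX.W, opc_imm 0xC1, ModRM /4 on %rdi, imm8 3. */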
2703
2704   case Ain_Test64:
2705      /* testq sign-extend($imm32), %reg */
2706      *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
2707      *p++ = 0xF7;
2708      p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
2709      p = emit32(p, i->Ain.Test64.imm32);
2710      goto done;
2711
2712   case Ain_Unary64:
2713      if (i->Ain.Unary64.op == Aun_NOT) {
2714         *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2715         *p++ = 0xF7;
2716         p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
2717         goto done;
2718      }
2719      if (i->Ain.Unary64.op == Aun_NEG) {
2720         *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2721         *p++ = 0xF7;
2722         p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
2723         goto done;
2724      }
2725      break;
2726
2727   case Ain_Lea64:
2728      *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2729      *p++ = 0x8D;
2730      p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2731      goto done;
2732
2733   case Ain_Alu32R:
2734      /* ADD/SUB/AND/OR/XOR/CMP */
2735      opc = opc_rr = subopc_imm = opc_imma = 0;
2736      switch (i->Ain.Alu32R.op) {
2737         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2738                        subopc_imm = 0; opc_imma = 0x05; break;
2739         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2740                        subopc_imm = 5; opc_imma = 0x2D; break;
2741         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2742                        subopc_imm = 4; opc_imma = 0x25; break;
2743         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2744                        subopc_imm = 6; opc_imma = 0x35; break;
2745         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2746                        subopc_imm = 1; opc_imma = 0x0D; break;
2747         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2748                        subopc_imm = 7; opc_imma = 0x3D; break;
2749         default: goto bad;
2750      }
2751      switch (i->Ain.Alu32R.src->tag) {
2752         case Armi_Imm:
2753            if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
2754                && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2755               goto bad; /* FIXME: awaiting test case */
2756               *p++ = toUChar(opc_imma);
2757               p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2758            } else
2759            if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2760               rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
2761               if (rex != 0x40) *p++ = rex;
2762               *p++ = 0x83;
2763               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2764               *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2765            } else {
2766               rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
2767               if (rex != 0x40) *p++ = rex;
2768               *p++ = 0x81;
2769               p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2770               p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2771            }
2772            goto done;
2773         case Armi_Reg:
2774            rex  = clearWBit(
2775                   rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2776                               i->Ain.Alu32R.dst) );
2777            if (rex != 0x40) *p++ = rex;
2778            *p++ = toUChar(opc_rr);
2779            p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2780                             i->Ain.Alu32R.dst);
2781            goto done;
2782         case Armi_Mem:
2783            rex  = clearWBit(
2784                   rexAMode_M( i->Ain.Alu32R.dst,
2785                               i->Ain.Alu32R.src->Armi.Mem.am) );
2786            if (rex != 0x40) *p++ = rex;
2787            *p++ = toUChar(opc);
2788            p = doAMode_M(p, i->Ain.Alu32R.dst,
2789                             i->Ain.Alu32R.src->Armi.Mem.am);
2790            goto done;
2791         default:
2792            goto bad;
2793      }
2794      break;
2795
2796   case Ain_MulL:
2797      subopc = i->Ain.MulL.syned ? 5 : 4;
2798      switch (i->Ain.MulL.src->tag)  {
2799         case Arm_Mem:
2800            *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
2801            *p++ = 0xF7;
2802            p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
2803            goto done;
2804         case Arm_Reg:
2805            *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
2806            *p++ = 0xF7;
2807            p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
2808            goto done;
2809         default:
2810            goto bad;
2811      }
2812      break;
2813
2814   case Ain_Div:
2815      subopc = i->Ain.Div.syned ? 7 : 6;
2816      if (i->Ain.Div.sz == 4) {
2817         switch (i->Ain.Div.src->tag)  {
2818            case Arm_Mem:
2819               goto bad;
2820               /*FIXME*/
2821               *p++ = 0xF7;
2822               p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2823               goto done;
2824            case Arm_Reg:
2825               *p++ = clearWBit(
2826                      rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
2827               *p++ = 0xF7;
2828               p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2829               goto done;
2830            default:
2831               goto bad;
2832         }
2833      }
2834      if (i->Ain.Div.sz == 8) {
2835         switch (i->Ain.Div.src->tag)  {
2836            case Arm_Mem:
2837               *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
2838               *p++ = 0xF7;
2839               p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2840               goto done;
2841            case Arm_Reg:
2842               *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
2843               *p++ = 0xF7;
2844               p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2845               goto done;
2846            default:
2847               goto bad;
2848         }
2849      }
2850      break;
2851
2852   case Ain_Push:
2853      switch (i->Ain.Push.src->tag) {
2854         case Armi_Mem:
2855            *p++ = clearWBit(
2856                   rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
2857            *p++ = 0xFF;
2858            p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
2859            goto done;
2860         case Armi_Imm:
2861            *p++ = 0x68;
2862            p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
2863            goto done;
2864         case Armi_Reg:
2865            *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
2866            *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
2867            goto done;
2868        default:
2869            goto bad;
2870      }
2871
2872   case Ain_Call: {
2873      /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
2874         above, %r11 is used as an address temporary. */
2875      /* If we don't need to do any fixup actions in the case that the
2876         call doesn't happen, just do the simple thing and emit
2877         straight-line code.  This is usually the case. */
2878      if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
2879          || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
2880         /* jump over the following two insns if the condition does
2881            not hold */
2882         Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
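         /* Note on the conditional byte below: Jcc rel8 is 0x70+cc, and the
            condition codes used here come in complementary pairs differing
            only in bit 0, so (cond ^ 1) selects the negated condition. */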
2883         if (i->Ain.Call.cond != Acc_ALWAYS) {
2884            *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2885            *p++ = shortImm ? 10 : 13;
2886            /* 10 or 13 bytes in the next two insns */
2887         }
2888         if (shortImm) {
2889            /* 7 bytes: movl sign-extend(imm32), %r11 */
2890            *p++ = 0x49;
2891            *p++ = 0xC7;
2892            *p++ = 0xC3;
2893            p = emit32(p, (UInt)i->Ain.Call.target);
2894         } else {
2895            /* 10 bytes: movabsq $target, %r11 */
2896            *p++ = 0x49;
2897            *p++ = 0xBB;
2898            p = emit64(p, i->Ain.Call.target);
2899         }
2900         /* 3 bytes: call *%r11 */
2901         *p++ = 0x41;
2902         *p++ = 0xFF;
2903         *p++ = 0xD3;
2904      } else {
2905         Int delta;
2906         /* Complex case.  We have to generate an if-then-else diamond. */
2907         // before:
2908         //   j{!cond} else:
2909         //   movabsq $target, %r11
2910         //   call* %r11
2911         // preElse:
2912         //   jmp after:
2913         // else:
2914         //   movabsq $0x5555555555555555, %rax  // possibly
2915         //   movq %rax, %rdx                    // possibly
2916         // after:
2917
2918         // before:
2919         UChar* pBefore = p;
2920
2921         //   j{!cond} else:
2922         *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2923         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2924
2925         //   movabsq $target, %r11
2926         *p++ = 0x49;
2927         *p++ = 0xBB;
2928         p = emit64(p, i->Ain.Call.target);
2929
2930         //   call* %r11
2931         *p++ = 0x41;
2932         *p++ = 0xFF;
2933         *p++ = 0xD3;
2934
2935         // preElse:
2936         UChar* pPreElse = p;
2937
2938         //   jmp after:
2939         *p++ = 0xEB;
2940         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2941
2942         // else:
2943         UChar* pElse = p;
2944
2945         /* Do the 'else' actions */
2946         switch (i->Ain.Call.rloc.pri) {
2947            case RLPri_Int:
2948               // movabsq $0x5555555555555555, %rax
2949               *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
2950               break;
2951            case RLPri_2Int:
2952               goto bad; //ATC
2953               // movabsq $0x5555555555555555, %rax
2954               *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
2955               // movq %rax, %rdx
2956               *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
2957               break;
2958            case RLPri_V128SpRel:
2959               if (i->Ain.Call.rloc.spOff == 0) {
2960                  // We could accept any |spOff| here, but that's more
2961                  // hassle and the only value we're ever going to get
2962                  // is zero (I believe).  Hence take the easy path :)
2963                  // We need a scratch register -- r11 can be it.
2964                  // movabsq $0x5555555555555555, %r11
2965                  *p++ = 0x49; *p++ = 0xBB;
2966                  p = emit64(p, 0x5555555555555555ULL);
2967                  // movq %r11, 0(%rsp)
2968                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
2969                  // movq %r11, 8(%rsp)
2970                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
2971                  *p++ = 0x08;
2972                  break;
2973               }
2974               goto bad; //ATC for all other spOff values
2975            case RLPri_V256SpRel:
2976               goto bad; //ATC
2977            case RLPri_None: case RLPri_INVALID: default:
2978               vassert(0); // should never get here
2979         }
2980
2981         // after:
2982         UChar* pAfter = p;
2983
2984         // Fix up the branch offsets.  The +2s in the offset
2985         // calculations are there because x86 requires conditional
2986         // branches to have their offset stated relative to the
2987         // instruction immediately following the branch insn.  And in
2988         // both cases the branch insns are 2 bytes long.
2989
2990         // First, the "j{!cond} else:" at pBefore.
2991         delta = (Int)(Long)(pElse - (pBefore + 2));
2992         vassert(delta >= 0 && delta < 100/*arbitrary*/);
2993         *(pBefore+1) = (UChar)delta;
2994
2995         // And secondly, the "jmp after:" at pPreElse.
2996         delta = (Int)(Long)(pAfter - (pPreElse + 2));
2997         vassert(delta >= 0 && delta < 100/*arbitrary*/);
2998         *(pPreElse+1) = (UChar)delta;
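         /* Worked example (illustrative only): for an RLPri_Int call, the
            'then' arm is 10 (movabsq) + 3 (call) = 13 bytes and the 'else'
            arm is 10 bytes (movabsq into %rax).  With pBefore at offset 0,
            pPreElse lands at 15, pElse at 17 and pAfter at 27, so the two
            patch bytes written just above come out as 17-(0+2) = 15 and
            27-(15+2) = 10. */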
2999      }
3000      goto done;
3001   }
3002
3003   case Ain_XDirect: {
3004      /* NB: what goes on here has to be very closely coordinated with the
3005         chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
3006      /* We're generating chain-me requests here, so we need to be
3007         sure this is actually allowed -- no-redir translations can't
3008         use chain-me's.  Hence: */
3009      vassert(disp_cp_chain_me_to_slowEP != NULL);
3010      vassert(disp_cp_chain_me_to_fastEP != NULL);
3011
3012      HReg r11 = hregAMD64_R11();
3013
3014      /* Use ptmp for backpatching conditional jumps. */
3015      ptmp = NULL;
3016
3017      /* First off, if this is conditional, create a conditional
3018         jump over the rest of it. */
3019      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3020         /* jmp fwds if !condition */
3021         *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
3022         ptmp = p; /* fill in this bit later */
3023         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3024      }
3025
3026      /* Update the guest RIP. */
3027      if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
3028         /* use a shorter encoding */
3029         /* movl sign-extend(dstGA), %r11 */
3030         *p++ = 0x49;
3031         *p++ = 0xC7;
3032         *p++ = 0xC3;
3033         p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
3034      } else {
3035         /* movabsq $dstGA, %r11 */
3036         *p++ = 0x49;
3037         *p++ = 0xBB;
3038         p = emit64(p, i->Ain.XDirect.dstGA);
3039      }
3040
3041      /* movq %r11, amRIP */
3042      *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
3043      *p++ = 0x89;
3044      p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
3045
3046      /* --- FIRST PATCHABLE BYTE follows --- */
3047      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
3048         to) backs up the return address, so as to find the address of
3049         the first patchable byte.  So: don't change the length of the
3050         two instructions below. */
3051      /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
3052      *p++ = 0x49;
3053      *p++ = 0xBB;
3054      const void* disp_cp_chain_me
3055               = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
3056                                         : disp_cp_chain_me_to_slowEP;
3057      p = emit64(p, (Addr)disp_cp_chain_me);
3058      /* call *%r11 */
3059      *p++ = 0x41;
3060      *p++ = 0xFF;
3061      *p++ = 0xD3;
3062      /* --- END of PATCHABLE BYTES --- */
3063
3064      /* Fix up the conditional jump, if there was one. */
3065      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3066         Int delta = p - ptmp;
3067         vassert(delta > 0 && delta < 40);
3068         *ptmp = toUChar(delta-1);
3069      }
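      /* Worked example (illustrative only): with a 32-bit dstGA and an
         amRIP of the form disp8(%rbp), the code after the Jcc is
         7 (movl) + 4 (movq %r11,amRIP) + 10 (movabsq) + 3 (call) = 24
         bytes, so the rel8 byte patched in above is 24. */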
3070      goto done;
3071   }
3072
3073   case Ain_XIndir: {
3074      /* We're generating transfers that could lead indirectly to a
3075         chain-me, so we need to be sure this is actually allowed --
3076         no-redir translations are not allowed to reach normal
3077         translations without going through the scheduler.  That means
3078         no XDirects or XIndirs out from no-redir translations.
3079         Hence: */
3080      vassert(disp_cp_xindir != NULL);
3081
3082      /* Use ptmp for backpatching conditional jumps. */
3083      ptmp = NULL;
3084
3085      /* First off, if this is conditional, create a conditional
3086         jump over the rest of it. */
3087      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3088         /* jmp fwds if !condition */
3089         *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
3090         ptmp = p; /* fill in this bit later */
3091         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3092      }
3093
3094      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3095      *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3096      *p++ = 0x89;
3097      p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3098
3099      /* get $disp_cp_xindir into %r11 */
3100      if (fitsIn32Bits((Addr)disp_cp_xindir)) {
3101         /* use a shorter encoding */
3102         /* movl sign-extend(disp_cp_xindir), %r11 */
3103         *p++ = 0x49;
3104         *p++ = 0xC7;
3105         *p++ = 0xC3;
3106         p = emit32(p, (UInt)(Addr)disp_cp_xindir);
3107      } else {
3108         /* movabsq $disp_cp_xindir, %r11 */
3109         *p++ = 0x49;
3110         *p++ = 0xBB;
3111         p = emit64(p, (Addr)disp_cp_xindir);
3112      }
3113
3114      /* jmp *%r11 */
3115      *p++ = 0x41;
3116      *p++ = 0xFF;
3117      *p++ = 0xE3;
3118
3119      /* Fix up the conditional jump, if there was one. */
3120      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3121         Int delta = p - ptmp;
3122         vassert(delta > 0 && delta < 40);
3123         *ptmp = toUChar(delta-1);
3124      }
3125      goto done;
3126   }
3127
3128   case Ain_XAssisted: {
3129      /* Use ptmp for backpatching conditional jumps. */
3130      ptmp = NULL;
3131
3132      /* First off, if this is conditional, create a conditional
3133         jump over the rest of it. */
3134      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3135         /* jmp fwds if !condition */
3136         *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
3137         ptmp = p; /* fill in this bit later */
3138         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3139      }
3140
3141      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3142      *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3143      *p++ = 0x89;
3144      p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3145      /* movl $magic_number, %ebp.  Since these numbers are all small positive
3146         integers, we can get away with "movl $N, %ebp" rather than
3147         the longer "movq $N, %rbp". */
3148      UInt trcval = 0;
3149      switch (i->Ain.XAssisted.jk) {
3150         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
3151         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
3152         case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
3153         case Ijk_Sys_int210:  trcval = VEX_TRC_JMP_SYS_INT210;  break;
3154         case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
3155         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
3156         case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
3157         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
3158         case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
3159         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
3160         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
3161         case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
3162         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
3163         /* We don't expect to see the following being assisted. */
3164         case Ijk_Ret:
3165         case Ijk_Call:
3166         /* fallthrough */
3167         default:
3168            ppIRJumpKind(i->Ain.XAssisted.jk);
3169            vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
3170      }
3171      vassert(trcval != 0);
3172      *p++ = 0xBD;
3173      p = emit32(p, trcval);
3174      /* movabsq $disp_assisted, %r11 */
3175      *p++ = 0x49;
3176      *p++ = 0xBB;
3177      p = emit64(p, (Addr)disp_cp_xassisted);
3178      /* jmp *%r11 */
3179      *p++ = 0x41;
3180      *p++ = 0xFF;
3181      *p++ = 0xE3;
3182
3183      /* Fix up the conditional jump, if there was one. */
3184      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3185         Int delta = p - ptmp;
3186         vassert(delta > 0 && delta < 40);
3187         *ptmp = toUChar(delta-1);
3188      }
3189      goto done;
3190   }
3191
3192   case Ain_CMov64:
3193      vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
3194      *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
3195      *p++ = 0x0F;
3196      *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
3197      p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
3198      goto done;
3199
3200   case Ain_CLoad: {
3201      vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
3202
3203      /* Only 32- or 64-bit variants are allowed. */
3204      vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
3205
3206      /* Use ptmp for backpatching conditional jumps. */
3207      ptmp = NULL;
3208
3209      /* jmp fwds if !condition */
3210      *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
3211      ptmp = p; /* fill in this bit later */
3212      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3213
3214      /* Now the load.  Either a normal 64 bit load or a normal 32 bit
3215         load, which, by the default zero-extension rule, zeroes out
3216         the upper half of the destination, as required. */
3217      rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3218      *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
3219      *p++ = 0x8B;
3220      p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3221
3222      /* Fix up the conditional branch */
3223      Int delta = p - ptmp;
3224      vassert(delta > 0 && delta < 40);
3225      *ptmp = toUChar(delta-1);
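      /* Worked example (illustrative only): a conditional 8-byte load
         "if (Acc_Z) movq 0(%rax), %rbx" comes out as 75 03 48 8B 18 --
         a jnz over the 3-byte load, with 3 as the patched rel8 value. */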
3226      goto done;
3227   }
3228
3229   case Ain_CStore: {
3230      /* AFAICS this is identical to Ain_CLoad except that the opcode
3231         is 0x89 instead of 0x8B. */
3232      vassert(i->Ain.CStore.cond != Acc_ALWAYS);
3233
3234      /* Only 32- or 64-bit variants are allowed. */
3235      vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
3236
3237      /* Use ptmp for backpatching conditional jumps. */
3238      ptmp = NULL;
3239
3240      /* jmp fwds if !condition */
3241      *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
3242      ptmp = p; /* fill in this bit later */
3243      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3244
3245      /* Now the store. */
3246      rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
3247      *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
3248      *p++ = 0x89;
3249      p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
3250
3251      /* Fix up the conditional branch */
3252      Int delta = p - ptmp;
3253      vassert(delta > 0 && delta < 40);
3254      *ptmp = toUChar(delta-1);
3255      goto done;
3256   }
3257
3258   case Ain_MovxLQ:
3259      /* The arg order differs in the S vs Z cases: movslq (0x63) puts the
3260         dst in the ModRM reg field; the mov (0x89) used for Z puts the src. */
3261      if (i->Ain.MovxLQ.syned) {
3262         /* Need REX.W = 1 here, but rexAMode_R does that for us. */
3263         *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3264         *p++ = 0x63;
3265         p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3266      } else {
3267         /* Produce a 32-bit reg-reg move, since the implicit
3268            zero-extend does what we want. */
3269         *p++ = clearWBit (
3270                   rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
3271         *p++ = 0x89;
3272         p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
3273      }
3274      goto done;
3275
3276   case Ain_LoadEX:
3277      if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
3278         /* movzbq */
3279         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3280         *p++ = 0x0F;
3281         *p++ = 0xB6;
3282         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3283         goto done;
3284      }
3285      if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
3286         /* movzwq */
3287         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3288         *p++ = 0x0F;
3289         *p++ = 0xB7;
3290         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3291         goto done;
3292      }
3293      if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
3294         /* movzlq */
3295         /* This isn't really an existing AMD64 instruction per se.
3296            Rather, we have to do a 32-bit load.  Because a 32-bit
3297            write implicitly clears the upper 32 bits of the target
3298            register, we get what we want. */
3299         *p++ = clearWBit(
3300                rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
3301         *p++ = 0x8B;
3302         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3303         goto done;
3304      }
3305      break;
3306
3307   case Ain_Set64:
3308      /* Make the destination register be 1 or 0, depending on whether
3309         the relevant condition holds.  Complication: the top 56 bits
3310         of the destination should be forced to zero, but doing 'xorq
3311         %r,%r' kills the flag(s) we are about to read.  Sigh.  So
3312         start off by moving $0 into the dest. */
3313      reg = iregEnc3210(i->Ain.Set64.dst);
3314      vassert(reg < 16);
3315
3316      /* movq $0, %dst */
3317      *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
3318      *p++ = 0xC7;
3319      *p++ = toUChar(0xC0 + (reg & 7));
3320      p = emit32(p, 0);
3321
3322      /* set<cond> lo8(%dst) */
3323      /* note, 8-bit register rex trickiness.  Be careful here. */
3324      *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
3325      *p++ = 0x0F;
3326      *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
3327      *p++ = toUChar(0xC0 + (reg & 7));
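      /* Worked example (illustrative only): Set64 Acc_Z into %rax gives
         48 C7 C0 00 00 00 00 (movq $0,%rax) then 40 0F 94 C0 (setz %al). */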
3328      goto done;
3329
3330   case Ain_Bsfr64:
3331      *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3332      *p++ = 0x0F;
3333      if (i->Ain.Bsfr64.isFwds) {
3334         *p++ = 0xBC;
3335      } else {
3336         *p++ = 0xBD;
3337      }
3338      p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3339      goto done;
3340
3341   case Ain_MFence:
3342      /* mfence */
3343      *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
3344      goto done;
3345
3346   case Ain_ACAS:
3347      /* lock */
3348      *p++ = 0xF0;
3349      if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
3350      /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
3351         in %rbx.  The new-value register is hardwired to be %rbx
3352         since dealing with byte integer registers is too much hassle,
3353         so we force the register operand to %rbx (could equally be
3354         %rcx or %rdx). */
3355      rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3356      if (i->Ain.ACAS.sz != 8)
3357         rex = clearWBit(rex);
3358
3359      *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3360      *p++ = 0x0F;
3361      if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3362      p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
3363      goto done;
3364
3365   case Ain_DACAS:
3366      /* lock */
3367      *p++ = 0xF0;
3368      /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
3369         value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
3370         aren't encoded in the insn. */
3371      rex = rexAMode_M_enc(1, i->Ain.DACAS.addr );
3372      if (i->Ain.DACAS.sz != 8)
3373         rex = clearWBit(rex);
3374      *p++ = rex;
3375      *p++ = 0x0F;
3376      *p++ = 0xC7;
3377      p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
3378      goto done;
3379
3380   case Ain_A87Free:
3381      vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3382      for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3383         p = do_ffree_st(p, 7-j);
3384      }
3385      goto done;
3386
3387   case Ain_A87PushPop:
3388      vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3389      if (i->Ain.A87PushPop.isPush) {
3390         /* Load from memory into %st(0): flds/fldl amode */
3391         *p++ = clearWBit(
3392                   rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
3393         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3394         p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
3395      } else {
3396         /* Dump %st(0) to memory: fstps/fstpl amode */
3397         *p++ = clearWBit(
3398                   rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
3399         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3400         p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
3402      }
3403      goto done;
3404
3405   case Ain_A87FpOp:
3406      switch (i->Ain.A87FpOp.op) {
3407         case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
3408         case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
3409         case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
3410         case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
3411         case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
3412         case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
3413         case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
3414         case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
3415         case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3416         case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
3417         case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
3418         case Afp_TAN:
3419            /* fptan pushes 1.0 on the FP stack, except when the
3420               argument is out of range.  Hence we have to do the
3421               instruction, then inspect C2 to see if there is an out
3422               of range condition.  If there is, we skip the fincstp
3423               that is used by the in-range case to get rid of this
3424               extra 1.0 value. */
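            /* (C2 is bit 10 of the x87 status word, hence the 0x0400
               mask tested below.) */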
3425            *p++ = 0xD9; *p++ = 0xF2; // fptan
3426            *p++ = 0x50;              // pushq %rax
3427            *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
3428            *p++ = 0x66; *p++ = 0xA9;
3429            *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
3430            *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
3431            *p++ = 0xD9; *p++ = 0xF7; // fincstp
3432            *p++ = 0x58;              // after_fincstp: popq %rax
3433            break;
3434         default:
3435            goto bad;
3436      }
3437      goto done;
3438
3439   case Ain_A87LdCW:
3440      *p++ = clearWBit(
3441                rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
3442      *p++ = 0xD9;
3443      p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
3444      goto done;
3445
3446   case Ain_A87StSW:
3447      *p++ = clearWBit(
3448                rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
3449      *p++ = 0xDD;
3450      p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
3451      goto done;
3452
3453   case Ain_Store:
3454      if (i->Ain.Store.sz == 2) {
3455         /* This just goes to show the craziness of the instruction
3456            set encoding.  We have to insert two prefix bytes, but be
3457            careful to avoid a conflict in what the size should be, by
3458            ensuring that REX.W = 0. */
3459         *p++ = 0x66; /* override to 16-bits */
3460         *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3461         *p++ = 0x89;
3462         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3463         goto done;
3464      }
3465      if (i->Ain.Store.sz == 4) {
3466         *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3467         *p++ = 0x89;
3468         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3469         goto done;
3470      }
3471      if (i->Ain.Store.sz == 1) {
3472         /* This is one place where it would be wrong to skip emitting
3473            a rex byte of 0x40, since the mere presence of rex changes
3474            the meaning of the byte register access.  Be careful. */
3475         *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3476         *p++ = 0x88;
3477         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3478         goto done;
3479      }
3480      break;
3481
3482   case Ain_LdMXCSR:
3483      *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
3484      *p++ = 0x0F;
3485      *p++ = 0xAE;
3486      p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
3487      goto done;
3488
3489   case Ain_SseUComIS:
3490      /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
3491      /* ucomi[sd] %srcL, %srcR */
3492      if (i->Ain.SseUComIS.sz == 8) {
3493         *p++ = 0x66;
3494      } else {
3495         vassert(i->Ain.SseUComIS.sz == 4); /* the only other size expected */
3496         goto bad; /* the sz == 4 case isn't handled yet */
3497      }
3498      *p++ = clearWBit (
3499             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
3500                                 vregEnc3210(i->Ain.SseUComIS.srcR) ));
3501      *p++ = 0x0F;
3502      *p++ = 0x2E;
3503      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
3504                               vregEnc3210(i->Ain.SseUComIS.srcR) );
3505      /* pushfq */
3506      *p++ = 0x9C;
3507      /* popq %dst */
3508      *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
3509      *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
3510      goto done;
3511
3512   case Ain_SseSI2SF:
3513      /* cvtsi2s[sd] %src, %dst */
3514      rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
3515                                i->Ain.SseSI2SF.src );
3516      *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3517      *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3518      *p++ = 0x0F;
3519      *p++ = 0x2A;
3520      p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
3521                                i->Ain.SseSI2SF.src );
3522      goto done;
3523
3524   case Ain_SseSF2SI:
3525      /* cvts[sd]2si %src, %dst */
3526      rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
3527                                vregEnc3210(i->Ain.SseSF2SI.src) );
3528      *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3529      *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3530      *p++ = 0x0F;
3531      *p++ = 0x2D;
3532      p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
3533                                vregEnc3210(i->Ain.SseSF2SI.src) );
3534      goto done;
3535
3536   case Ain_SseSDSS:
3537      /* cvtsd2ss/cvtss2sd %src, %dst */
3538      *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3539      *p++ = clearWBit(
3540              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
3541                                  vregEnc3210(i->Ain.SseSDSS.src) ));
3542      *p++ = 0x0F;
3543      *p++ = 0x5A;
3544      p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
3545                                vregEnc3210(i->Ain.SseSDSS.src) );
3546      goto done;
3547
3548   case Ain_SseLdSt:
3549      if (i->Ain.SseLdSt.sz == 8) {
3550         *p++ = 0xF2;
3551      } else
3552      if (i->Ain.SseLdSt.sz == 4) {
3553         *p++ = 0xF3;
3554      } else
3555      if (i->Ain.SseLdSt.sz != 16) {
3556         vassert(0);
3557      }
3558      *p++ = clearWBit(
3559             rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
3560                            i->Ain.SseLdSt.addr));
3561      *p++ = 0x0F;
3562      *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3563      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
3564                           i->Ain.SseLdSt.addr);
3565      goto done;
3566
3567   case Ain_SseCStore: {
3568      vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
3569
3570      /* Use ptmp for backpatching conditional jumps. */
3571      ptmp = NULL;
3572
3573      /* jmp fwds if !condition */
3574      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
3575      ptmp = p; /* fill in this bit later */
3576      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3577
3578      /* Now the store. */
3579      *p++ = clearWBit(
3580             rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
3581                            i->Ain.SseCStore.addr));
3582      *p++ = 0x0F;
3583      *p++ = toUChar(0x11);
3584      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
3585                           i->Ain.SseCStore.addr);
3586
3587      /* Fix up the conditional branch */
3588      Int delta = p - ptmp;
3589      vassert(delta > 0 && delta < 40);
3590      *ptmp = toUChar(delta-1);
3591      goto done;
3592   }
3593
3594   case Ain_SseCLoad: {
3595      vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
3596
3597      /* Use ptmp for backpatching conditional jumps. */
3598      ptmp = NULL;
3599
3600      /* jmp fwds if !condition */
3601      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
3602      ptmp = p; /* fill in this bit later */
3603      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3604
3605      /* Now the load. */
3606      *p++ = clearWBit(
3607             rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
3608                            i->Ain.SseCLoad.addr));
3609      *p++ = 0x0F;
3610      *p++ = toUChar(0x10);
3611      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
3612                           i->Ain.SseCLoad.addr);
3613
3614      /* Fix up the conditional branch */
3615      Int delta = p - ptmp;
3616      vassert(delta > 0 && delta < 40);
3617      *ptmp = toUChar(delta-1);
3618      goto done;
3619   }
3620
3621   case Ain_SseLdzLO:
3622      vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3623      /* movs[sd] amode, %xmm-dst */
3624      *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3625      *p++ = clearWBit(
3626             rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
3627                            i->Ain.SseLdzLO.addr));
3628      *p++ = 0x0F;
3629      *p++ = 0x10;
3630      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
3631                           i->Ain.SseLdzLO.addr);
3632      goto done;
3633
3634   case Ain_Sse32Fx4:
3635      xtra = 0;
3636      *p++ = clearWBit(
3637             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
3638                                 vregEnc3210(i->Ain.Sse32Fx4.src) ));
3639      *p++ = 0x0F;
3640      switch (i->Ain.Sse32Fx4.op) {
3641         case Asse_ADDF:   *p++ = 0x58; break;
3642         case Asse_DIVF:   *p++ = 0x5E; break;
3643         case Asse_MAXF:   *p++ = 0x5F; break;
3644         case Asse_MINF:   *p++ = 0x5D; break;
3645         case Asse_MULF:   *p++ = 0x59; break;
3646         case Asse_RCPF:   *p++ = 0x53; break;
3647         case Asse_RSQRTF: *p++ = 0x52; break;
3648         case Asse_SQRTF:  *p++ = 0x51; break;
3649         case Asse_SUBF:   *p++ = 0x5C; break;
3650         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3651         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3652         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3653         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3654         default: goto bad;
3655      }
3656      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32Fx4.dst),
3657                               vregEnc3210(i->Ain.Sse32Fx4.src) );
3658      if (xtra & 0x100)
3659         *p++ = toUChar(xtra & 0xFF);
3660      goto done;
3661
3662   case Ain_Sse64Fx2:
3663      xtra = 0;
3664      *p++ = 0x66;
3665      *p++ = clearWBit(
3666             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
3667                                 vregEnc3210(i->Ain.Sse64Fx2.src) ));
3668      *p++ = 0x0F;
3669      switch (i->Ain.Sse64Fx2.op) {
3670         case Asse_ADDF:   *p++ = 0x58; break;
3671         case Asse_DIVF:   *p++ = 0x5E; break;
3672         case Asse_MAXF:   *p++ = 0x5F; break;
3673         case Asse_MINF:   *p++ = 0x5D; break;
3674         case Asse_MULF:   *p++ = 0x59; break;
3675         case Asse_SQRTF:  *p++ = 0x51; break;
3676         case Asse_SUBF:   *p++ = 0x5C; break;
3677         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3678         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3679         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3680         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3681         default: goto bad;
3682      }
3683      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
3684                               vregEnc3210(i->Ain.Sse64Fx2.src) );
3685      if (xtra & 0x100)
3686         *p++ = toUChar(xtra & 0xFF);
3687      goto done;
3688
3689   case Ain_Sse32FLo:
3690      xtra = 0;
3691      *p++ = 0xF3;
3692      *p++ = clearWBit(
3693             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
3694                                 vregEnc3210(i->Ain.Sse32FLo.src) ));
3695      *p++ = 0x0F;
3696      switch (i->Ain.Sse32FLo.op) {
3697         case Asse_ADDF:   *p++ = 0x58; break;
3698         case Asse_DIVF:   *p++ = 0x5E; break;
3699         case Asse_MAXF:   *p++ = 0x5F; break;
3700         case Asse_MINF:   *p++ = 0x5D; break;
3701         case Asse_MULF:   *p++ = 0x59; break;
3702         case Asse_RCPF:   *p++ = 0x53; break;
3703         case Asse_RSQRTF: *p++ = 0x52; break;
3704         case Asse_SQRTF:  *p++ = 0x51; break;
3705         case Asse_SUBF:   *p++ = 0x5C; break;
3706         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3707         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3708         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3709         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3710         default: goto bad;
3711      }
3712      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
3713                               vregEnc3210(i->Ain.Sse32FLo.src) );
3714      if (xtra & 0x100)
3715         *p++ = toUChar(xtra & 0xFF);
3716      goto done;
3717
3718   case Ain_Sse64FLo:
3719      xtra = 0;
3720      *p++ = 0xF2;
3721      *p++ = clearWBit(
3722             rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
3723                                 vregEnc3210(i->Ain.Sse64FLo.src) ));
3724      *p++ = 0x0F;
3725      switch (i->Ain.Sse64FLo.op) {
3726         case Asse_ADDF:   *p++ = 0x58; break;
3727         case Asse_DIVF:   *p++ = 0x5E; break;
3728         case Asse_MAXF:   *p++ = 0x5F; break;
3729         case Asse_MINF:   *p++ = 0x5D; break;
3730         case Asse_MULF:   *p++ = 0x59; break;
3731         case Asse_SQRTF:  *p++ = 0x51; break;
3732         case Asse_SUBF:   *p++ = 0x5C; break;
3733         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3734         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3735         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3736         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3737         default: goto bad;
3738      }
3739      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
3740                               vregEnc3210(i->Ain.Sse64FLo.src) );
3741      if (xtra & 0x100)
3742         *p++ = toUChar(xtra & 0xFF);
3743      goto done;
3744
3745   case Ain_SseReRg:
3746#     define XX(_n) *p++ = (_n)
3747
3748      rex = clearWBit(
3749            rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
3750                                vregEnc3210(i->Ain.SseReRg.src) ));
3751
3752      switch (i->Ain.SseReRg.op) {
3753         case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3754         case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
3755         case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
3756         case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
3757         case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
3758         case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3759         case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3760         case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3761         case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3762         case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3763         case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3764         case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3765         case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3766         case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3767         case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3768         case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3769         case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3770         case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3771         case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3772         case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3773         case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3774         case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3775         case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3776         case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3777         case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3778         case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3779         case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3780         case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3781         case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3782         case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3783         case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3784         case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3785         case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
3786         case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
3787         case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
3788         case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
3789         case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
3790         case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
3791         case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
3792         case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
3793         case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
3794         case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
3795         case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
3796         case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
3797         case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
3798         case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
3799         case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
3800         case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
3801         case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
3802         case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
3803         case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
3804         case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
3805         case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
3806         case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
3807         case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
3808         default: goto bad;
3809      }
3810      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
3811                               vregEnc3210(i->Ain.SseReRg.src) );
3812#     undef XX
3813      goto done;
3814
3815   case Ain_SseCMov:
3816      /* jmp fwds if !condition */
3817      *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
3818      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3819      ptmp = p;
3820
3821      /* movaps %src, %dst */
3822      *p++ = clearWBit(
3823             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
3824                                 vregEnc3210(i->Ain.SseCMov.src) ));
3825      *p++ = 0x0F;
3826      *p++ = 0x28;
3827      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
3828                               vregEnc3210(i->Ain.SseCMov.src) );
3829
3830      /* Fill in the jump offset. */
3831      *(ptmp-1) = toUChar(p - ptmp);
3832      goto done;
3833
3834   case Ain_SseShuf:
3835      *p++ = 0x66;
3836      *p++ = clearWBit(
3837             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
3838                                 vregEnc3210(i->Ain.SseShuf.src) ));
3839      *p++ = 0x0F;
3840      *p++ = 0x70;
3841      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
3842                               vregEnc3210(i->Ain.SseShuf.src) );
3843      *p++ = (UChar)(i->Ain.SseShuf.order);
3844      goto done;
3845
3846   //uu case Ain_AvxLdSt: {
3847   //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
3848   //uu                           i->Ain.AvxLdSt.addr );
3849   //uu    p = emitVexPrefix(p, vex);
3850   //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
3851   //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
3852   //uu      goto done;
3853   //uu }
3854
3855   case Ain_EvCheck: {
3856      /* We generate:
3857            (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
3858            (2 bytes)  jns  nofail     expected taken
3859            (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
3860            nofail:
3861      */
3862      /* This is heavily asserted re instruction lengths.  It needs to
3863         be.  If we are given unexpected forms of .amCounter or
3864         .amFailAddr -- basically, anything that's not of the form
3865         uimm7(%rbp) -- those assertions are likely to fail. */
3866      /* Note also that after the decl we must be very careful not to
3867         read the carry flag, else we get a partial flags stall.
3868         js/jns avoids that, though. */
3869      UChar* p0 = p;
3870      /* ---  decl 8(%rbp) --- */
3871      /* Need to compute the REX byte for the decl in order to prove
3872         that we don't need it, since this is a 32-bit dec and all
3873         registers involved in the amode are < r8.  "1" because
3874         there's no register in this encoding; instead the register
3875         field is used as a sub opcode.  The encoding for "decl r/m32"
3876         is FF /1, hence the "1". */
3877      rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
3878      if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
3879      *p++ = 0xFF;
3880      p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
3881      vassert(p - p0 == 3);
3882      /* --- jns nofail --- */
3883      *p++ = 0x79;
3884      *p++ = 0x03; /* need to check this 0x03 after the next insn */
3885      vassert(p - p0 == 5);
3886      /* --- jmp* 0(%rbp) --- */
3887      /* Once again, verify we don't need REX.  The encoding is FF /4.
3888         We don't need REX.W since by default FF /4 in 64-bit mode
3889         implies a 64 bit load. */
3890      rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
3891      if (rex != 0x40) goto bad;
3892      *p++ = 0xFF;
3893      p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
3894      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3895      /* And crosscheck .. */
3896      vassert(evCheckSzB_AMD64() == 8);
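      /* Worked example (illustrative only): with amCounter == 8(%rbp) and
         amFailAddr == 0(%rbp), the bytes are FF 4D 08 (decl), 79 03 (jns)
         and FF 65 00 (jmp*), 8 in total as promised. */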
3897      goto done;
3898   }
3899
3900   case Ain_ProfInc: {
3901      /* We generate   movabsq $0, %r11
3902                       incq (%r11)
3903         in the expectation that a later call to LibVEX_patchProfCtr
3904         will be used to fill in the immediate field once the right
3905         value is known.
3906         49 BB 00 00 00 00 00 00 00 00
3907         49 FF 03
3908      */
3909      *p++ = 0x49; *p++ = 0xBB;
3910      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3911      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3912      *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
3913      /* Tell the caller .. */
3914      vassert(!(*is_profInc));
3915      *is_profInc = True;
3916      goto done;
3917   }
3918
3919   default:
3920      goto bad;
3921   }
3922
3923  bad:
3924   ppAMD64Instr(i, mode64);
3925   vpanic("emit_AMD64Instr");
3926   /*NOTREACHED*/
3927
3928  done:
3929   vassert(p - &buf[0] <= 64);
3930   return p - &buf[0];
3931}
3932
3933
3934/* How big is an event check?  See case for Ain_EvCheck in
3935   emit_AMD64Instr just above.  That crosschecks what this returns, so
3936   we can tell if we're inconsistent. */
3937Int evCheckSzB_AMD64 (void)
3938{
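   /* 3 (decl) + 2 (jns) + 3 (jmp*) -- see the Ain_EvCheck case above. */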
3939   return 8;
3940}
3941
3942
3943/* NB: what goes on here has to be very closely coordinated with the
3944   emitInstr case for XDirect, above. */
3945VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
3946                                   void* place_to_chain,
3947                                   const void* disp_cp_chain_me_EXPECTED,
3948                                   const void* place_to_jump_to )
3949{
3950   vassert(endness_host == VexEndnessLE);
3951
3952   /* What we're expecting to see is:
3953        movabsq $disp_cp_chain_me_EXPECTED, %r11
3954        call *%r11
3955      viz
3956        49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
3957        41 FF D3
3958   */
3959   UChar* p = (UChar*)place_to_chain;
3960   vassert(p[0] == 0x49);
3961   vassert(p[1] == 0xBB);
3962   vassert(read_misaligned_ULong_LE(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
3963   vassert(p[10] == 0x41);
3964   vassert(p[11] == 0xFF);
3965   vassert(p[12] == 0xD3);
3966   /* And what we want to change it to is either:
3967        (general case):
3968          movabsq $place_to_jump_to, %r11
3969          jmpq *%r11
3970        viz
3971          49 BB <8 bytes value == place_to_jump_to>
3972          41 FF E3
3973        So it's the same length (convenient, huh) and we don't
3974        need to change all the bits.
3975      ---OR---
3976        in the case where the displacement falls within 32 bits
3977          jmpq disp32   where disp32 is relative to the next insn
3978          ud2; ud2; ud2; ud2
3979        viz
3980          E9 <4 bytes == disp32>
3981          0F 0B 0F 0B 0F 0B 0F 0B
3982
3983      In both cases the replacement has the same length as the original.
3984      To remain sane & verifiable,
3985      (1) limit the displacement for the short form to
3986          (say) +/- one billion, so as to avoid wraparound
3987          off-by-ones
3988      (2) even if the short form is applicable, once every (say)
3989          1024 times use the long form anyway, so as to maintain
3990          verifiability
3991   */
3992   /* This is the delta we need to put into a JMP d32 insn.  It's
3993      relative to the start of the next insn, hence the -5.  */
3994   Long delta   = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
3995   Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
3996
3997   static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
3998   if (shortOK) {
3999      shortCTR++; // thread safety bleh
4000      if (0 == (shortCTR & 0x3FF)) {
4001         shortOK = False;
4002         if (0)
4003            vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
4004                       "using long jmp\n", shortCTR);
4005      }
4006   }
4007
4008   /* And make the modifications. */
4009   if (shortOK) {
4010      p[0]  = 0xE9;
4011      write_misaligned_UInt_LE(&p[1], (UInt)(Int)delta);
4012      p[5]  = 0x0F; p[6]  = 0x0B;
4013      p[7]  = 0x0F; p[8]  = 0x0B;
4014      p[9]  = 0x0F; p[10] = 0x0B;
4015      p[11] = 0x0F; p[12] = 0x0B;
4016      /* sanity check on the delta -- top 32 are all 0 or all 1 */
4017      delta >>= 32;
4018      vassert(delta == 0LL || delta == -1LL);
4019   } else {
4020      /* Minimal modifications from the starting sequence. */
4021      write_misaligned_ULong_LE(&p[2], (ULong)(Addr)place_to_jump_to);
4022      p[12] = 0xE3;
4023   }
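   /* Worked example (illustrative only): if place_to_chain were at 0x1000
      and place_to_jump_to at 0x2000, delta would be 0x2000 - 0x1000 - 5
      = 0xFFB, and the short form written above would be E9 FB 0F 00 00
      followed by four ud2s, still 13 bytes in total. */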
4024   VexInvalRange vir = { (HWord)place_to_chain, 13 };
4025   return vir;
4026}
4027
4028
4029/* NB: what goes on here has to be very closely coordinated with the
4030   emitInstr case for XDirect, above. */
4031VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
4032                                     void* place_to_unchain,
4033                                     const void* place_to_jump_to_EXPECTED,
4034                                     const void* disp_cp_chain_me )
4035{
4036   vassert(endness_host == VexEndnessLE);
4037
4038   /* What we're expecting to see is either:
4039        (general case)
4040          movabsq $place_to_jump_to_EXPECTED, %r11
4041          jmpq *%r11
4042        viz
4043          49 BB <8 bytes value == place_to_jump_to_EXPECTED>
4044          41 FF E3
4045      ---OR---
4046        in the case where the displacement falls within 32 bits
4047          jmpq d32
4048          ud2; ud2; ud2; ud2
4049        viz
4050          E9 <4 bytes == disp32>
4051          0F 0B 0F 0B 0F 0B 0F 0B
4052   */
4053   UChar* p     = (UChar*)place_to_unchain;
4054   Bool   valid = False;
4055   if (p[0] == 0x49 && p[1] == 0xBB
4056       && read_misaligned_ULong_LE(&p[2])
4057          == (ULong)(Addr)place_to_jump_to_EXPECTED
4058       && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
4059      /* it's the long form */
4060      valid = True;
4061   }
4062   else
4063   if (p[0] == 0xE9
4064       && p[5]  == 0x0F && p[6]  == 0x0B
4065       && p[7]  == 0x0F && p[8]  == 0x0B
4066       && p[9]  == 0x0F && p[10] == 0x0B
4067       && p[11] == 0x0F && p[12] == 0x0B) {
4068      /* It's the short form.  Check the offset is right. */
4069      Int  s32 = (Int)read_misaligned_UInt_LE(&p[1]);
4070      Long s64 = (Long)s32;
4071      if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
4072         valid = True;
4073         if (0)
4074            vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
4075      }
4076   }
4077   vassert(valid);
4078   /* And what we want to change it to is:
4079        movabsq $disp_cp_chain_me, %r11
4080        call *%r11
4081      viz
4082        49 BB <8 bytes value == disp_cp_chain_me>
4083        41 FF D3
4084      So it's the same length (convenient, huh).
4085   */
4086   p[0] = 0x49;
4087   p[1] = 0xBB;
4088   write_misaligned_ULong_LE(&p[2], (ULong)(Addr)disp_cp_chain_me);
4089   p[10] = 0x41;
4090   p[11] = 0xFF;
4091   p[12] = 0xD3;
4092   VexInvalRange vir = { (HWord)place_to_unchain, 13 };
4093   return vir;
4094}
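
/* A minimal usage sketch, kept under "#if 0" so it is not compiled: it
   shows how a caller might chain an XDirect exit to a destination and
   later undo it.  The helper name and its arguments are hypothetical;
   only the two functions above are real. */
#if 0
static void example_chain_roundtrip ( void* exit_point,
                                       const void* chain_me_stub,
                                       const void* dest )
{
   /* Rewrite the movabsq/call pair at |exit_point| so it jumps to |dest|. */
   VexInvalRange r1 = chainXDirect_AMD64(VexEndnessLE, exit_point,
                                         chain_me_stub, dest);
   /* ... run the now-chained code ... */
   /* Put the original chain-me request back. */
   VexInvalRange r2 = unchainXDirect_AMD64(VexEndnessLE, exit_point,
                                           dest, chain_me_stub);
   /* Both patches cover the same 13-byte movabsq/branch pair. */
   vassert(r1.len == 13 && r2.len == 13);
}
#endif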
4095
4096
4097/* Patch the counter address into a profile inc point, as previously
4098   created by the Ain_ProfInc case for emit_AMD64Instr. */
4099VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
4100                                   void*  place_to_patch,
4101                                   const ULong* location_of_counter )
4102{
4103   vassert(endness_host == VexEndnessLE);
4104   vassert(sizeof(ULong*) == 8);
4105   UChar* p = (UChar*)place_to_patch;
4106   vassert(p[0] == 0x49);
4107   vassert(p[1] == 0xBB);
4108   vassert(p[2] == 0x00);
4109   vassert(p[3] == 0x00);
4110   vassert(p[4] == 0x00);
4111   vassert(p[5] == 0x00);
4112   vassert(p[6] == 0x00);
4113   vassert(p[7] == 0x00);
4114   vassert(p[8] == 0x00);
4115   vassert(p[9] == 0x00);
4116   vassert(p[10] == 0x49);
4117   vassert(p[11] == 0xFF);
4118   vassert(p[12] == 0x03);
4119   ULong imm64 = (ULong)(Addr)location_of_counter;
4120   p[2] = imm64 & 0xFF; imm64 >>= 8;
4121   p[3] = imm64 & 0xFF; imm64 >>= 8;
4122   p[4] = imm64 & 0xFF; imm64 >>= 8;
4123   p[5] = imm64 & 0xFF; imm64 >>= 8;
4124   p[6] = imm64 & 0xFF; imm64 >>= 8;
4125   p[7] = imm64 & 0xFF; imm64 >>= 8;
4126   p[8] = imm64 & 0xFF; imm64 >>= 8;
4127   p[9] = imm64 & 0xFF; imm64 >>= 8;
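   /* The eight stores above lay the original |imm64| out little-endian in
      p[2..9]; an equivalent formulation (sketch only) would be
      write_misaligned_ULong_LE(&p[2], (ULong)(Addr)location_of_counter). */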
4128   VexInvalRange vir = { (HWord)place_to_patch, 13 };
4129   return vir;
4130}
4131
4132
4133/*---------------------------------------------------------------*/
4134/*--- end                                   host_amd64_defs.c ---*/
4135/*---------------------------------------------------------------*/
4136