1
2/*---------------------------------------------------------------*/
3/*--- begin                                   host_x86_defs.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2013 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex.h"
38#include "libvex_trc_values.h"
39
40#include "main_util.h"
41#include "host_generic_regs.h"
42#include "host_x86_defs.h"
43
44
45/* --------- Registers. --------- */
46
47void ppHRegX86 ( HReg reg )
48{
49   Int r;
50   static const HChar* ireg32_names[8]
51     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" };
52   /* Be generic for all virtual regs. */
53   if (hregIsVirtual(reg)) {
54      ppHReg(reg);
55      return;
56   }
57   /* But specific for real regs. */
58   switch (hregClass(reg)) {
59      case HRcInt32:
60         r = hregNumber(reg);
61         vassert(r >= 0 && r < 8);
62         vex_printf("%s", ireg32_names[r]);
63         return;
64      case HRcFlt64:
65         r = hregNumber(reg);
66         vassert(r >= 0 && r < 6);
67         vex_printf("%%fake%d", r);
68         return;
69      case HRcVec128:
70         r = hregNumber(reg);
71         vassert(r >= 0 && r < 8);
72         vex_printf("%%xmm%d", r);
73         return;
74      default:
75         vpanic("ppHRegX86");
76   }
77}
78
79HReg hregX86_EAX ( void ) { return mkHReg(0, HRcInt32, False); }
80HReg hregX86_ECX ( void ) { return mkHReg(1, HRcInt32, False); }
81HReg hregX86_EDX ( void ) { return mkHReg(2, HRcInt32, False); }
82HReg hregX86_EBX ( void ) { return mkHReg(3, HRcInt32, False); }
83HReg hregX86_ESP ( void ) { return mkHReg(4, HRcInt32, False); }
84HReg hregX86_EBP ( void ) { return mkHReg(5, HRcInt32, False); }
85HReg hregX86_ESI ( void ) { return mkHReg(6, HRcInt32, False); }
86HReg hregX86_EDI ( void ) { return mkHReg(7, HRcInt32, False); }
87
88HReg hregX86_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
89HReg hregX86_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
90HReg hregX86_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
91HReg hregX86_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
92HReg hregX86_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
93HReg hregX86_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
94
95HReg hregX86_XMM0 ( void ) { return mkHReg(0, HRcVec128, False); }
96HReg hregX86_XMM1 ( void ) { return mkHReg(1, HRcVec128, False); }
97HReg hregX86_XMM2 ( void ) { return mkHReg(2, HRcVec128, False); }
98HReg hregX86_XMM3 ( void ) { return mkHReg(3, HRcVec128, False); }
99HReg hregX86_XMM4 ( void ) { return mkHReg(4, HRcVec128, False); }
100HReg hregX86_XMM5 ( void ) { return mkHReg(5, HRcVec128, False); }
101HReg hregX86_XMM6 ( void ) { return mkHReg(6, HRcVec128, False); }
102HReg hregX86_XMM7 ( void ) { return mkHReg(7, HRcVec128, False); }
103
104
105void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
106{
107   *nregs = 20;
108   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
109   (*arr)[0] = hregX86_EAX();
110   (*arr)[1] = hregX86_EBX();
111   (*arr)[2] = hregX86_ECX();
112   (*arr)[3] = hregX86_EDX();
113   (*arr)[4] = hregX86_ESI();
114   (*arr)[5] = hregX86_EDI();
115   (*arr)[6] = hregX86_FAKE0();
116   (*arr)[7] = hregX86_FAKE1();
117   (*arr)[8] = hregX86_FAKE2();
118   (*arr)[9] = hregX86_FAKE3();
119   (*arr)[10] = hregX86_FAKE4();
120   (*arr)[11] = hregX86_FAKE5();
121   (*arr)[12] = hregX86_XMM0();
122   (*arr)[13] = hregX86_XMM1();
123   (*arr)[14] = hregX86_XMM2();
124   (*arr)[15] = hregX86_XMM3();
125   (*arr)[16] = hregX86_XMM4();
126   (*arr)[17] = hregX86_XMM5();
127   (*arr)[18] = hregX86_XMM6();
128   (*arr)[19] = hregX86_XMM7();
129}
130
131
132/* --------- Condition codes, Intel encoding. --------- */
133
134const HChar* showX86CondCode ( X86CondCode cond )
135{
136   switch (cond) {
137      case Xcc_O:      return "o";
138      case Xcc_NO:     return "no";
139      case Xcc_B:      return "b";
140      case Xcc_NB:     return "nb";
141      case Xcc_Z:      return "z";
142      case Xcc_NZ:     return "nz";
143      case Xcc_BE:     return "be";
144      case Xcc_NBE:    return "nbe";
145      case Xcc_S:      return "s";
146      case Xcc_NS:     return "ns";
147      case Xcc_P:      return "p";
148      case Xcc_NP:     return "np";
149      case Xcc_L:      return "l";
150      case Xcc_NL:     return "nl";
151      case Xcc_LE:     return "le";
152      case Xcc_NLE:    return "nle";
153      case Xcc_ALWAYS: return "ALWAYS";
154      default: vpanic("ppX86CondCode");
155   }
156}
157
158
159/* --------- X86AMode: memory address expressions. --------- */
160
161X86AMode* X86AMode_IR ( UInt imm32, HReg reg ) {
162   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
163   am->tag = Xam_IR;
164   am->Xam.IR.imm = imm32;
165   am->Xam.IR.reg = reg;
166   return am;
167}
168X86AMode* X86AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
169   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
170   am->tag = Xam_IRRS;
171   am->Xam.IRRS.imm = imm32;
172   am->Xam.IRRS.base = base;
173   am->Xam.IRRS.index = indEx;
174   am->Xam.IRRS.shift = shift;
175   vassert(shift >= 0 && shift <= 3);
176   return am;
177}
178
179X86AMode* dopyX86AMode ( X86AMode* am ) {
180   switch (am->tag) {
181      case Xam_IR:
182         return X86AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
183      case Xam_IRRS:
184         return X86AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
185                               am->Xam.IRRS.index, am->Xam.IRRS.shift );
186      default:
187         vpanic("dopyX86AMode");
188   }
189}
190
191void ppX86AMode ( X86AMode* am ) {
192   switch (am->tag) {
193      case Xam_IR:
194         if (am->Xam.IR.imm == 0)
195            vex_printf("(");
196         else
197            vex_printf("0x%x(", am->Xam.IR.imm);
198         ppHRegX86(am->Xam.IR.reg);
199         vex_printf(")");
200         return;
201      case Xam_IRRS:
202         vex_printf("0x%x(", am->Xam.IRRS.imm);
203         ppHRegX86(am->Xam.IRRS.base);
204         vex_printf(",");
205         ppHRegX86(am->Xam.IRRS.index);
206         vex_printf(",%d)", 1 << am->Xam.IRRS.shift);
207         return;
208      default:
209         vpanic("ppX86AMode");
210   }
211}
212
213static void addRegUsage_X86AMode ( HRegUsage* u, X86AMode* am ) {
214   switch (am->tag) {
215      case Xam_IR:
216         addHRegUse(u, HRmRead, am->Xam.IR.reg);
217         return;
218      case Xam_IRRS:
219         addHRegUse(u, HRmRead, am->Xam.IRRS.base);
220         addHRegUse(u, HRmRead, am->Xam.IRRS.index);
221         return;
222      default:
223         vpanic("addRegUsage_X86AMode");
224   }
225}
226
227static void mapRegs_X86AMode ( HRegRemap* m, X86AMode* am ) {
228   switch (am->tag) {
229      case Xam_IR:
230         am->Xam.IR.reg = lookupHRegRemap(m, am->Xam.IR.reg);
231         return;
232      case Xam_IRRS:
233         am->Xam.IRRS.base = lookupHRegRemap(m, am->Xam.IRRS.base);
234         am->Xam.IRRS.index = lookupHRegRemap(m, am->Xam.IRRS.index);
235         return;
236      default:
237         vpanic("mapRegs_X86AMode");
238   }
239}
240
241/* --------- Operand, which can be reg, immediate or memory. --------- */
242
243X86RMI* X86RMI_Imm ( UInt imm32 ) {
244   X86RMI* op         = LibVEX_Alloc(sizeof(X86RMI));
245   op->tag            = Xrmi_Imm;
246   op->Xrmi.Imm.imm32 = imm32;
247   return op;
248}
249X86RMI* X86RMI_Reg ( HReg reg ) {
250   X86RMI* op       = LibVEX_Alloc(sizeof(X86RMI));
251   op->tag          = Xrmi_Reg;
252   op->Xrmi.Reg.reg = reg;
253   return op;
254}
255X86RMI* X86RMI_Mem ( X86AMode* am ) {
256   X86RMI* op      = LibVEX_Alloc(sizeof(X86RMI));
257   op->tag         = Xrmi_Mem;
258   op->Xrmi.Mem.am = am;
259   return op;
260}
261
262void ppX86RMI ( X86RMI* op ) {
263   switch (op->tag) {
264      case Xrmi_Imm:
265         vex_printf("$0x%x", op->Xrmi.Imm.imm32);
266         return;
267      case Xrmi_Reg:
268         ppHRegX86(op->Xrmi.Reg.reg);
269         return;
270      case Xrmi_Mem:
271         ppX86AMode(op->Xrmi.Mem.am);
272         return;
273     default:
274         vpanic("ppX86RMI");
275   }
276}
277
278/* An X86RMI can only be used in a "read" context (what would it mean
279   to write or modify a literal?) and so we enumerate its registers
280   accordingly. */
281static void addRegUsage_X86RMI ( HRegUsage* u, X86RMI* op ) {
282   switch (op->tag) {
283      case Xrmi_Imm:
284         return;
285      case Xrmi_Reg:
286         addHRegUse(u, HRmRead, op->Xrmi.Reg.reg);
287         return;
288      case Xrmi_Mem:
289         addRegUsage_X86AMode(u, op->Xrmi.Mem.am);
290         return;
291      default:
292         vpanic("addRegUsage_X86RMI");
293   }
294}
295
296static void mapRegs_X86RMI ( HRegRemap* m, X86RMI* op ) {
297   switch (op->tag) {
298      case Xrmi_Imm:
299         return;
300      case Xrmi_Reg:
301         op->Xrmi.Reg.reg = lookupHRegRemap(m, op->Xrmi.Reg.reg);
302         return;
303      case Xrmi_Mem:
304         mapRegs_X86AMode(m, op->Xrmi.Mem.am);
305         return;
306      default:
307         vpanic("mapRegs_X86RMI");
308   }
309}
310
311
312/* --------- Operand, which can be reg or immediate only. --------- */
313
314X86RI* X86RI_Imm ( UInt imm32 ) {
315   X86RI* op         = LibVEX_Alloc(sizeof(X86RI));
316   op->tag           = Xri_Imm;
317   op->Xri.Imm.imm32 = imm32;
318   return op;
319}
320X86RI* X86RI_Reg ( HReg reg ) {
321   X86RI* op       = LibVEX_Alloc(sizeof(X86RI));
322   op->tag         = Xri_Reg;
323   op->Xri.Reg.reg = reg;
324   return op;
325}
326
327void ppX86RI ( X86RI* op ) {
328   switch (op->tag) {
329      case Xri_Imm:
330         vex_printf("$0x%x", op->Xri.Imm.imm32);
331         return;
332      case Xri_Reg:
333         ppHRegX86(op->Xri.Reg.reg);
334         return;
335     default:
336         vpanic("ppX86RI");
337   }
338}
339
340/* An X86RI can only be used in a "read" context (what would it mean
341   to write or modify a literal?) and so we enumerate its registers
342   accordingly. */
343static void addRegUsage_X86RI ( HRegUsage* u, X86RI* op ) {
344   switch (op->tag) {
345      case Xri_Imm:
346         return;
347      case Xri_Reg:
348         addHRegUse(u, HRmRead, op->Xri.Reg.reg);
349         return;
350      default:
351         vpanic("addRegUsage_X86RI");
352   }
353}
354
355static void mapRegs_X86RI ( HRegRemap* m, X86RI* op ) {
356   switch (op->tag) {
357      case Xri_Imm:
358         return;
359      case Xri_Reg:
360         op->Xri.Reg.reg = lookupHRegRemap(m, op->Xri.Reg.reg);
361         return;
362      default:
363         vpanic("mapRegs_X86RI");
364   }
365}
366
367
368/* --------- Operand, which can be reg or memory only. --------- */
369
370X86RM* X86RM_Reg ( HReg reg ) {
371   X86RM* op       = LibVEX_Alloc(sizeof(X86RM));
372   op->tag         = Xrm_Reg;
373   op->Xrm.Reg.reg = reg;
374   return op;
375}
376X86RM* X86RM_Mem ( X86AMode* am ) {
377   X86RM* op      = LibVEX_Alloc(sizeof(X86RM));
378   op->tag        = Xrm_Mem;
379   op->Xrm.Mem.am = am;
380   return op;
381}
382
383void ppX86RM ( X86RM* op ) {
384   switch (op->tag) {
385      case Xrm_Mem:
386         ppX86AMode(op->Xrm.Mem.am);
387         return;
388      case Xrm_Reg:
389         ppHRegX86(op->Xrm.Reg.reg);
390         return;
391     default:
392         vpanic("ppX86RM");
393   }
394}
395
396/* Because an X86RM can be both a source or destination operand, we
397   have to supply a mode -- pertaining to the operand as a whole --
398   indicating how it's being used. */
399static void addRegUsage_X86RM ( HRegUsage* u, X86RM* op, HRegMode mode ) {
400   switch (op->tag) {
401      case Xrm_Mem:
402         /* Memory is read, written or modified.  So we just want to
403            know the regs read by the amode. */
404         addRegUsage_X86AMode(u, op->Xrm.Mem.am);
405         return;
406      case Xrm_Reg:
407         /* reg is read, written or modified.  Add it in the
408            appropriate way. */
409         addHRegUse(u, mode, op->Xrm.Reg.reg);
410         return;
411     default:
412         vpanic("addRegUsage_X86RM");
413   }
414}
415
416static void mapRegs_X86RM ( HRegRemap* m, X86RM* op )
417{
418   switch (op->tag) {
419      case Xrm_Mem:
420         mapRegs_X86AMode(m, op->Xrm.Mem.am);
421         return;
422      case Xrm_Reg:
423         op->Xrm.Reg.reg = lookupHRegRemap(m, op->Xrm.Reg.reg);
424         return;
425     default:
426         vpanic("mapRegs_X86RM");
427   }
428}
429
430
431/* --------- Instructions. --------- */
432
433const HChar* showX86UnaryOp ( X86UnaryOp op ) {
434   switch (op) {
435      case Xun_NOT: return "not";
436      case Xun_NEG: return "neg";
437      default: vpanic("showX86UnaryOp");
438   }
439}
440
441const HChar* showX86AluOp ( X86AluOp op ) {
442   switch (op) {
443      case Xalu_MOV:  return "mov";
444      case Xalu_CMP:  return "cmp";
445      case Xalu_ADD:  return "add";
446      case Xalu_SUB:  return "sub";
447      case Xalu_ADC:  return "adc";
448      case Xalu_SBB:  return "sbb";
449      case Xalu_AND:  return "and";
450      case Xalu_OR:   return "or";
451      case Xalu_XOR:  return "xor";
452      case Xalu_MUL:  return "mul";
453      default: vpanic("showX86AluOp");
454   }
455}
456
457const HChar* showX86ShiftOp ( X86ShiftOp op ) {
458   switch (op) {
459      case Xsh_SHL: return "shl";
460      case Xsh_SHR: return "shr";
461      case Xsh_SAR: return "sar";
462      default: vpanic("showX86ShiftOp");
463   }
464}
465
466const HChar* showX86FpOp ( X86FpOp op ) {
467   switch (op) {
468      case Xfp_ADD:    return "add";
469      case Xfp_SUB:    return "sub";
470      case Xfp_MUL:    return "mul";
471      case Xfp_DIV:    return "div";
472      case Xfp_SCALE:  return "scale";
473      case Xfp_ATAN:   return "atan";
474      case Xfp_YL2X:   return "yl2x";
475      case Xfp_YL2XP1: return "yl2xp1";
476      case Xfp_PREM:   return "prem";
477      case Xfp_PREM1:  return "prem1";
478      case Xfp_SQRT:   return "sqrt";
479      case Xfp_ABS:    return "abs";
480      case Xfp_NEG:    return "chs";
481      case Xfp_MOV:    return "mov";
482      case Xfp_SIN:    return "sin";
483      case Xfp_COS:    return "cos";
484      case Xfp_TAN:    return "tan";
485      case Xfp_ROUND:  return "round";
486      case Xfp_2XM1:   return "2xm1";
487      default: vpanic("showX86FpOp");
488   }
489}
490
491const HChar* showX86SseOp ( X86SseOp op ) {
492   switch (op) {
493      case Xsse_MOV:      return "mov(?!)";
494      case Xsse_ADDF:     return "add";
495      case Xsse_SUBF:     return "sub";
496      case Xsse_MULF:     return "mul";
497      case Xsse_DIVF:     return "div";
498      case Xsse_MAXF:     return "max";
499      case Xsse_MINF:     return "min";
500      case Xsse_CMPEQF:   return "cmpFeq";
501      case Xsse_CMPLTF:   return "cmpFlt";
502      case Xsse_CMPLEF:   return "cmpFle";
503      case Xsse_CMPUNF:   return "cmpFun";
504      case Xsse_RCPF:     return "rcp";
505      case Xsse_RSQRTF:   return "rsqrt";
506      case Xsse_SQRTF:    return "sqrt";
507      case Xsse_AND:      return "and";
508      case Xsse_OR:       return "or";
509      case Xsse_XOR:      return "xor";
510      case Xsse_ANDN:     return "andn";
511      case Xsse_ADD8:     return "paddb";
512      case Xsse_ADD16:    return "paddw";
513      case Xsse_ADD32:    return "paddd";
514      case Xsse_ADD64:    return "paddq";
515      case Xsse_QADD8U:   return "paddusb";
516      case Xsse_QADD16U:  return "paddusw";
517      case Xsse_QADD8S:   return "paddsb";
518      case Xsse_QADD16S:  return "paddsw";
519      case Xsse_SUB8:     return "psubb";
520      case Xsse_SUB16:    return "psubw";
521      case Xsse_SUB32:    return "psubd";
522      case Xsse_SUB64:    return "psubq";
523      case Xsse_QSUB8U:   return "psubusb";
524      case Xsse_QSUB16U:  return "psubusw";
525      case Xsse_QSUB8S:   return "psubsb";
526      case Xsse_QSUB16S:  return "psubsw";
527      case Xsse_MUL16:    return "pmullw";
528      case Xsse_MULHI16U: return "pmulhuw";
529      case Xsse_MULHI16S: return "pmulhw";
530      case Xsse_AVG8U:    return "pavgb";
531      case Xsse_AVG16U:   return "pavgw";
532      case Xsse_MAX16S:   return "pmaxw";
533      case Xsse_MAX8U:    return "pmaxub";
534      case Xsse_MIN16S:   return "pminw";
535      case Xsse_MIN8U:    return "pminub";
536      case Xsse_CMPEQ8:   return "pcmpeqb";
537      case Xsse_CMPEQ16:  return "pcmpeqw";
538      case Xsse_CMPEQ32:  return "pcmpeqd";
539      case Xsse_CMPGT8S:  return "pcmpgtb";
540      case Xsse_CMPGT16S: return "pcmpgtw";
541      case Xsse_CMPGT32S: return "pcmpgtd";
542      case Xsse_SHL16:    return "psllw";
543      case Xsse_SHL32:    return "pslld";
544      case Xsse_SHL64:    return "psllq";
545      case Xsse_SHR16:    return "psrlw";
546      case Xsse_SHR32:    return "psrld";
547      case Xsse_SHR64:    return "psrlq";
548      case Xsse_SAR16:    return "psraw";
549      case Xsse_SAR32:    return "psrad";
550      case Xsse_PACKSSD:  return "packssdw";
551      case Xsse_PACKSSW:  return "packsswb";
552      case Xsse_PACKUSW:  return "packuswb";
553      case Xsse_UNPCKHB:  return "punpckhb";
554      case Xsse_UNPCKHW:  return "punpckhw";
555      case Xsse_UNPCKHD:  return "punpckhd";
556      case Xsse_UNPCKHQ:  return "punpckhq";
557      case Xsse_UNPCKLB:  return "punpcklb";
558      case Xsse_UNPCKLW:  return "punpcklw";
559      case Xsse_UNPCKLD:  return "punpckld";
560      case Xsse_UNPCKLQ:  return "punpcklq";
561      default: vpanic("showX86SseOp");
562   }
563}
564
565X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
566   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
567   i->tag            = Xin_Alu32R;
568   i->Xin.Alu32R.op  = op;
569   i->Xin.Alu32R.src = src;
570   i->Xin.Alu32R.dst = dst;
571   return i;
572}
573X86Instr* X86Instr_Alu32M ( X86AluOp op, X86RI* src, X86AMode* dst ) {
574   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
575   i->tag            = Xin_Alu32M;
576   i->Xin.Alu32M.op  = op;
577   i->Xin.Alu32M.src = src;
578   i->Xin.Alu32M.dst = dst;
579   vassert(op != Xalu_MUL);
580   return i;
581}
582X86Instr* X86Instr_Sh32 ( X86ShiftOp op, UInt src, HReg dst ) {
583   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
584   i->tag          = Xin_Sh32;
585   i->Xin.Sh32.op  = op;
586   i->Xin.Sh32.src = src;
587   i->Xin.Sh32.dst = dst;
588   return i;
589}
590X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst ) {
591   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
592   i->tag              = Xin_Test32;
593   i->Xin.Test32.imm32 = imm32;
594   i->Xin.Test32.dst   = dst;
595   return i;
596}
597X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst ) {
598   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
599   i->tag             = Xin_Unary32;
600   i->Xin.Unary32.op  = op;
601   i->Xin.Unary32.dst = dst;
602   return i;
603}
604X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst ) {
605   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
606   i->tag             = Xin_Lea32;
607   i->Xin.Lea32.am    = am;
608   i->Xin.Lea32.dst   = dst;
609   return i;
610}
611X86Instr* X86Instr_MulL ( Bool syned, X86RM* src ) {
612   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
613   i->tag             = Xin_MulL;
614   i->Xin.MulL.syned  = syned;
615   i->Xin.MulL.src    = src;
616   return i;
617}
618X86Instr* X86Instr_Div ( Bool syned, X86RM* src ) {
619   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
620   i->tag           = Xin_Div;
621   i->Xin.Div.syned = syned;
622   i->Xin.Div.src   = src;
623   return i;
624}
625X86Instr* X86Instr_Sh3232  ( X86ShiftOp op, UInt amt, HReg src, HReg dst ) {
626   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
627   i->tag            = Xin_Sh3232;
628   i->Xin.Sh3232.op  = op;
629   i->Xin.Sh3232.amt = amt;
630   i->Xin.Sh3232.src = src;
631   i->Xin.Sh3232.dst = dst;
632   vassert(op == Xsh_SHL || op == Xsh_SHR);
633   return i;
634}
635X86Instr* X86Instr_Push( X86RMI* src ) {
636   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
637   i->tag          = Xin_Push;
638   i->Xin.Push.src = src;
639   return i;
640}
641X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms,
642                          RetLoc rloc ) {
643   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
644   i->tag               = Xin_Call;
645   i->Xin.Call.cond     = cond;
646   i->Xin.Call.target   = target;
647   i->Xin.Call.regparms = regparms;
648   i->Xin.Call.rloc     = rloc;
649   vassert(regparms >= 0 && regparms <= 3);
650   vassert(is_sane_RetLoc(rloc));
651   return i;
652}
653X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP,
654                             X86CondCode cond, Bool toFastEP ) {
655   X86Instr* i             = LibVEX_Alloc(sizeof(X86Instr));
656   i->tag                  = Xin_XDirect;
657   i->Xin.XDirect.dstGA    = dstGA;
658   i->Xin.XDirect.amEIP    = amEIP;
659   i->Xin.XDirect.cond     = cond;
660   i->Xin.XDirect.toFastEP = toFastEP;
661   return i;
662}
663X86Instr* X86Instr_XIndir ( HReg dstGA, X86AMode* amEIP,
664                            X86CondCode cond ) {
665   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
666   i->tag              = Xin_XIndir;
667   i->Xin.XIndir.dstGA = dstGA;
668   i->Xin.XIndir.amEIP = amEIP;
669   i->Xin.XIndir.cond  = cond;
670   return i;
671}
672X86Instr* X86Instr_XAssisted ( HReg dstGA, X86AMode* amEIP,
673                               X86CondCode cond, IRJumpKind jk ) {
674   X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
675   i->tag                 = Xin_XAssisted;
676   i->Xin.XAssisted.dstGA = dstGA;
677   i->Xin.XAssisted.amEIP = amEIP;
678   i->Xin.XAssisted.cond  = cond;
679   i->Xin.XAssisted.jk    = jk;
680   return i;
681}
682X86Instr* X86Instr_CMov32  ( X86CondCode cond, X86RM* src, HReg dst ) {
683   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
684   i->tag             = Xin_CMov32;
685   i->Xin.CMov32.cond = cond;
686   i->Xin.CMov32.src  = src;
687   i->Xin.CMov32.dst  = dst;
688   vassert(cond != Xcc_ALWAYS);
689   return i;
690}
691X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
692                            X86AMode* src, HReg dst ) {
693   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
694   i->tag                = Xin_LoadEX;
695   i->Xin.LoadEX.szSmall = szSmall;
696   i->Xin.LoadEX.syned   = syned;
697   i->Xin.LoadEX.src     = src;
698   i->Xin.LoadEX.dst     = dst;
699   vassert(szSmall == 1 || szSmall == 2);
700   return i;
701}
702X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
703   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
704   i->tag           = Xin_Store;
705   i->Xin.Store.sz  = sz;
706   i->Xin.Store.src = src;
707   i->Xin.Store.dst = dst;
708   vassert(sz == 1 || sz == 2);
709   return i;
710}
711X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst ) {
712   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
713   i->tag            = Xin_Set32;
714   i->Xin.Set32.cond = cond;
715   i->Xin.Set32.dst  = dst;
716   return i;
717}
718X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst ) {
719   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
720   i->tag               = Xin_Bsfr32;
721   i->Xin.Bsfr32.isFwds = isFwds;
722   i->Xin.Bsfr32.src    = src;
723   i->Xin.Bsfr32.dst    = dst;
724   return i;
725}
726X86Instr* X86Instr_MFence ( UInt hwcaps ) {
727   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
728   i->tag               = Xin_MFence;
729   i->Xin.MFence.hwcaps = hwcaps;
730   vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_MMXEXT
731                            |VEX_HWCAPS_X86_SSE1
732                            |VEX_HWCAPS_X86_SSE2
733                            |VEX_HWCAPS_X86_SSE3
734                            |VEX_HWCAPS_X86_LZCNT)));
735   return i;
736}
737X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz ) {
738   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
739   i->tag           = Xin_ACAS;
740   i->Xin.ACAS.addr = addr;
741   i->Xin.ACAS.sz   = sz;
742   vassert(sz == 4 || sz == 2 || sz == 1);
743   return i;
744}
745X86Instr* X86Instr_DACAS ( X86AMode* addr ) {
746   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
747   i->tag            = Xin_DACAS;
748   i->Xin.DACAS.addr = addr;
749   return i;
750}
751
752X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
753   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
754   i->tag             = Xin_FpUnary;
755   i->Xin.FpUnary.op  = op;
756   i->Xin.FpUnary.src = src;
757   i->Xin.FpUnary.dst = dst;
758   return i;
759}
760X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
761   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
762   i->tag               = Xin_FpBinary;
763   i->Xin.FpBinary.op   = op;
764   i->Xin.FpBinary.srcL = srcL;
765   i->Xin.FpBinary.srcR = srcR;
766   i->Xin.FpBinary.dst  = dst;
767   return i;
768}
769X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
770   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
771   i->tag               = Xin_FpLdSt;
772   i->Xin.FpLdSt.isLoad = isLoad;
773   i->Xin.FpLdSt.sz     = sz;
774   i->Xin.FpLdSt.reg    = reg;
775   i->Xin.FpLdSt.addr   = addr;
776   vassert(sz == 4 || sz == 8 || sz == 10);
777   return i;
778}
779X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
780                             HReg reg, X86AMode* addr ) {
781   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
782   i->tag                = Xin_FpLdStI;
783   i->Xin.FpLdStI.isLoad = isLoad;
784   i->Xin.FpLdStI.sz     = sz;
785   i->Xin.FpLdStI.reg    = reg;
786   i->Xin.FpLdStI.addr   = addr;
787   vassert(sz == 2 || sz == 4 || sz == 8);
788   return i;
789}
790X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst ) {
791   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
792   i->tag              = Xin_Fp64to32;
793   i->Xin.Fp64to32.src = src;
794   i->Xin.Fp64to32.dst = dst;
795   return i;
796}
797X86Instr* X86Instr_FpCMov ( X86CondCode cond, HReg src, HReg dst ) {
798   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
799   i->tag             = Xin_FpCMov;
800   i->Xin.FpCMov.cond = cond;
801   i->Xin.FpCMov.src  = src;
802   i->Xin.FpCMov.dst  = dst;
803   vassert(cond != Xcc_ALWAYS);
804   return i;
805}
806X86Instr* X86Instr_FpLdCW ( X86AMode* addr ) {
807   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
808   i->tag               = Xin_FpLdCW;
809   i->Xin.FpLdCW.addr   = addr;
810   return i;
811}
812X86Instr* X86Instr_FpStSW_AX ( void ) {
813   X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
814   i->tag      = Xin_FpStSW_AX;
815   return i;
816}
817X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst ) {
818   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
819   i->tag            = Xin_FpCmp;
820   i->Xin.FpCmp.srcL = srcL;
821   i->Xin.FpCmp.srcR = srcR;
822   i->Xin.FpCmp.dst  = dst;
823   return i;
824}
825X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
826   X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
827   i->tag                 = Xin_SseConst;
828   i->Xin.SseConst.con    = con;
829   i->Xin.SseConst.dst    = dst;
830   vassert(hregClass(dst) == HRcVec128);
831   return i;
832}
833X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg reg, X86AMode* addr ) {
834   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
835   i->tag                = Xin_SseLdSt;
836   i->Xin.SseLdSt.isLoad = isLoad;
837   i->Xin.SseLdSt.reg    = reg;
838   i->Xin.SseLdSt.addr   = addr;
839   return i;
840}
841X86Instr* X86Instr_SseLdzLO  ( Int sz, HReg reg, X86AMode* addr )
842{
843   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
844   i->tag                = Xin_SseLdzLO;
845   i->Xin.SseLdzLO.sz    = toUChar(sz);
846   i->Xin.SseLdzLO.reg   = reg;
847   i->Xin.SseLdzLO.addr  = addr;
848   vassert(sz == 4 || sz == 8);
849   return i;
850}
851X86Instr* X86Instr_Sse32Fx4 ( X86SseOp op, HReg src, HReg dst ) {
852   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
853   i->tag              = Xin_Sse32Fx4;
854   i->Xin.Sse32Fx4.op  = op;
855   i->Xin.Sse32Fx4.src = src;
856   i->Xin.Sse32Fx4.dst = dst;
857   vassert(op != Xsse_MOV);
858   return i;
859}
860X86Instr* X86Instr_Sse32FLo ( X86SseOp op, HReg src, HReg dst ) {
861   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
862   i->tag              = Xin_Sse32FLo;
863   i->Xin.Sse32FLo.op  = op;
864   i->Xin.Sse32FLo.src = src;
865   i->Xin.Sse32FLo.dst = dst;
866   vassert(op != Xsse_MOV);
867   return i;
868}
869X86Instr* X86Instr_Sse64Fx2 ( X86SseOp op, HReg src, HReg dst ) {
870   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
871   i->tag              = Xin_Sse64Fx2;
872   i->Xin.Sse64Fx2.op  = op;
873   i->Xin.Sse64Fx2.src = src;
874   i->Xin.Sse64Fx2.dst = dst;
875   vassert(op != Xsse_MOV);
876   return i;
877}
878X86Instr* X86Instr_Sse64FLo ( X86SseOp op, HReg src, HReg dst ) {
879   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
880   i->tag              = Xin_Sse64FLo;
881   i->Xin.Sse64FLo.op  = op;
882   i->Xin.Sse64FLo.src = src;
883   i->Xin.Sse64FLo.dst = dst;
884   vassert(op != Xsse_MOV);
885   return i;
886}
887X86Instr* X86Instr_SseReRg ( X86SseOp op, HReg re, HReg rg ) {
888   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
889   i->tag             = Xin_SseReRg;
890   i->Xin.SseReRg.op  = op;
891   i->Xin.SseReRg.src = re;
892   i->Xin.SseReRg.dst = rg;
893   return i;
894}
895X86Instr* X86Instr_SseCMov ( X86CondCode cond, HReg src, HReg dst ) {
896   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
897   i->tag              = Xin_SseCMov;
898   i->Xin.SseCMov.cond = cond;
899   i->Xin.SseCMov.src  = src;
900   i->Xin.SseCMov.dst  = dst;
901   vassert(cond != Xcc_ALWAYS);
902   return i;
903}
904X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst ) {
905   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
906   i->tag               = Xin_SseShuf;
907   i->Xin.SseShuf.order = order;
908   i->Xin.SseShuf.src   = src;
909   i->Xin.SseShuf.dst   = dst;
910   vassert(order >= 0 && order <= 0xFF);
911   return i;
912}
913X86Instr* X86Instr_EvCheck ( X86AMode* amCounter,
914                             X86AMode* amFailAddr ) {
915   X86Instr* i               = LibVEX_Alloc(sizeof(X86Instr));
916   i->tag                    = Xin_EvCheck;
917   i->Xin.EvCheck.amCounter  = amCounter;
918   i->Xin.EvCheck.amFailAddr = amFailAddr;
919   return i;
920}
921X86Instr* X86Instr_ProfInc ( void ) {
922   X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
923   i->tag      = Xin_ProfInc;
924   return i;
925}
926
927void ppX86Instr ( X86Instr* i, Bool mode64 ) {
928   vassert(mode64 == False);
929   switch (i->tag) {
930      case Xin_Alu32R:
931         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32R.op));
932         ppX86RMI(i->Xin.Alu32R.src);
933         vex_printf(",");
934         ppHRegX86(i->Xin.Alu32R.dst);
935         return;
936      case Xin_Alu32M:
937         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32M.op));
938         ppX86RI(i->Xin.Alu32M.src);
939         vex_printf(",");
940         ppX86AMode(i->Xin.Alu32M.dst);
941         return;
942      case Xin_Sh32:
943         vex_printf("%sl ", showX86ShiftOp(i->Xin.Sh32.op));
944         if (i->Xin.Sh32.src == 0)
945           vex_printf("%%cl,");
946         else
947            vex_printf("$%d,", (Int)i->Xin.Sh32.src);
948         ppHRegX86(i->Xin.Sh32.dst);
949         return;
950      case Xin_Test32:
951         vex_printf("testl $%d,", (Int)i->Xin.Test32.imm32);
952         ppX86RM(i->Xin.Test32.dst);
953         return;
954      case Xin_Unary32:
955         vex_printf("%sl ", showX86UnaryOp(i->Xin.Unary32.op));
956         ppHRegX86(i->Xin.Unary32.dst);
957         return;
958      case Xin_Lea32:
959         vex_printf("leal ");
960         ppX86AMode(i->Xin.Lea32.am);
961         vex_printf(",");
962         ppHRegX86(i->Xin.Lea32.dst);
963         return;
964      case Xin_MulL:
965         vex_printf("%cmull ", i->Xin.MulL.syned ? 's' : 'u');
966         ppX86RM(i->Xin.MulL.src);
967         return;
968      case Xin_Div:
969         vex_printf("%cdivl ", i->Xin.Div.syned ? 's' : 'u');
970         ppX86RM(i->Xin.Div.src);
971         return;
972      case Xin_Sh3232:
973         vex_printf("%sdl ", showX86ShiftOp(i->Xin.Sh3232.op));
974         if (i->Xin.Sh3232.amt == 0)
975           vex_printf(" %%cl,");
976         else
977            vex_printf(" $%d,", (Int)i->Xin.Sh3232.amt);
978         ppHRegX86(i->Xin.Sh3232.src);
979         vex_printf(",");
980         ppHRegX86(i->Xin.Sh3232.dst);
981         return;
982      case Xin_Push:
983         vex_printf("pushl ");
984         ppX86RMI(i->Xin.Push.src);
985         return;
986      case Xin_Call:
987         vex_printf("call%s[%d,",
988                    i->Xin.Call.cond==Xcc_ALWAYS
989                       ? "" : showX86CondCode(i->Xin.Call.cond),
990                    i->Xin.Call.regparms);
991         ppRetLoc(i->Xin.Call.rloc);
992         vex_printf("] 0x%x", i->Xin.Call.target);
993         break;
994      case Xin_XDirect:
995         vex_printf("(xDirect) ");
996         vex_printf("if (%%eflags.%s) { ",
997                    showX86CondCode(i->Xin.XDirect.cond));
998         vex_printf("movl $0x%x,", i->Xin.XDirect.dstGA);
999         ppX86AMode(i->Xin.XDirect.amEIP);
1000         vex_printf("; ");
1001         vex_printf("movl $disp_cp_chain_me_to_%sEP,%%edx; call *%%edx }",
1002                    i->Xin.XDirect.toFastEP ? "fast" : "slow");
1003         return;
1004      case Xin_XIndir:
1005         vex_printf("(xIndir) ");
1006         vex_printf("if (%%eflags.%s) { movl ",
1007                    showX86CondCode(i->Xin.XIndir.cond));
1008         ppHRegX86(i->Xin.XIndir.dstGA);
1009         vex_printf(",");
1010         ppX86AMode(i->Xin.XIndir.amEIP);
1011         vex_printf("; movl $disp_indir,%%edx; jmp *%%edx }");
1012         return;
1013      case Xin_XAssisted:
1014         vex_printf("(xAssisted) ");
1015         vex_printf("if (%%eflags.%s) { ",
1016                    showX86CondCode(i->Xin.XAssisted.cond));
1017         vex_printf("movl ");
1018         ppHRegX86(i->Xin.XAssisted.dstGA);
1019         vex_printf(",");
1020         ppX86AMode(i->Xin.XAssisted.amEIP);
1021         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%ebp",
1022                    (Int)i->Xin.XAssisted.jk);
1023         vex_printf("; movl $disp_assisted,%%edx; jmp *%%edx }");
1024         return;
1025      case Xin_CMov32:
1026         vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
1027         ppX86RM(i->Xin.CMov32.src);
1028         vex_printf(",");
1029         ppHRegX86(i->Xin.CMov32.dst);
1030         return;
1031      case Xin_LoadEX:
1032         vex_printf("mov%c%cl ",
1033                    i->Xin.LoadEX.syned ? 's' : 'z',
1034                    i->Xin.LoadEX.szSmall==1 ? 'b' : 'w');
1035         ppX86AMode(i->Xin.LoadEX.src);
1036         vex_printf(",");
1037         ppHRegX86(i->Xin.LoadEX.dst);
1038         return;
1039      case Xin_Store:
1040         vex_printf("mov%c ", i->Xin.Store.sz==1 ? 'b' : 'w');
1041         ppHRegX86(i->Xin.Store.src);
1042         vex_printf(",");
1043         ppX86AMode(i->Xin.Store.dst);
1044         return;
1045      case Xin_Set32:
1046         vex_printf("setl%s ", showX86CondCode(i->Xin.Set32.cond));
1047         ppHRegX86(i->Xin.Set32.dst);
1048         return;
1049      case Xin_Bsfr32:
1050         vex_printf("bs%cl ", i->Xin.Bsfr32.isFwds ? 'f' : 'r');
1051         ppHRegX86(i->Xin.Bsfr32.src);
1052         vex_printf(",");
1053         ppHRegX86(i->Xin.Bsfr32.dst);
1054         return;
1055      case Xin_MFence:
1056         vex_printf("mfence(%s)",
1057                    LibVEX_ppVexHwCaps(VexArchX86,i->Xin.MFence.hwcaps));
1058         return;
1059      case Xin_ACAS:
1060         vex_printf("lock cmpxchg%c ",
1061                     i->Xin.ACAS.sz==1 ? 'b'
1062                                       : i->Xin.ACAS.sz==2 ? 'w' : 'l');
1063         vex_printf("{%%eax->%%ebx},");
1064         ppX86AMode(i->Xin.ACAS.addr);
1065         return;
1066      case Xin_DACAS:
1067         vex_printf("lock cmpxchg8b {%%edx:%%eax->%%ecx:%%ebx},");
1068         ppX86AMode(i->Xin.DACAS.addr);
1069         return;
1070      case Xin_FpUnary:
1071         vex_printf("g%sD ", showX86FpOp(i->Xin.FpUnary.op));
1072         ppHRegX86(i->Xin.FpUnary.src);
1073         vex_printf(",");
1074         ppHRegX86(i->Xin.FpUnary.dst);
1075         break;
1076      case Xin_FpBinary:
1077         vex_printf("g%sD ", showX86FpOp(i->Xin.FpBinary.op));
1078         ppHRegX86(i->Xin.FpBinary.srcL);
1079         vex_printf(",");
1080         ppHRegX86(i->Xin.FpBinary.srcR);
1081         vex_printf(",");
1082         ppHRegX86(i->Xin.FpBinary.dst);
1083         break;
1084      case Xin_FpLdSt:
1085         if (i->Xin.FpLdSt.isLoad) {
1086            vex_printf("gld%c " ,  i->Xin.FpLdSt.sz==10 ? 'T'
1087                                   : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1088            ppX86AMode(i->Xin.FpLdSt.addr);
1089            vex_printf(", ");
1090            ppHRegX86(i->Xin.FpLdSt.reg);
1091         } else {
1092            vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
1093                                  : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1094            ppHRegX86(i->Xin.FpLdSt.reg);
1095            vex_printf(", ");
1096            ppX86AMode(i->Xin.FpLdSt.addr);
1097         }
1098         return;
1099      case Xin_FpLdStI:
1100         if (i->Xin.FpLdStI.isLoad) {
1101            vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1102                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1103            ppX86AMode(i->Xin.FpLdStI.addr);
1104            vex_printf(", ");
1105            ppHRegX86(i->Xin.FpLdStI.reg);
1106         } else {
1107            vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1108                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1109            ppHRegX86(i->Xin.FpLdStI.reg);
1110            vex_printf(", ");
1111            ppX86AMode(i->Xin.FpLdStI.addr);
1112         }
1113         return;
1114      case Xin_Fp64to32:
1115         vex_printf("gdtof ");
1116         ppHRegX86(i->Xin.Fp64to32.src);
1117         vex_printf(",");
1118         ppHRegX86(i->Xin.Fp64to32.dst);
1119         return;
1120      case Xin_FpCMov:
1121         vex_printf("gcmov%s ", showX86CondCode(i->Xin.FpCMov.cond));
1122         ppHRegX86(i->Xin.FpCMov.src);
1123         vex_printf(",");
1124         ppHRegX86(i->Xin.FpCMov.dst);
1125         return;
1126      case Xin_FpLdCW:
1127         vex_printf("fldcw ");
1128         ppX86AMode(i->Xin.FpLdCW.addr);
1129         return;
1130      case Xin_FpStSW_AX:
1131         vex_printf("fstsw %%ax");
1132         return;
1133      case Xin_FpCmp:
1134         vex_printf("gcmp ");
1135         ppHRegX86(i->Xin.FpCmp.srcL);
1136         vex_printf(",");
1137         ppHRegX86(i->Xin.FpCmp.srcR);
1138         vex_printf(",");
1139         ppHRegX86(i->Xin.FpCmp.dst);
1140         break;
1141      case Xin_SseConst:
1142         vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
1143         ppHRegX86(i->Xin.SseConst.dst);
1144         break;
1145      case Xin_SseLdSt:
1146         vex_printf("movups ");
1147         if (i->Xin.SseLdSt.isLoad) {
1148            ppX86AMode(i->Xin.SseLdSt.addr);
1149            vex_printf(",");
1150            ppHRegX86(i->Xin.SseLdSt.reg);
1151         } else {
1152            ppHRegX86(i->Xin.SseLdSt.reg);
1153            vex_printf(",");
1154            ppX86AMode(i->Xin.SseLdSt.addr);
1155         }
1156         return;
1157      case Xin_SseLdzLO:
1158         vex_printf("movs%s ", i->Xin.SseLdzLO.sz==4 ? "s" : "d");
1159         ppX86AMode(i->Xin.SseLdzLO.addr);
1160         vex_printf(",");
1161         ppHRegX86(i->Xin.SseLdzLO.reg);
1162         return;
1163      case Xin_Sse32Fx4:
1164         vex_printf("%sps ", showX86SseOp(i->Xin.Sse32Fx4.op));
1165         ppHRegX86(i->Xin.Sse32Fx4.src);
1166         vex_printf(",");
1167         ppHRegX86(i->Xin.Sse32Fx4.dst);
1168         return;
1169      case Xin_Sse32FLo:
1170         vex_printf("%sss ", showX86SseOp(i->Xin.Sse32FLo.op));
1171         ppHRegX86(i->Xin.Sse32FLo.src);
1172         vex_printf(",");
1173         ppHRegX86(i->Xin.Sse32FLo.dst);
1174         return;
1175      case Xin_Sse64Fx2:
1176         vex_printf("%spd ", showX86SseOp(i->Xin.Sse64Fx2.op));
1177         ppHRegX86(i->Xin.Sse64Fx2.src);
1178         vex_printf(",");
1179         ppHRegX86(i->Xin.Sse64Fx2.dst);
1180         return;
1181      case Xin_Sse64FLo:
1182         vex_printf("%ssd ", showX86SseOp(i->Xin.Sse64FLo.op));
1183         ppHRegX86(i->Xin.Sse64FLo.src);
1184         vex_printf(",");
1185         ppHRegX86(i->Xin.Sse64FLo.dst);
1186         return;
1187      case Xin_SseReRg:
1188         vex_printf("%s ", showX86SseOp(i->Xin.SseReRg.op));
1189         ppHRegX86(i->Xin.SseReRg.src);
1190         vex_printf(",");
1191         ppHRegX86(i->Xin.SseReRg.dst);
1192         return;
1193      case Xin_SseCMov:
1194         vex_printf("cmov%s ", showX86CondCode(i->Xin.SseCMov.cond));
1195         ppHRegX86(i->Xin.SseCMov.src);
1196         vex_printf(",");
1197         ppHRegX86(i->Xin.SseCMov.dst);
1198         return;
1199      case Xin_SseShuf:
1200         vex_printf("pshufd $0x%x,", i->Xin.SseShuf.order);
1201         ppHRegX86(i->Xin.SseShuf.src);
1202         vex_printf(",");
1203         ppHRegX86(i->Xin.SseShuf.dst);
1204         return;
1205      case Xin_EvCheck:
1206         vex_printf("(evCheck) decl ");
1207         ppX86AMode(i->Xin.EvCheck.amCounter);
1208         vex_printf("; jns nofail; jmp *");
1209         ppX86AMode(i->Xin.EvCheck.amFailAddr);
1210         vex_printf("; nofail:");
1211         return;
1212      case Xin_ProfInc:
1213         vex_printf("(profInc) addl $1,NotKnownYet; "
1214                    "adcl $0,NotKnownYet+4");
1215         return;
1216      default:
1217         vpanic("ppX86Instr");
1218   }
1219}
1220
1221/* --------- Helpers for register allocation. --------- */
1222
1223void getRegUsage_X86Instr (HRegUsage* u, X86Instr* i, Bool mode64)
1224{
1225   Bool unary;
1226   vassert(mode64 == False);
1227   initHRegUsage(u);
1228   switch (i->tag) {
1229      case Xin_Alu32R:
1230         addRegUsage_X86RMI(u, i->Xin.Alu32R.src);
1231         if (i->Xin.Alu32R.op == Xalu_MOV) {
1232            addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst);
1233            return;
1234         }
1235         if (i->Xin.Alu32R.op == Xalu_CMP) {
1236            addHRegUse(u, HRmRead, i->Xin.Alu32R.dst);
1237            return;
1238         }
1239         addHRegUse(u, HRmModify, i->Xin.Alu32R.dst);
1240         return;
1241      case Xin_Alu32M:
1242         addRegUsage_X86RI(u, i->Xin.Alu32M.src);
1243         addRegUsage_X86AMode(u, i->Xin.Alu32M.dst);
1244         return;
1245      case Xin_Sh32:
1246         addHRegUse(u, HRmModify, i->Xin.Sh32.dst);
1247         if (i->Xin.Sh32.src == 0)
1248            addHRegUse(u, HRmRead, hregX86_ECX());
1249         return;
1250      case Xin_Test32:
1251         addRegUsage_X86RM(u, i->Xin.Test32.dst, HRmRead);
1252         return;
1253      case Xin_Unary32:
1254         addHRegUse(u, HRmModify, i->Xin.Unary32.dst);
1255         return;
1256      case Xin_Lea32:
1257         addRegUsage_X86AMode(u, i->Xin.Lea32.am);
1258         addHRegUse(u, HRmWrite, i->Xin.Lea32.dst);
1259         return;
1260      case Xin_MulL:
1261         addRegUsage_X86RM(u, i->Xin.MulL.src, HRmRead);
1262         addHRegUse(u, HRmModify, hregX86_EAX());
1263         addHRegUse(u, HRmWrite, hregX86_EDX());
1264         return;
1265      case Xin_Div:
1266         addRegUsage_X86RM(u, i->Xin.Div.src, HRmRead);
1267         addHRegUse(u, HRmModify, hregX86_EAX());
1268         addHRegUse(u, HRmModify, hregX86_EDX());
1269         return;
1270      case Xin_Sh3232:
1271         addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
1272         addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
1273         if (i->Xin.Sh3232.amt == 0)
1274            addHRegUse(u, HRmRead, hregX86_ECX());
1275         return;
1276      case Xin_Push:
1277         addRegUsage_X86RMI(u, i->Xin.Push.src);
1278         addHRegUse(u, HRmModify, hregX86_ESP());
1279         return;
1280      case Xin_Call:
1281         /* This is a bit subtle. */
1282         /* First off, claim it trashes all the caller-saved regs
1283            which fall within the register allocator's jurisdiction.
1284            These I believe to be %eax %ecx %edx and all the xmm
1285            registers. */
1286         addHRegUse(u, HRmWrite, hregX86_EAX());
1287         addHRegUse(u, HRmWrite, hregX86_ECX());
1288         addHRegUse(u, HRmWrite, hregX86_EDX());
1289         addHRegUse(u, HRmWrite, hregX86_XMM0());
1290         addHRegUse(u, HRmWrite, hregX86_XMM1());
1291         addHRegUse(u, HRmWrite, hregX86_XMM2());
1292         addHRegUse(u, HRmWrite, hregX86_XMM3());
1293         addHRegUse(u, HRmWrite, hregX86_XMM4());
1294         addHRegUse(u, HRmWrite, hregX86_XMM5());
1295         addHRegUse(u, HRmWrite, hregX86_XMM6());
1296         addHRegUse(u, HRmWrite, hregX86_XMM7());
1297         /* Now we have to state any parameter-carrying registers
1298            which might be read.  This depends on the regparmness. */
1299         switch (i->Xin.Call.regparms) {
1300            case 3: addHRegUse(u, HRmRead, hregX86_ECX()); /*fallthru*/
1301            case 2: addHRegUse(u, HRmRead, hregX86_EDX()); /*fallthru*/
1302            case 1: addHRegUse(u, HRmRead, hregX86_EAX()); break;
1303            case 0: break;
1304            default: vpanic("getRegUsage_X86Instr:Call:regparms");
1305         }
1306         /* Finally, there is the issue that the insn trashes a
1307            register because the literal target address has to be
1308            loaded into a register.  Fortunately, for the 0/1/2
1309            regparm case, we can use EAX, EDX and ECX respectively, so
1310            this does not cause any further damage.  For the 3-regparm
1311            case, we'll have to choose another register arbitrarily --
1312            since A, D and C are used for parameters -- and so we might
1313            as well choose EDI. */
1314         if (i->Xin.Call.regparms == 3)
1315            addHRegUse(u, HRmWrite, hregX86_EDI());
1316         /* Upshot of this is that the assembler really must observe
1317            the here-stated convention of which register to use as an
1318            address temporary, depending on the regparmness: 0==EAX,
1319            1==EDX, 2==ECX, 3==EDI. */
1320         return;
1321      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
1322         conditionally exit the block.  Hence we only need to list (1)
1323         the registers that they read, and (2) the registers that they
1324         write in the case where the block is not exited.  (2) is
1325         empty, hence only (1) is relevant here. */
1326      case Xin_XDirect:
1327         addRegUsage_X86AMode(u, i->Xin.XDirect.amEIP);
1328         return;
1329      case Xin_XIndir:
1330         addHRegUse(u, HRmRead, i->Xin.XIndir.dstGA);
1331         addRegUsage_X86AMode(u, i->Xin.XIndir.amEIP);
1332         return;
1333      case Xin_XAssisted:
1334         addHRegUse(u, HRmRead, i->Xin.XAssisted.dstGA);
1335         addRegUsage_X86AMode(u, i->Xin.XAssisted.amEIP);
1336         return;
1337      case Xin_CMov32:
1338         addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
1339         addHRegUse(u, HRmModify, i->Xin.CMov32.dst);
1340         return;
1341      case Xin_LoadEX:
1342         addRegUsage_X86AMode(u, i->Xin.LoadEX.src);
1343         addHRegUse(u, HRmWrite, i->Xin.LoadEX.dst);
1344         return;
1345      case Xin_Store:
1346         addHRegUse(u, HRmRead, i->Xin.Store.src);
1347         addRegUsage_X86AMode(u, i->Xin.Store.dst);
1348         return;
1349      case Xin_Set32:
1350         addHRegUse(u, HRmWrite, i->Xin.Set32.dst);
1351         return;
1352      case Xin_Bsfr32:
1353         addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
1354         addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
1355         return;
1356      case Xin_MFence:
1357         return;
1358      case Xin_ACAS:
1359         addRegUsage_X86AMode(u, i->Xin.ACAS.addr);
1360         addHRegUse(u, HRmRead, hregX86_EBX());
1361         addHRegUse(u, HRmModify, hregX86_EAX());
1362         return;
1363      case Xin_DACAS:
1364         addRegUsage_X86AMode(u, i->Xin.DACAS.addr);
1365         addHRegUse(u, HRmRead, hregX86_ECX());
1366         addHRegUse(u, HRmRead, hregX86_EBX());
1367         addHRegUse(u, HRmModify, hregX86_EDX());
1368         addHRegUse(u, HRmModify, hregX86_EAX());
1369         return;
1370      case Xin_FpUnary:
1371         addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
1372         addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
1373         return;
1374      case Xin_FpBinary:
1375         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
1376         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
1377         addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
1378         return;
1379      case Xin_FpLdSt:
1380         addRegUsage_X86AMode(u, i->Xin.FpLdSt.addr);
1381         addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
1382                       i->Xin.FpLdSt.reg);
1383         return;
1384      case Xin_FpLdStI:
1385         addRegUsage_X86AMode(u, i->Xin.FpLdStI.addr);
1386         addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
1387                       i->Xin.FpLdStI.reg);
1388         return;
1389      case Xin_Fp64to32:
1390         addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
1391         addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
1392         return;
1393      case Xin_FpCMov:
1394         addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
1395         addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
1396         return;
1397      case Xin_FpLdCW:
1398         addRegUsage_X86AMode(u, i->Xin.FpLdCW.addr);
1399         return;
1400      case Xin_FpStSW_AX:
1401         addHRegUse(u, HRmWrite, hregX86_EAX());
1402         return;
1403      case Xin_FpCmp:
1404         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcL);
1405         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcR);
1406         addHRegUse(u, HRmWrite, i->Xin.FpCmp.dst);
1407         addHRegUse(u, HRmWrite, hregX86_EAX());
1408         return;
1409      case Xin_SseLdSt:
1410         addRegUsage_X86AMode(u, i->Xin.SseLdSt.addr);
1411         addHRegUse(u, i->Xin.SseLdSt.isLoad ? HRmWrite : HRmRead,
1412                       i->Xin.SseLdSt.reg);
1413         return;
1414      case Xin_SseLdzLO:
1415         addRegUsage_X86AMode(u, i->Xin.SseLdzLO.addr);
1416         addHRegUse(u, HRmWrite, i->Xin.SseLdzLO.reg);
1417         return;
1418      case Xin_SseConst:
1419         addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
1420         return;
1421      case Xin_Sse32Fx4:
1422         vassert(i->Xin.Sse32Fx4.op != Xsse_MOV);
1423         unary = toBool( i->Xin.Sse32Fx4.op == Xsse_RCPF
1424                         || i->Xin.Sse32Fx4.op == Xsse_RSQRTF
1425                         || i->Xin.Sse32Fx4.op == Xsse_SQRTF );
1426         addHRegUse(u, HRmRead, i->Xin.Sse32Fx4.src);
1427         addHRegUse(u, unary ? HRmWrite : HRmModify,
1428                       i->Xin.Sse32Fx4.dst);
1429         return;
1430      case Xin_Sse32FLo:
1431         vassert(i->Xin.Sse32FLo.op != Xsse_MOV);
1432         unary = toBool( i->Xin.Sse32FLo.op == Xsse_RCPF
1433                         || i->Xin.Sse32FLo.op == Xsse_RSQRTF
1434                         || i->Xin.Sse32FLo.op == Xsse_SQRTF );
1435         addHRegUse(u, HRmRead, i->Xin.Sse32FLo.src);
1436         addHRegUse(u, unary ? HRmWrite : HRmModify,
1437                       i->Xin.Sse32FLo.dst);
1438         return;
1439      case Xin_Sse64Fx2:
1440         vassert(i->Xin.Sse64Fx2.op != Xsse_MOV);
1441         unary = toBool( i->Xin.Sse64Fx2.op == Xsse_RCPF
1442                         || i->Xin.Sse64Fx2.op == Xsse_RSQRTF
1443                         || i->Xin.Sse64Fx2.op == Xsse_SQRTF );
1444         addHRegUse(u, HRmRead, i->Xin.Sse64Fx2.src);
1445         addHRegUse(u, unary ? HRmWrite : HRmModify,
1446                       i->Xin.Sse64Fx2.dst);
1447         return;
1448      case Xin_Sse64FLo:
1449         vassert(i->Xin.Sse64FLo.op != Xsse_MOV);
1450         unary = toBool( i->Xin.Sse64FLo.op == Xsse_RCPF
1451                         || i->Xin.Sse64FLo.op == Xsse_RSQRTF
1452                         || i->Xin.Sse64FLo.op == Xsse_SQRTF );
1453         addHRegUse(u, HRmRead, i->Xin.Sse64FLo.src);
1454         addHRegUse(u, unary ? HRmWrite : HRmModify,
1455                       i->Xin.Sse64FLo.dst);
1456         return;
1457      case Xin_SseReRg:
1458         if (i->Xin.SseReRg.op == Xsse_XOR
1459             && sameHReg(i->Xin.SseReRg.src, i->Xin.SseReRg.dst)) {
1460            /* reg-alloc needs to understand 'xor r,r' as a write of r */
1461            /* (as opposed to a rite of passage :-) */
1462            addHRegUse(u, HRmWrite, i->Xin.SseReRg.dst);
1463         } else {
1464            addHRegUse(u, HRmRead, i->Xin.SseReRg.src);
1465            addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV
1466                             ? HRmWrite : HRmModify,
1467                          i->Xin.SseReRg.dst);
1468         }
1469         return;
1470      case Xin_SseCMov:
1471         addHRegUse(u, HRmRead,   i->Xin.SseCMov.src);
1472         addHRegUse(u, HRmModify, i->Xin.SseCMov.dst);
1473         return;
1474      case Xin_SseShuf:
1475         addHRegUse(u, HRmRead,  i->Xin.SseShuf.src);
1476         addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
1477         return;
1478      case Xin_EvCheck:
1479         /* We expect both amodes only to mention %ebp, so this is in
1480            fact pointless, since %ebp isn't allocatable, but anyway.. */
1481         addRegUsage_X86AMode(u, i->Xin.EvCheck.amCounter);
1482         addRegUsage_X86AMode(u, i->Xin.EvCheck.amFailAddr);
1483         return;
1484      case Xin_ProfInc:
1485         /* does not use any registers. */
1486         return;
1487      default:
1488         ppX86Instr(i, False);
1489         vpanic("getRegUsage_X86Instr");
1490   }
1491}
1492
1493/* local helper */
1494static void mapReg( HRegRemap* m, HReg* r )
1495{
1496   *r = lookupHRegRemap(m, *r);
1497}
1498
1499void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 )
1500{
1501   vassert(mode64 == False);
1502   switch (i->tag) {
1503      case Xin_Alu32R:
1504         mapRegs_X86RMI(m, i->Xin.Alu32R.src);
1505         mapReg(m, &i->Xin.Alu32R.dst);
1506         return;
1507      case Xin_Alu32M:
1508         mapRegs_X86RI(m, i->Xin.Alu32M.src);
1509         mapRegs_X86AMode(m, i->Xin.Alu32M.dst);
1510         return;
1511      case Xin_Sh32:
1512         mapReg(m, &i->Xin.Sh32.dst);
1513         return;
1514      case Xin_Test32:
1515         mapRegs_X86RM(m, i->Xin.Test32.dst);
1516         return;
1517      case Xin_Unary32:
1518         mapReg(m, &i->Xin.Unary32.dst);
1519         return;
1520      case Xin_Lea32:
1521         mapRegs_X86AMode(m, i->Xin.Lea32.am);
1522         mapReg(m, &i->Xin.Lea32.dst);
1523         return;
1524      case Xin_MulL:
1525         mapRegs_X86RM(m, i->Xin.MulL.src);
1526         return;
1527      case Xin_Div:
1528         mapRegs_X86RM(m, i->Xin.Div.src);
1529         return;
1530      case Xin_Sh3232:
1531         mapReg(m, &i->Xin.Sh3232.src);
1532         mapReg(m, &i->Xin.Sh3232.dst);
1533         return;
1534      case Xin_Push:
1535         mapRegs_X86RMI(m, i->Xin.Push.src);
1536         return;
1537      case Xin_Call:
1538         return;
1539      case Xin_XDirect:
1540         mapRegs_X86AMode(m, i->Xin.XDirect.amEIP);
1541         return;
1542      case Xin_XIndir:
1543         mapReg(m, &i->Xin.XIndir.dstGA);
1544         mapRegs_X86AMode(m, i->Xin.XIndir.amEIP);
1545         return;
1546      case Xin_XAssisted:
1547         mapReg(m, &i->Xin.XAssisted.dstGA);
1548         mapRegs_X86AMode(m, i->Xin.XAssisted.amEIP);
1549         return;
1550      case Xin_CMov32:
1551         mapRegs_X86RM(m, i->Xin.CMov32.src);
1552         mapReg(m, &i->Xin.CMov32.dst);
1553         return;
1554      case Xin_LoadEX:
1555         mapRegs_X86AMode(m, i->Xin.LoadEX.src);
1556         mapReg(m, &i->Xin.LoadEX.dst);
1557         return;
1558      case Xin_Store:
1559         mapReg(m, &i->Xin.Store.src);
1560         mapRegs_X86AMode(m, i->Xin.Store.dst);
1561         return;
1562      case Xin_Set32:
1563         mapReg(m, &i->Xin.Set32.dst);
1564         return;
1565      case Xin_Bsfr32:
1566         mapReg(m, &i->Xin.Bsfr32.src);
1567         mapReg(m, &i->Xin.Bsfr32.dst);
1568         return;
1569      case Xin_MFence:
1570         return;
1571      case Xin_ACAS:
1572         mapRegs_X86AMode(m, i->Xin.ACAS.addr);
1573         return;
1574      case Xin_DACAS:
1575         mapRegs_X86AMode(m, i->Xin.DACAS.addr);
1576         return;
1577      case Xin_FpUnary:
1578         mapReg(m, &i->Xin.FpUnary.src);
1579         mapReg(m, &i->Xin.FpUnary.dst);
1580         return;
1581      case Xin_FpBinary:
1582         mapReg(m, &i->Xin.FpBinary.srcL);
1583         mapReg(m, &i->Xin.FpBinary.srcR);
1584         mapReg(m, &i->Xin.FpBinary.dst);
1585         return;
1586      case Xin_FpLdSt:
1587         mapRegs_X86AMode(m, i->Xin.FpLdSt.addr);
1588         mapReg(m, &i->Xin.FpLdSt.reg);
1589         return;
1590      case Xin_FpLdStI:
1591         mapRegs_X86AMode(m, i->Xin.FpLdStI.addr);
1592         mapReg(m, &i->Xin.FpLdStI.reg);
1593         return;
1594      case Xin_Fp64to32:
1595         mapReg(m, &i->Xin.Fp64to32.src);
1596         mapReg(m, &i->Xin.Fp64to32.dst);
1597         return;
1598      case Xin_FpCMov:
1599         mapReg(m, &i->Xin.FpCMov.src);
1600         mapReg(m, &i->Xin.FpCMov.dst);
1601         return;
1602      case Xin_FpLdCW:
1603         mapRegs_X86AMode(m, i->Xin.FpLdCW.addr);
1604         return;
1605      case Xin_FpStSW_AX:
1606         return;
1607      case Xin_FpCmp:
1608         mapReg(m, &i->Xin.FpCmp.srcL);
1609         mapReg(m, &i->Xin.FpCmp.srcR);
1610         mapReg(m, &i->Xin.FpCmp.dst);
1611         return;
1612      case Xin_SseConst:
1613         mapReg(m, &i->Xin.SseConst.dst);
1614         return;
1615      case Xin_SseLdSt:
1616         mapReg(m, &i->Xin.SseLdSt.reg);
1617         mapRegs_X86AMode(m, i->Xin.SseLdSt.addr);
1618         break;
1619      case Xin_SseLdzLO:
1620         mapReg(m, &i->Xin.SseLdzLO.reg);
1621         mapRegs_X86AMode(m, i->Xin.SseLdzLO.addr);
1622         break;
1623      case Xin_Sse32Fx4:
1624         mapReg(m, &i->Xin.Sse32Fx4.src);
1625         mapReg(m, &i->Xin.Sse32Fx4.dst);
1626         return;
1627      case Xin_Sse32FLo:
1628         mapReg(m, &i->Xin.Sse32FLo.src);
1629         mapReg(m, &i->Xin.Sse32FLo.dst);
1630         return;
1631      case Xin_Sse64Fx2:
1632         mapReg(m, &i->Xin.Sse64Fx2.src);
1633         mapReg(m, &i->Xin.Sse64Fx2.dst);
1634         return;
1635      case Xin_Sse64FLo:
1636         mapReg(m, &i->Xin.Sse64FLo.src);
1637         mapReg(m, &i->Xin.Sse64FLo.dst);
1638         return;
1639      case Xin_SseReRg:
1640         mapReg(m, &i->Xin.SseReRg.src);
1641         mapReg(m, &i->Xin.SseReRg.dst);
1642         return;
1643      case Xin_SseCMov:
1644         mapReg(m, &i->Xin.SseCMov.src);
1645         mapReg(m, &i->Xin.SseCMov.dst);
1646         return;
1647      case Xin_SseShuf:
1648         mapReg(m, &i->Xin.SseShuf.src);
1649         mapReg(m, &i->Xin.SseShuf.dst);
1650         return;
1651      case Xin_EvCheck:
1652         /* We expect both amodes only to mention %ebp, so this is in
1653            fact pointless, since %ebp isn't allocatable, but anyway.. */
1654         mapRegs_X86AMode(m, i->Xin.EvCheck.amCounter);
1655         mapRegs_X86AMode(m, i->Xin.EvCheck.amFailAddr);
1656         return;
1657      case Xin_ProfInc:
1658         /* does not use any registers. */
1659         return;
1660
1661      default:
1662         ppX86Instr(i, mode64);
1663         vpanic("mapRegs_X86Instr");
1664   }
1665}
1666
1667/* Figure out if i represents a reg-reg move, and if so assign the
1668   source and destination to *src and *dst.  If in doubt say No.  Used
1669   by the register allocator to do move coalescing.
1670*/
1671Bool isMove_X86Instr ( X86Instr* i, HReg* src, HReg* dst )
1672{
1673   /* Moves between integer regs */
1674   if (i->tag == Xin_Alu32R) {
1675      if (i->Xin.Alu32R.op != Xalu_MOV)
1676         return False;
1677      if (i->Xin.Alu32R.src->tag != Xrmi_Reg)
1678         return False;
1679      *src = i->Xin.Alu32R.src->Xrmi.Reg.reg;
1680      *dst = i->Xin.Alu32R.dst;
1681      return True;
1682   }
1683   /* Moves between FP regs */
1684   if (i->tag == Xin_FpUnary) {
1685      if (i->Xin.FpUnary.op != Xfp_MOV)
1686         return False;
1687      *src = i->Xin.FpUnary.src;
1688      *dst = i->Xin.FpUnary.dst;
1689      return True;
1690   }
1691   if (i->tag == Xin_SseReRg) {
1692      if (i->Xin.SseReRg.op != Xsse_MOV)
1693         return False;
1694      *src = i->Xin.SseReRg.src;
1695      *dst = i->Xin.SseReRg.dst;
1696      return True;
1697   }
1698   return False;
1699}
1700
1701
1702/* Generate x86 spill/reload instructions under the direction of the
1703   register allocator.  Note it's critical these don't write the
1704   condition codes. */
1705
1706void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1707                    HReg rreg, Int offsetB, Bool mode64 )
1708{
1709   X86AMode* am;
1710   vassert(offsetB >= 0);
1711   vassert(!hregIsVirtual(rreg));
1712   vassert(mode64 == False);
1713   *i1 = *i2 = NULL;
1714   am = X86AMode_IR(offsetB, hregX86_EBP());
1715   switch (hregClass(rreg)) {
1716      case HRcInt32:
1717         *i1 = X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
1718         return;
1719      case HRcFlt64:
1720         *i1 = X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
1721         return;
1722      case HRcVec128:
1723         *i1 = X86Instr_SseLdSt ( False/*store*/, rreg, am );
1724         return;
1725      default:
1726         ppHRegClass(hregClass(rreg));
1727         vpanic("genSpill_X86: unimplemented regclass");
1728   }
1729}
1730
1731void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1732                     HReg rreg, Int offsetB, Bool mode64 )
1733{
1734   X86AMode* am;
1735   vassert(offsetB >= 0);
1736   vassert(!hregIsVirtual(rreg));
1737   vassert(mode64 == False);
1738   *i1 = *i2 = NULL;
1739   am = X86AMode_IR(offsetB, hregX86_EBP());
1740   switch (hregClass(rreg)) {
1741      case HRcInt32:
1742         *i1 = X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
1743         return;
1744      case HRcFlt64:
1745         *i1 = X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
1746         return;
1747      case HRcVec128:
1748         *i1 = X86Instr_SseLdSt ( True/*load*/, rreg, am );
1749         return;
1750      default:
1751         ppHRegClass(hregClass(rreg));
1752         vpanic("genReload_X86: unimplemented regclass");
1753   }
1754}
1755
1756/* The given instruction reads the specified vreg exactly once, and
1757   that vreg is currently located at the given spill offset.  If
1758   possible, return a variant of the instruction to one which instead
1759   references the spill slot directly. */
1760
1761X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
1762{
1763   vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
1764
1765   /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
1766      Convert to: src=RMI_Mem, dst=Reg
1767   */
1768   if (i->tag == Xin_Alu32R
1769       && (i->Xin.Alu32R.op == Xalu_MOV || i->Xin.Alu32R.op == Xalu_OR
1770           || i->Xin.Alu32R.op == Xalu_XOR)
1771       && i->Xin.Alu32R.src->tag == Xrmi_Reg
1772       && sameHReg(i->Xin.Alu32R.src->Xrmi.Reg.reg, vreg)) {
1773      vassert(! sameHReg(i->Xin.Alu32R.dst, vreg));
1774      return X86Instr_Alu32R(
1775                i->Xin.Alu32R.op,
1776                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP())),
1777                i->Xin.Alu32R.dst
1778             );
1779   }
1780
1781   /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
1782      Convert to: src=RI_Imm, dst=Mem
1783   */
1784   if (i->tag == Xin_Alu32R
1785       && (i->Xin.Alu32R.op == Xalu_CMP)
1786       && i->Xin.Alu32R.src->tag == Xrmi_Imm
1787       && sameHReg(i->Xin.Alu32R.dst, vreg)) {
1788      return X86Instr_Alu32M(
1789                i->Xin.Alu32R.op,
1790		X86RI_Imm( i->Xin.Alu32R.src->Xrmi.Imm.imm32 ),
1791                X86AMode_IR( spill_off, hregX86_EBP())
1792             );
1793   }
1794
1795   /* Deal with form: Push(RMI_Reg)
1796      Convert to: Push(RMI_Mem)
1797   */
1798   if (i->tag == Xin_Push
1799       && i->Xin.Push.src->tag == Xrmi_Reg
1800       && sameHReg(i->Xin.Push.src->Xrmi.Reg.reg, vreg)) {
1801      return X86Instr_Push(
1802                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP()))
1803             );
1804   }
1805
1806   /* Deal with form: CMov32(src=RM_Reg, dst) where vreg == src
1807      Convert to CMov32(RM_Mem, dst) */
1808   if (i->tag == Xin_CMov32
1809       && i->Xin.CMov32.src->tag == Xrm_Reg
1810       && sameHReg(i->Xin.CMov32.src->Xrm.Reg.reg, vreg)) {
1811      vassert(! sameHReg(i->Xin.CMov32.dst, vreg));
1812      return X86Instr_CMov32(
1813                i->Xin.CMov32.cond,
1814                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() )),
1815                i->Xin.CMov32.dst
1816             );
1817   }
1818
1819   /* Deal with form: Test32(imm,RM_Reg vreg) -> Test32(imm,amode) */
1820   if (i->tag == Xin_Test32
1821       && i->Xin.Test32.dst->tag == Xrm_Reg
1822       && sameHReg(i->Xin.Test32.dst->Xrm.Reg.reg, vreg)) {
1823      return X86Instr_Test32(
1824                i->Xin.Test32.imm32,
1825                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() ) )
1826             );
1827   }
1828
1829   return NULL;
1830}
1831
1832
1833/* --------- The x86 assembler (bleh.) --------- */
1834
1835static UChar iregNo ( HReg r )
1836{
1837   UInt n;
1838   vassert(hregClass(r) == HRcInt32);
1839   vassert(!hregIsVirtual(r));
1840   n = hregNumber(r);
1841   vassert(n <= 7);
1842   return toUChar(n);
1843}
1844
1845static UInt fregNo ( HReg r )
1846{
1847   UInt n;
1848   vassert(hregClass(r) == HRcFlt64);
1849   vassert(!hregIsVirtual(r));
1850   n = hregNumber(r);
1851   vassert(n <= 5);
1852   return n;
1853}
1854
1855static UInt vregNo ( HReg r )
1856{
1857   UInt n;
1858   vassert(hregClass(r) == HRcVec128);
1859   vassert(!hregIsVirtual(r));
1860   n = hregNumber(r);
1861   vassert(n <= 7);
1862   return n;
1863}
1864
1865static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
1866{
1867   vassert(mod < 4);
1868   vassert((reg|regmem) < 8);
1869   return toUChar( ((mod & 3) << 6)
1870                   | ((reg & 7) << 3)
1871                   | (regmem & 7) );
1872}
1873
1874static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
1875{
1876   vassert(shift < 4);
1877   vassert((regindex|regbase) < 8);
1878   return toUChar( ((shift & 3) << 6)
1879                   | ((regindex & 7) << 3)
1880                   | (regbase & 7) );
1881}
1882
1883static UChar* emit32 ( UChar* p, UInt w32 )
1884{
1885   *p++ = toUChar( w32        & 0x000000FF);
1886   *p++ = toUChar((w32 >>  8) & 0x000000FF);
1887   *p++ = toUChar((w32 >> 16) & 0x000000FF);
1888   *p++ = toUChar((w32 >> 24) & 0x000000FF);
1889   return p;
1890}
1891
1892/* Does a sign-extend of the lowest 8 bits give
1893   the original number? */
1894static Bool fits8bits ( UInt w32 )
1895{
1896   Int i32 = (Int)w32;
1897   return toBool(i32 == ((i32 << 24) >> 24));
1898}
1899
1900
1901/* Forming mod-reg-rm bytes and scale-index-base bytes.
1902
1903     greg,  0(ereg)    |  ereg != ESP && ereg != EBP
1904                       =  00 greg ereg
1905
1906     greg,  d8(ereg)   |  ereg != ESP
1907                       =  01 greg ereg, d8
1908
1909     greg,  d32(ereg)  |  ereg != ESP
1910                       =  10 greg ereg, d32
1911
1912     greg,  d8(%esp)   =  01 greg 100, 0x24, d8
1913
1914     -----------------------------------------------
1915
1916     greg,  d8(base,index,scale)
1917               |  index != ESP
1918               =  01 greg 100, scale index base, d8
1919
1920     greg,  d32(base,index,scale)
1921               |  index != ESP
1922               =  10 greg 100, scale index base, d32
1923*/
1924static UChar* doAMode_M ( UChar* p, HReg greg, X86AMode* am )
1925{
1926   if (am->tag == Xam_IR) {
1927      if (am->Xam.IR.imm == 0
1928          && ! sameHReg(am->Xam.IR.reg, hregX86_ESP())
1929          && ! sameHReg(am->Xam.IR.reg, hregX86_EBP()) ) {
1930         *p++ = mkModRegRM(0, iregNo(greg), iregNo(am->Xam.IR.reg));
1931         return p;
1932      }
1933      if (fits8bits(am->Xam.IR.imm)
1934          && ! sameHReg(am->Xam.IR.reg, hregX86_ESP())) {
1935         *p++ = mkModRegRM(1, iregNo(greg), iregNo(am->Xam.IR.reg));
1936         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1937         return p;
1938      }
1939      if (! sameHReg(am->Xam.IR.reg, hregX86_ESP())) {
1940         *p++ = mkModRegRM(2, iregNo(greg), iregNo(am->Xam.IR.reg));
1941         p = emit32(p, am->Xam.IR.imm);
1942         return p;
1943      }
1944      if (sameHReg(am->Xam.IR.reg, hregX86_ESP())
1945          && fits8bits(am->Xam.IR.imm)) {
1946 	 *p++ = mkModRegRM(1, iregNo(greg), 4);
1947         *p++ = 0x24;
1948         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1949         return p;
1950      }
1951      ppX86AMode(am);
1952      vpanic("doAMode_M: can't emit amode IR");
1953      /*NOTREACHED*/
1954   }
1955   if (am->tag == Xam_IRRS) {
1956      if (fits8bits(am->Xam.IRRS.imm)
1957          && ! sameHReg(am->Xam.IRRS.index, hregX86_ESP())) {
1958         *p++ = mkModRegRM(1, iregNo(greg), 4);
1959         *p++ = mkSIB(am->Xam.IRRS.shift, iregNo(am->Xam.IRRS.index),
1960                                          iregNo(am->Xam.IRRS.base));
1961         *p++ = toUChar(am->Xam.IRRS.imm & 0xFF);
1962         return p;
1963      }
1964      if (! sameHReg(am->Xam.IRRS.index, hregX86_ESP())) {
1965         *p++ = mkModRegRM(2, iregNo(greg), 4);
1966         *p++ = mkSIB(am->Xam.IRRS.shift, iregNo(am->Xam.IRRS.index),
1967                                          iregNo(am->Xam.IRRS.base));
1968         p = emit32(p, am->Xam.IRRS.imm);
1969         return p;
1970      }
1971      ppX86AMode(am);
1972      vpanic("doAMode_M: can't emit amode IRRS");
1973      /*NOTREACHED*/
1974   }
1975   vpanic("doAMode_M: unknown amode");
1976   /*NOTREACHED*/
1977}
1978
1979
1980/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
1981static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
1982{
1983   *p++ = mkModRegRM(3, iregNo(greg), iregNo(ereg));
1984   return p;
1985}
1986
1987
1988/* Emit ffree %st(7) */
1989static UChar* do_ffree_st7 ( UChar* p )
1990{
1991   *p++ = 0xDD;
1992   *p++ = 0xC7;
1993   return p;
1994}
1995
1996/* Emit fstp %st(i), 1 <= i <= 7 */
1997static UChar* do_fstp_st ( UChar* p, Int i )
1998{
1999   vassert(1 <= i && i <= 7);
2000   *p++ = 0xDD;
2001   *p++ = toUChar(0xD8+i);
2002   return p;
2003}
2004
2005/* Emit fld %st(i), 0 <= i <= 6 */
2006static UChar* do_fld_st ( UChar* p, Int i )
2007{
2008   vassert(0 <= i && i <= 6);
2009   *p++ = 0xD9;
2010   *p++ = toUChar(0xC0+i);
2011   return p;
2012}
2013
2014/* Emit f<op> %st(0) */
2015static UChar* do_fop1_st ( UChar* p, X86FpOp op )
2016{
2017   switch (op) {
2018      case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
2019      case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
2020      case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
2021      case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
2022      case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
2023      case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
2024      case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
2025      case Xfp_MOV:    break;
2026      case Xfp_TAN:
2027         /* fptan pushes 1.0 on the FP stack, except when the argument
2028            is out of range.  Hence we have to do the instruction,
2029            then inspect C2 to see if there is an out of range
2030            condition.  If there is, we skip the fincstp that is used
2031            by the in-range case to get rid of this extra 1.0
2032            value. */
2033         p = do_ffree_st7(p); /* since fptan sometimes pushes 1.0 */
2034         *p++ = 0xD9; *p++ = 0xF2; // fptan
2035         *p++ = 0x50;              // pushl %eax
2036         *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
2037         *p++ = 0x66; *p++ = 0xA9;
2038         *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
2039         *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
2040         *p++ = 0xD9; *p++ = 0xF7; // fincstp
2041         *p++ = 0x58;              // after_fincstp: popl %eax
2042         break;
2043      default:
2044         vpanic("do_fop1_st: unknown op");
2045   }
2046   return p;
2047}
2048
2049/* Emit f<op> %st(i), 1 <= i <= 5 */
2050static UChar* do_fop2_st ( UChar* p, X86FpOp op, Int i )
2051{
2052#  define fake(_n) mkHReg((_n), HRcInt32, False)
2053   Int subopc;
2054   switch (op) {
2055      case Xfp_ADD: subopc = 0; break;
2056      case Xfp_SUB: subopc = 4; break;
2057      case Xfp_MUL: subopc = 1; break;
2058      case Xfp_DIV: subopc = 6; break;
2059      default: vpanic("do_fop2_st: unknown op");
2060   }
2061   *p++ = 0xD8;
2062   p    = doAMode_R(p, fake(subopc), fake(i));
2063   return p;
2064#  undef fake
2065}
2066
2067/* Push a 32-bit word on the stack.  The word depends on tags[3:0];
2068each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
2069*/
2070static UChar* push_word_from_tags ( UChar* p, UShort tags )
2071{
2072   UInt w;
2073   vassert(0 == (tags & ~0xF));
2074   if (tags == 0) {
2075      /* pushl $0x00000000 */
2076      *p++ = 0x6A;
2077      *p++ = 0x00;
2078   }
2079   else
2080   /* pushl $0xFFFFFFFF */
2081   if (tags == 0xF) {
2082      *p++ = 0x6A;
2083      *p++ = 0xFF;
2084   } else {
2085      vassert(0); /* awaiting test case */
2086      w = 0;
2087      if (tags & 1) w |= 0x000000FF;
2088      if (tags & 2) w |= 0x0000FF00;
2089      if (tags & 4) w |= 0x00FF0000;
2090      if (tags & 8) w |= 0xFF000000;
2091      *p++ = 0x68;
2092      p = emit32(p, w);
2093   }
2094   return p;
2095}
2096
2097/* Emit an instruction into buf and return the number of bytes used.
2098   Note that buf is not the insn's final place, and therefore it is
2099   imperative to emit position-independent code.  If the emitted
2100   instruction was a profiler inc, set *is_profInc to True, else
2101   leave it unchanged. */
2102
2103Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc,
2104                    UChar* buf, Int nbuf, X86Instr* i,
2105                    Bool mode64,
2106                    void* disp_cp_chain_me_to_slowEP,
2107                    void* disp_cp_chain_me_to_fastEP,
2108                    void* disp_cp_xindir,
2109                    void* disp_cp_xassisted )
2110{
2111   UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2112
2113   UInt   xtra;
2114   UChar* p = &buf[0];
2115   UChar* ptmp;
2116   vassert(nbuf >= 32);
2117   vassert(mode64 == False);
2118
2119   /* Wrap an integer as a int register, for use assembling
2120      GrpN insns, in which the greg field is used as a sub-opcode
2121      and does not really contain a register. */
2122#  define fake(_n) mkHReg((_n), HRcInt32, False)
2123
2124   /* vex_printf("asm  ");ppX86Instr(i, mode64); vex_printf("\n"); */
2125
2126   switch (i->tag) {
2127
2128   case Xin_Alu32R:
2129      /* Deal specially with MOV */
2130      if (i->Xin.Alu32R.op == Xalu_MOV) {
2131         switch (i->Xin.Alu32R.src->tag) {
2132            case Xrmi_Imm:
2133               *p++ = toUChar(0xB8 + iregNo(i->Xin.Alu32R.dst));
2134               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2135               goto done;
2136            case Xrmi_Reg:
2137               *p++ = 0x89;
2138               p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2139                                i->Xin.Alu32R.dst);
2140               goto done;
2141            case Xrmi_Mem:
2142               *p++ = 0x8B;
2143               p = doAMode_M(p, i->Xin.Alu32R.dst,
2144                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2145               goto done;
2146            default:
2147               goto bad;
2148         }
2149      }
2150      /* MUL */
2151      if (i->Xin.Alu32R.op == Xalu_MUL) {
2152         switch (i->Xin.Alu32R.src->tag) {
2153            case Xrmi_Reg:
2154               *p++ = 0x0F;
2155               *p++ = 0xAF;
2156               p = doAMode_R(p, i->Xin.Alu32R.dst,
2157                                i->Xin.Alu32R.src->Xrmi.Reg.reg);
2158               goto done;
2159            case Xrmi_Mem:
2160               *p++ = 0x0F;
2161               *p++ = 0xAF;
2162               p = doAMode_M(p, i->Xin.Alu32R.dst,
2163                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2164               goto done;
2165            case Xrmi_Imm:
2166               if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2167                  *p++ = 0x6B;
2168                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2169                  *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2170               } else {
2171                  *p++ = 0x69;
2172                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2173                  p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2174               }
2175               goto done;
2176            default:
2177               goto bad;
2178         }
2179      }
2180      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2181      opc = opc_rr = subopc_imm = opc_imma = 0;
2182      switch (i->Xin.Alu32R.op) {
2183         case Xalu_ADC: opc = 0x13; opc_rr = 0x11;
2184                        subopc_imm = 2; opc_imma = 0x15; break;
2185         case Xalu_ADD: opc = 0x03; opc_rr = 0x01;
2186                        subopc_imm = 0; opc_imma = 0x05; break;
2187         case Xalu_SUB: opc = 0x2B; opc_rr = 0x29;
2188                        subopc_imm = 5; opc_imma = 0x2D; break;
2189         case Xalu_SBB: opc = 0x1B; opc_rr = 0x19;
2190                        subopc_imm = 3; opc_imma = 0x1D; break;
2191         case Xalu_AND: opc = 0x23; opc_rr = 0x21;
2192                        subopc_imm = 4; opc_imma = 0x25; break;
2193         case Xalu_XOR: opc = 0x33; opc_rr = 0x31;
2194                        subopc_imm = 6; opc_imma = 0x35; break;
2195         case Xalu_OR:  opc = 0x0B; opc_rr = 0x09;
2196                        subopc_imm = 1; opc_imma = 0x0D; break;
2197         case Xalu_CMP: opc = 0x3B; opc_rr = 0x39;
2198                        subopc_imm = 7; opc_imma = 0x3D; break;
2199         default: goto bad;
2200      }
2201      switch (i->Xin.Alu32R.src->tag) {
2202         case Xrmi_Imm:
2203            if (sameHReg(i->Xin.Alu32R.dst, hregX86_EAX())
2204                && !fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2205               *p++ = toUChar(opc_imma);
2206               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2207            } else
2208            if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2209               *p++ = 0x83;
2210               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2211               *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2212            } else {
2213               *p++ = 0x81;
2214               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2215               p    = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2216            }
2217            goto done;
2218         case Xrmi_Reg:
2219            *p++ = toUChar(opc_rr);
2220            p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2221                             i->Xin.Alu32R.dst);
2222            goto done;
2223         case Xrmi_Mem:
2224            *p++ = toUChar(opc);
2225            p = doAMode_M(p, i->Xin.Alu32R.dst,
2226                             i->Xin.Alu32R.src->Xrmi.Mem.am);
2227            goto done;
2228         default:
2229            goto bad;
2230      }
2231      break;
2232
2233   case Xin_Alu32M:
2234      /* Deal specially with MOV */
2235      if (i->Xin.Alu32M.op == Xalu_MOV) {
2236         switch (i->Xin.Alu32M.src->tag) {
2237            case Xri_Reg:
2238               *p++ = 0x89;
2239               p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2240                                i->Xin.Alu32M.dst);
2241               goto done;
2242            case Xri_Imm:
2243               *p++ = 0xC7;
2244               p = doAMode_M(p, fake(0), i->Xin.Alu32M.dst);
2245               p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2246               goto done;
2247            default:
2248               goto bad;
2249         }
2250      }
2251      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
2252         allowed here. */
2253      opc = subopc_imm = opc_imma = 0;
2254      switch (i->Xin.Alu32M.op) {
2255         case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
2256         case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
2257         case Xalu_CMP: opc = 0x39; subopc_imm = 7; break;
2258         default: goto bad;
2259      }
2260      switch (i->Xin.Alu32M.src->tag) {
2261         case Xri_Reg:
2262            *p++ = toUChar(opc);
2263            p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2264                             i->Xin.Alu32M.dst);
2265            goto done;
2266         case Xri_Imm:
2267            if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
2268               *p++ = 0x83;
2269               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2270               *p++ = toUChar(0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32);
2271               goto done;
2272            } else {
2273               *p++ = 0x81;
2274               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2275               p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2276               goto done;
2277            }
2278         default:
2279            goto bad;
2280      }
2281      break;
2282
2283   case Xin_Sh32:
2284      opc_cl = opc_imm = subopc = 0;
2285      switch (i->Xin.Sh32.op) {
2286         case Xsh_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2287         case Xsh_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2288         case Xsh_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2289         default: goto bad;
2290      }
2291      if (i->Xin.Sh32.src == 0) {
2292         *p++ = toUChar(opc_cl);
2293         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2294      } else {
2295         *p++ = toUChar(opc_imm);
2296         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2297         *p++ = (UChar)(i->Xin.Sh32.src);
2298      }
2299      goto done;
2300
2301   case Xin_Test32:
2302      if (i->Xin.Test32.dst->tag == Xrm_Reg) {
2303         /* testl $imm32, %reg */
2304         *p++ = 0xF7;
2305         p = doAMode_R(p, fake(0), i->Xin.Test32.dst->Xrm.Reg.reg);
2306         p = emit32(p, i->Xin.Test32.imm32);
2307         goto done;
2308      } else {
2309         /* testl $imm32, amode */
2310         *p++ = 0xF7;
2311         p = doAMode_M(p, fake(0), i->Xin.Test32.dst->Xrm.Mem.am);
2312         p = emit32(p, i->Xin.Test32.imm32);
2313         goto done;
2314      }
2315
2316   case Xin_Unary32:
2317      if (i->Xin.Unary32.op == Xun_NOT) {
2318         *p++ = 0xF7;
2319         p = doAMode_R(p, fake(2), i->Xin.Unary32.dst);
2320         goto done;
2321      }
2322      if (i->Xin.Unary32.op == Xun_NEG) {
2323         *p++ = 0xF7;
2324         p = doAMode_R(p, fake(3), i->Xin.Unary32.dst);
2325         goto done;
2326      }
2327      break;
2328
2329   case Xin_Lea32:
2330      *p++ = 0x8D;
2331      p = doAMode_M(p, i->Xin.Lea32.dst, i->Xin.Lea32.am);
2332      goto done;
2333
2334   case Xin_MulL:
2335      subopc = i->Xin.MulL.syned ? 5 : 4;
2336      *p++ = 0xF7;
2337      switch (i->Xin.MulL.src->tag)  {
2338         case Xrm_Mem:
2339            p = doAMode_M(p, fake(subopc),
2340                             i->Xin.MulL.src->Xrm.Mem.am);
2341            goto done;
2342         case Xrm_Reg:
2343            p = doAMode_R(p, fake(subopc),
2344                             i->Xin.MulL.src->Xrm.Reg.reg);
2345            goto done;
2346         default:
2347            goto bad;
2348      }
2349      break;
2350
2351   case Xin_Div:
2352      subopc = i->Xin.Div.syned ? 7 : 6;
2353      *p++ = 0xF7;
2354      switch (i->Xin.Div.src->tag)  {
2355         case Xrm_Mem:
2356            p = doAMode_M(p, fake(subopc),
2357                             i->Xin.Div.src->Xrm.Mem.am);
2358            goto done;
2359         case Xrm_Reg:
2360            p = doAMode_R(p, fake(subopc),
2361                             i->Xin.Div.src->Xrm.Reg.reg);
2362            goto done;
2363         default:
2364            goto bad;
2365      }
2366      break;
2367
2368   case Xin_Sh3232:
2369      vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
2370      if (i->Xin.Sh3232.amt == 0) {
2371         /* shldl/shrdl by %cl */
2372         *p++ = 0x0F;
2373         if (i->Xin.Sh3232.op == Xsh_SHL) {
2374            *p++ = 0xA5;
2375         } else {
2376            *p++ = 0xAD;
2377         }
2378         p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
2379         goto done;
2380      }
2381      break;
2382
2383   case Xin_Push:
2384      switch (i->Xin.Push.src->tag) {
2385         case Xrmi_Mem:
2386            *p++ = 0xFF;
2387            p = doAMode_M(p, fake(6), i->Xin.Push.src->Xrmi.Mem.am);
2388            goto done;
2389         case Xrmi_Imm:
2390            *p++ = 0x68;
2391            p = emit32(p, i->Xin.Push.src->Xrmi.Imm.imm32);
2392            goto done;
2393         case Xrmi_Reg:
2394            *p++ = toUChar(0x50 + iregNo(i->Xin.Push.src->Xrmi.Reg.reg));
2395            goto done;
2396        default:
2397            goto bad;
2398      }
2399
2400   case Xin_Call:
2401      if (i->Xin.Call.cond != Xcc_ALWAYS
2402          && i->Xin.Call.rloc.pri != RLPri_None) {
2403         /* The call might not happen (it isn't unconditional) and it
2404            returns a result.  In this case we will need to generate a
2405            control flow diamond to put 0x555..555 in the return
2406            register(s) in the case where the call doesn't happen.  If
2407            this ever becomes necessary, maybe copy code from the ARM
2408            equivalent.  Until that day, just give up. */
2409         goto bad;
2410      }
2411      /* See detailed comment for Xin_Call in getRegUsage_X86Instr above
2412         for explanation of this. */
2413      switch (i->Xin.Call.regparms) {
2414         case 0: irno = iregNo(hregX86_EAX()); break;
2415         case 1: irno = iregNo(hregX86_EDX()); break;
2416         case 2: irno = iregNo(hregX86_ECX()); break;
2417         case 3: irno = iregNo(hregX86_EDI()); break;
2418         default: vpanic(" emit_X86Instr:call:regparms");
2419      }
2420      /* jump over the following two insns if the condition does not
2421         hold */
2422      if (i->Xin.Call.cond != Xcc_ALWAYS) {
2423         *p++ = toUChar(0x70 + (0xF & (i->Xin.Call.cond ^ 1)));
2424         *p++ = 0x07; /* 7 bytes in the next two insns */
2425      }
2426      /* movl $target, %tmp */
2427      *p++ = toUChar(0xB8 + irno);
2428      p = emit32(p, i->Xin.Call.target);
2429      /* call *%tmp */
2430      *p++ = 0xFF;
2431      *p++ = toUChar(0xD0 + irno);
2432      goto done;
2433
2434   case Xin_XDirect: {
2435      /* NB: what goes on here has to be very closely coordinated with the
2436         chainXDirect_X86 and unchainXDirect_X86 below. */
2437      /* We're generating chain-me requests here, so we need to be
2438         sure this is actually allowed -- no-redir translations can't
2439         use chain-me's.  Hence: */
2440      vassert(disp_cp_chain_me_to_slowEP != NULL);
2441      vassert(disp_cp_chain_me_to_fastEP != NULL);
2442
2443      /* Use ptmp for backpatching conditional jumps. */
2444      ptmp = NULL;
2445
2446      /* First off, if this is conditional, create a conditional
2447         jump over the rest of it. */
2448      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
2449         /* jmp fwds if !condition */
2450         *p++ = toUChar(0x70 + (0xF & (i->Xin.XDirect.cond ^ 1)));
2451         ptmp = p; /* fill in this bit later */
2452         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2453      }
2454
2455      /* Update the guest EIP. */
2456      /* movl $dstGA, amEIP */
2457      *p++ = 0xC7;
2458      p    = doAMode_M(p, fake(0), i->Xin.XDirect.amEIP);
2459      p    = emit32(p, i->Xin.XDirect.dstGA);
2460
2461      /* --- FIRST PATCHABLE BYTE follows --- */
2462      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
2463         to) backs up the return address, so as to find the address of
2464         the first patchable byte.  So: don't change the length of the
2465         two instructions below. */
2466      /* movl $disp_cp_chain_me_to_{slow,fast}EP,%edx; */
2467      *p++ = 0xBA;
2468      void* disp_cp_chain_me
2469               = i->Xin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
2470                                         : disp_cp_chain_me_to_slowEP;
2471      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_chain_me));
2472      /* call *%edx */
2473      *p++ = 0xFF;
2474      *p++ = 0xD2;
2475      /* --- END of PATCHABLE BYTES --- */
2476
2477      /* Fix up the conditional jump, if there was one. */
2478      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
2479         Int delta = p - ptmp;
2480         vassert(delta > 0 && delta < 40);
2481         *ptmp = toUChar(delta-1);
2482      }
2483      goto done;
2484   }
2485
2486   case Xin_XIndir: {
2487      /* We're generating transfers that could lead indirectly to a
2488         chain-me, so we need to be sure this is actually allowed --
2489         no-redir translations are not allowed to reach normal
2490         translations without going through the scheduler.  That means
2491         no XDirects or XIndirs out from no-redir translations.
2492         Hence: */
2493      vassert(disp_cp_xindir != NULL);
2494
2495      /* Use ptmp for backpatching conditional jumps. */
2496      ptmp = NULL;
2497
2498      /* First off, if this is conditional, create a conditional
2499         jump over the rest of it. */
2500      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
2501         /* jmp fwds if !condition */
2502         *p++ = toUChar(0x70 + (0xF & (i->Xin.XIndir.cond ^ 1)));
2503         ptmp = p; /* fill in this bit later */
2504         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2505      }
2506
2507      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
2508      *p++ = 0x89;
2509      p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
2510
2511      /* movl $disp_indir, %edx */
2512      *p++ = 0xBA;
2513      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
2514      /* jmp *%edx */
2515      *p++ = 0xFF;
2516      *p++ = 0xE2;
2517
2518      /* Fix up the conditional jump, if there was one. */
2519      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
2520         Int delta = p - ptmp;
2521         vassert(delta > 0 && delta < 40);
2522         *ptmp = toUChar(delta-1);
2523      }
2524      goto done;
2525   }
2526
2527   case Xin_XAssisted: {
2528      /* Use ptmp for backpatching conditional jumps. */
2529      ptmp = NULL;
2530
2531      /* First off, if this is conditional, create a conditional
2532         jump over the rest of it. */
2533      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
2534         /* jmp fwds if !condition */
2535         *p++ = toUChar(0x70 + (0xF & (i->Xin.XAssisted.cond ^ 1)));
2536         ptmp = p; /* fill in this bit later */
2537         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2538      }
2539
2540      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
2541      *p++ = 0x89;
2542      p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
2543      /* movl $magic_number, %ebp. */
2544      UInt trcval = 0;
2545      switch (i->Xin.XAssisted.jk) {
2546         case Ijk_ClientReq:    trcval = VEX_TRC_JMP_CLIENTREQ;    break;
2547         case Ijk_Sys_syscall:  trcval = VEX_TRC_JMP_SYS_SYSCALL;  break;
2548         case Ijk_Sys_int128:   trcval = VEX_TRC_JMP_SYS_INT128;   break;
2549         case Ijk_Sys_int129:   trcval = VEX_TRC_JMP_SYS_INT129;   break;
2550         case Ijk_Sys_int130:   trcval = VEX_TRC_JMP_SYS_INT130;   break;
2551         case Ijk_Sys_sysenter: trcval = VEX_TRC_JMP_SYS_SYSENTER; break;
2552         case Ijk_Yield:        trcval = VEX_TRC_JMP_YIELD;        break;
2553         case Ijk_EmWarn:       trcval = VEX_TRC_JMP_EMWARN;       break;
2554         case Ijk_MapFail:      trcval = VEX_TRC_JMP_MAPFAIL;      break;
2555         case Ijk_NoDecode:     trcval = VEX_TRC_JMP_NODECODE;     break;
2556         case Ijk_InvalICache:  trcval = VEX_TRC_JMP_INVALICACHE;  break;
2557         case Ijk_NoRedir:      trcval = VEX_TRC_JMP_NOREDIR;      break;
2558         case Ijk_SigTRAP:      trcval = VEX_TRC_JMP_SIGTRAP;      break;
2559         case Ijk_SigSEGV:      trcval = VEX_TRC_JMP_SIGSEGV;      break;
2560         case Ijk_Boring:       trcval = VEX_TRC_JMP_BORING;       break;
2561         /* We don't expect to see the following being assisted. */
2562         case Ijk_Ret:
2563         case Ijk_Call:
2564         /* fallthrough */
2565         default:
2566            ppIRJumpKind(i->Xin.XAssisted.jk);
2567            vpanic("emit_X86Instr.Xin_XAssisted: unexpected jump kind");
2568      }
2569      vassert(trcval != 0);
2570      *p++ = 0xBD;
2571      p = emit32(p, trcval);
2572
2573      /* movl $disp_indir, %edx */
2574      *p++ = 0xBA;
2575      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xassisted));
2576      /* jmp *%edx */
2577      *p++ = 0xFF;
2578      *p++ = 0xE2;
2579
2580      /* Fix up the conditional jump, if there was one. */
2581      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
2582         Int delta = p - ptmp;
2583         vassert(delta > 0 && delta < 40);
2584         *ptmp = toUChar(delta-1);
2585      }
2586      goto done;
2587   }
2588
2589   case Xin_CMov32:
2590      vassert(i->Xin.CMov32.cond != Xcc_ALWAYS);
2591
2592      /* This generates cmov, which is illegal on P54/P55. */
2593      /*
2594      *p++ = 0x0F;
2595      *p++ = toUChar(0x40 + (0xF & i->Xin.CMov32.cond));
2596      if (i->Xin.CMov32.src->tag == Xrm_Reg) {
2597         p = doAMode_R(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Reg.reg);
2598         goto done;
2599      }
2600      if (i->Xin.CMov32.src->tag == Xrm_Mem) {
2601         p = doAMode_M(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Mem.am);
2602         goto done;
2603      }
2604      */
2605
2606      /* Alternative version which works on any x86 variant. */
2607      /* jmp fwds if !condition */
2608      *p++ = toUChar(0x70 + (i->Xin.CMov32.cond ^ 1));
2609      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2610      ptmp = p;
2611
2612      switch (i->Xin.CMov32.src->tag) {
2613         case Xrm_Reg:
2614            /* Big sigh.  This is movl E -> G ... */
2615            *p++ = 0x89;
2616            p = doAMode_R(p, i->Xin.CMov32.src->Xrm.Reg.reg,
2617                             i->Xin.CMov32.dst);
2618
2619            break;
2620         case Xrm_Mem:
2621            /* ... whereas this is movl G -> E.  That's why the args
2622               to doAMode_R appear to be the wrong way round in the
2623               Xrm_Reg case. */
2624            *p++ = 0x8B;
2625            p = doAMode_M(p, i->Xin.CMov32.dst,
2626                             i->Xin.CMov32.src->Xrm.Mem.am);
2627            break;
2628         default:
2629            goto bad;
2630      }
2631      /* Fill in the jump offset. */
2632      *(ptmp-1) = toUChar(p - ptmp);
2633      goto done;
2634
2635      break;
2636
2637   case Xin_LoadEX:
2638      if (i->Xin.LoadEX.szSmall == 1 && !i->Xin.LoadEX.syned) {
2639         /* movzbl */
2640         *p++ = 0x0F;
2641         *p++ = 0xB6;
2642         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2643         goto done;
2644      }
2645      if (i->Xin.LoadEX.szSmall == 2 && !i->Xin.LoadEX.syned) {
2646         /* movzwl */
2647         *p++ = 0x0F;
2648         *p++ = 0xB7;
2649         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2650         goto done;
2651      }
2652      if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
2653         /* movsbl */
2654         *p++ = 0x0F;
2655         *p++ = 0xBE;
2656         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2657         goto done;
2658      }
2659      break;
2660
2661   case Xin_Set32:
2662      /* Make the destination register be 1 or 0, depending on whether
2663         the relevant condition holds.  We have to dodge and weave
2664         when the destination is %esi or %edi as we cannot directly
2665         emit the native 'setb %reg' for those.  Further complication:
2666         the top 24 bits of the destination should be forced to zero,
2667         but doing 'xor %r,%r' kills the flag(s) we are about to read.
2668         Sigh.  So start off my moving $0 into the dest. */
2669
2670      /* Do we need to swap in %eax? */
2671      if (iregNo(i->Xin.Set32.dst) >= 4) {
2672         /* xchg %eax, %dst */
2673         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2674         /* movl $0, %eax */
2675         *p++ =toUChar(0xB8 + iregNo(hregX86_EAX()));
2676         p = emit32(p, 0);
2677         /* setb lo8(%eax) */
2678         *p++ = 0x0F;
2679         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2680         p = doAMode_R(p, fake(0), hregX86_EAX());
2681         /* xchg %eax, %dst */
2682         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2683      } else {
2684         /* movl $0, %dst */
2685         *p++ = toUChar(0xB8 + iregNo(i->Xin.Set32.dst));
2686         p = emit32(p, 0);
2687         /* setb lo8(%dst) */
2688         *p++ = 0x0F;
2689         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2690         p = doAMode_R(p, fake(0), i->Xin.Set32.dst);
2691      }
2692      goto done;
2693
2694   case Xin_Bsfr32:
2695      *p++ = 0x0F;
2696      if (i->Xin.Bsfr32.isFwds) {
2697         *p++ = 0xBC;
2698      } else {
2699         *p++ = 0xBD;
2700      }
2701      p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
2702      goto done;
2703
2704   case Xin_MFence:
2705      /* see comment in hdefs.h re this insn */
2706      if (0) vex_printf("EMIT FENCE\n");
2707      if (i->Xin.MFence.hwcaps & (VEX_HWCAPS_X86_SSE3
2708                                  |VEX_HWCAPS_X86_SSE2)) {
2709         /* mfence */
2710         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
2711         goto done;
2712      }
2713      if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_MMXEXT) {
2714         /* sfence */
2715         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
2716         /* lock addl $0,0(%esp) */
2717         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2718         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2719         goto done;
2720      }
2721      if (i->Xin.MFence.hwcaps == 0/*baseline, no SSE*/) {
2722         /* lock addl $0,0(%esp) */
2723         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2724         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2725         goto done;
2726      }
2727      vpanic("emit_X86Instr:mfence:hwcaps");
2728      /*NOTREACHED*/
2729      break;
2730
2731   case Xin_ACAS:
2732      /* lock */
2733      *p++ = 0xF0;
2734      /* cmpxchg{b,w,l} %ebx,mem.  Expected-value in %eax, new value
2735         in %ebx.  The new-value register is hardwired to be %ebx
2736         since letting it be any integer register gives the problem
2737         that %sil and %dil are unaddressible on x86 and hence we
2738         would have to resort to the same kind of trickery as with
2739         byte-sized Xin.Store, just below.  Given that this isn't
2740         performance critical, it is simpler just to force the
2741         register operand to %ebx (could equally be %ecx or %edx).
2742         (Although %ebx is more consistent with cmpxchg8b.) */
2743      if (i->Xin.ACAS.sz == 2) *p++ = 0x66;
2744      *p++ = 0x0F;
2745      if (i->Xin.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
2746      p = doAMode_M(p, hregX86_EBX(), i->Xin.ACAS.addr);
2747      goto done;
2748
2749   case Xin_DACAS:
2750      /* lock */
2751      *p++ = 0xF0;
2752      /* cmpxchg8b m64.  Expected-value in %edx:%eax, new value
2753         in %ecx:%ebx.  All 4 regs are hardwired in the ISA, so
2754         aren't encoded in the insn. */
2755      *p++ = 0x0F;
2756      *p++ = 0xC7;
2757      p = doAMode_M(p, fake(1), i->Xin.DACAS.addr);
2758      goto done;
2759
2760   case Xin_Store:
2761      if (i->Xin.Store.sz == 2) {
2762         /* This case, at least, is simple, given that we can
2763            reference the low 16 bits of any integer register. */
2764         *p++ = 0x66;
2765         *p++ = 0x89;
2766         p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2767         goto done;
2768      }
2769
2770      if (i->Xin.Store.sz == 1) {
2771         /* We have to do complex dodging and weaving if src is not
2772            the low 8 bits of %eax/%ebx/%ecx/%edx. */
2773         if (iregNo(i->Xin.Store.src) < 4) {
2774            /* we're OK, can do it directly */
2775            *p++ = 0x88;
2776            p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2777           goto done;
2778         } else {
2779            /* Bleh.  This means the source is %edi or %esi.  Since
2780               the address mode can only mention three registers, at
2781               least one of %eax/%ebx/%ecx/%edx must be available to
2782               temporarily swap the source into, so the store can
2783               happen.  So we have to look at the regs mentioned
2784               in the amode. */
2785            HReg swap = INVALID_HREG;
2786            HReg  eax = hregX86_EAX(), ebx = hregX86_EBX(),
2787                  ecx = hregX86_ECX(), edx = hregX86_EDX();
2788            Bool a_ok = True, b_ok = True, c_ok = True, d_ok = True;
2789            HRegUsage u;
2790            Int j;
2791            initHRegUsage(&u);
2792            addRegUsage_X86AMode(&u,  i->Xin.Store.dst);
2793            for (j = 0; j < u.n_used; j++) {
2794               HReg r = u.hreg[j];
2795               if (sameHReg(r, eax)) a_ok = False;
2796               if (sameHReg(r, ebx)) b_ok = False;
2797               if (sameHReg(r, ecx)) c_ok = False;
2798               if (sameHReg(r, edx)) d_ok = False;
2799            }
2800            if (a_ok) swap = eax;
2801            if (b_ok) swap = ebx;
2802            if (c_ok) swap = ecx;
2803            if (d_ok) swap = edx;
2804            vassert(! hregIsInvalid(swap));
2805            /* xchgl %source, %swap. Could do better if swap is %eax. */
2806            *p++ = 0x87;
2807            p = doAMode_R(p, i->Xin.Store.src, swap);
2808            /* movb lo8{%swap}, (dst) */
2809            *p++ = 0x88;
2810            p = doAMode_M(p, swap, i->Xin.Store.dst);
2811            /* xchgl %source, %swap. Could do better if swap is %eax. */
2812            *p++ = 0x87;
2813            p = doAMode_R(p, i->Xin.Store.src, swap);
2814            goto done;
2815         }
2816      } /* if (i->Xin.Store.sz == 1) */
2817      break;
2818
2819   case Xin_FpUnary:
2820      /* gop %src, %dst
2821         --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
2822      */
2823      p = do_ffree_st7(p);
2824      p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
2825      p = do_fop1_st(p, i->Xin.FpUnary.op);
2826      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
2827      goto done;
2828
2829   case Xin_FpBinary:
2830      if (i->Xin.FpBinary.op == Xfp_YL2X
2831          || i->Xin.FpBinary.op == Xfp_YL2XP1) {
2832         /* Have to do this specially. */
2833         /* ffree %st7 ; fld %st(srcL) ;
2834            ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
2835         p = do_ffree_st7(p);
2836         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2837         p = do_ffree_st7(p);
2838         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2839         *p++ = 0xD9;
2840         *p++ = toUChar(i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9);
2841         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2842         goto done;
2843      }
2844      if (i->Xin.FpBinary.op == Xfp_ATAN) {
2845         /* Have to do this specially. */
2846         /* ffree %st7 ; fld %st(srcL) ;
2847            ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
2848         p = do_ffree_st7(p);
2849         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2850         p = do_ffree_st7(p);
2851         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2852         *p++ = 0xD9; *p++ = 0xF3;
2853         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2854         goto done;
2855      }
2856      if (i->Xin.FpBinary.op == Xfp_PREM
2857          || i->Xin.FpBinary.op == Xfp_PREM1
2858          || i->Xin.FpBinary.op == Xfp_SCALE) {
2859         /* Have to do this specially. */
2860         /* ffree %st7 ; fld %st(srcR) ;
2861            ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
2862            fincstp ; ffree %st7 */
2863         p = do_ffree_st7(p);
2864         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
2865         p = do_ffree_st7(p);
2866         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
2867         *p++ = 0xD9;
2868         switch (i->Xin.FpBinary.op) {
2869            case Xfp_PREM: *p++ = 0xF8; break;
2870            case Xfp_PREM1: *p++ = 0xF5; break;
2871            case Xfp_SCALE: *p++ =  0xFD; break;
2872            default: vpanic("emitX86Instr(FpBinary,PREM/PREM1/SCALE)");
2873         }
2874         p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
2875         *p++ = 0xD9; *p++ = 0xF7;
2876         p = do_ffree_st7(p);
2877         goto done;
2878      }
2879      /* General case */
2880      /* gop %srcL, %srcR, %dst
2881         --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
2882      */
2883      p = do_ffree_st7(p);
2884      p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2885      p = do_fop2_st(p, i->Xin.FpBinary.op,
2886                        1+hregNumber(i->Xin.FpBinary.srcR));
2887      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2888      goto done;
2889
2890   case Xin_FpLdSt:
2891      if (i->Xin.FpLdSt.isLoad) {
2892         /* Load from memory into %fakeN.
2893            --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
2894         */
2895         p = do_ffree_st7(p);
2896         switch (i->Xin.FpLdSt.sz) {
2897            case 4:
2898               *p++ = 0xD9;
2899               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2900               break;
2901            case 8:
2902               *p++ = 0xDD;
2903               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2904               break;
2905            case 10:
2906               *p++ = 0xDB;
2907               p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
2908               break;
2909            default:
2910               vpanic("emitX86Instr(FpLdSt,load)");
2911         }
2912         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
2913         goto done;
2914      } else {
2915         /* Store from %fakeN into memory.
2916            --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
2917	 */
2918         p = do_ffree_st7(p);
2919         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
2920         switch (i->Xin.FpLdSt.sz) {
2921            case 4:
2922               *p++ = 0xD9;
2923               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2924               break;
2925            case 8:
2926               *p++ = 0xDD;
2927               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2928               break;
2929            case 10:
2930               *p++ = 0xDB;
2931               p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
2932               break;
2933            default:
2934               vpanic("emitX86Instr(FpLdSt,store)");
2935         }
2936         goto done;
2937      }
2938      break;
2939
2940   case Xin_FpLdStI:
2941      if (i->Xin.FpLdStI.isLoad) {
2942         /* Load from memory into %fakeN, converting from an int.
2943            --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
2944         */
2945         switch (i->Xin.FpLdStI.sz) {
2946            case 8:  opc = 0xDF; subopc_imm = 5; break;
2947            case 4:  opc = 0xDB; subopc_imm = 0; break;
2948            case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
2949            default: vpanic("emitX86Instr(Xin_FpLdStI-load)");
2950         }
2951         p = do_ffree_st7(p);
2952         *p++ = toUChar(opc);
2953         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2954         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
2955         goto done;
2956      } else {
2957         /* Store from %fakeN into memory, converting to an int.
2958            --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
2959	 */
2960         switch (i->Xin.FpLdStI.sz) {
2961            case 8:  opc = 0xDF; subopc_imm = 7; break;
2962            case 4:  opc = 0xDB; subopc_imm = 3; break;
2963            case 2:  opc = 0xDF; subopc_imm = 3; break;
2964            default: vpanic("emitX86Instr(Xin_FpLdStI-store)");
2965         }
2966         p = do_ffree_st7(p);
2967         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
2968         *p++ = toUChar(opc);
2969         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2970         goto done;
2971      }
2972      break;
2973
2974   case Xin_Fp64to32:
2975      /* ffree %st7 ; fld %st(src) */
2976      p = do_ffree_st7(p);
2977      p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
2978      /* subl $4, %esp */
2979      *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
2980      /* fstps (%esp) */
2981      *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
2982      /* flds (%esp) */
2983      *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
2984      /* addl $4, %esp */
2985      *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
2986      /* fstp %st(1+dst) */
2987      p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
2988      goto done;
2989
2990   case Xin_FpCMov:
2991      /* jmp fwds if !condition */
2992      *p++ = toUChar(0x70 + (i->Xin.FpCMov.cond ^ 1));
2993      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2994      ptmp = p;
2995
2996      /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
2997      p = do_ffree_st7(p);
2998      p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
2999      p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
3000
3001      /* Fill in the jump offset. */
3002      *(ptmp-1) = toUChar(p - ptmp);
3003      goto done;
3004
3005   case Xin_FpLdCW:
3006      *p++ = 0xD9;
3007      p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdCW.addr);
3008      goto done;
3009
3010   case Xin_FpStSW_AX:
3011      /* note, this emits fnstsw %ax, not fstsw %ax */
3012      *p++ = 0xDF;
3013      *p++ = 0xE0;
3014      goto done;
3015
3016   case Xin_FpCmp:
3017      /* gcmp %fL, %fR, %dst
3018         -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
3019            fnstsw %ax ; movl %eax, %dst
3020      */
3021      /* ffree %st7 */
3022      p = do_ffree_st7(p);
3023      /* fpush %fL */
3024      p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
3025      /* fucomp %(fR+1) */
3026      *p++ = 0xDD;
3027      *p++ = toUChar(0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR))));
3028      /* fnstsw %ax */
3029      *p++ = 0xDF;
3030      *p++ = 0xE0;
3031      /*  movl %eax, %dst */
3032      *p++ = 0x89;
3033      p = doAMode_R(p, hregX86_EAX(), i->Xin.FpCmp.dst);
3034      goto done;
3035
3036   case Xin_SseConst: {
3037      UShort con = i->Xin.SseConst.con;
3038      p = push_word_from_tags(p, toUShort((con >> 12) & 0xF));
3039      p = push_word_from_tags(p, toUShort((con >> 8) & 0xF));
3040      p = push_word_from_tags(p, toUShort((con >> 4) & 0xF));
3041      p = push_word_from_tags(p, toUShort(con & 0xF));
3042      /* movl (%esp), %xmm-dst */
3043      *p++ = 0x0F;
3044      *p++ = 0x10;
3045      *p++ = toUChar(0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst)));
3046      *p++ = 0x24;
3047      /* addl $16, %esp */
3048      *p++ = 0x83;
3049      *p++ = 0xC4;
3050      *p++ = 0x10;
3051      goto done;
3052   }
3053
3054   case Xin_SseLdSt:
3055      *p++ = 0x0F;
3056      *p++ = toUChar(i->Xin.SseLdSt.isLoad ? 0x10 : 0x11);
3057      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdSt.reg)), i->Xin.SseLdSt.addr);
3058      goto done;
3059
3060   case Xin_SseLdzLO:
3061      vassert(i->Xin.SseLdzLO.sz == 4 || i->Xin.SseLdzLO.sz == 8);
3062      /* movs[sd] amode, %xmm-dst */
3063      *p++ = toUChar(i->Xin.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3064      *p++ = 0x0F;
3065      *p++ = 0x10;
3066      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdzLO.reg)),
3067                       i->Xin.SseLdzLO.addr);
3068      goto done;
3069
3070   case Xin_Sse32Fx4:
3071      xtra = 0;
3072      *p++ = 0x0F;
3073      switch (i->Xin.Sse32Fx4.op) {
3074         case Xsse_ADDF:   *p++ = 0x58; break;
3075         case Xsse_DIVF:   *p++ = 0x5E; break;
3076         case Xsse_MAXF:   *p++ = 0x5F; break;
3077         case Xsse_MINF:   *p++ = 0x5D; break;
3078         case Xsse_MULF:   *p++ = 0x59; break;
3079         case Xsse_RCPF:   *p++ = 0x53; break;
3080         case Xsse_RSQRTF: *p++ = 0x52; break;
3081         case Xsse_SQRTF:  *p++ = 0x51; break;
3082         case Xsse_SUBF:   *p++ = 0x5C; break;
3083         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3084         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3085         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3086         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3087         default: goto bad;
3088      }
3089      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32Fx4.dst)),
3090                       fake(vregNo(i->Xin.Sse32Fx4.src)) );
3091      if (xtra & 0x100)
3092         *p++ = toUChar(xtra & 0xFF);
3093      goto done;
3094
3095   case Xin_Sse64Fx2:
3096      xtra = 0;
3097      *p++ = 0x66;
3098      *p++ = 0x0F;
3099      switch (i->Xin.Sse64Fx2.op) {
3100         case Xsse_ADDF:   *p++ = 0x58; break;
3101         case Xsse_DIVF:   *p++ = 0x5E; break;
3102         case Xsse_MAXF:   *p++ = 0x5F; break;
3103         case Xsse_MINF:   *p++ = 0x5D; break;
3104         case Xsse_MULF:   *p++ = 0x59; break;
3105         case Xsse_RCPF:   *p++ = 0x53; break;
3106         case Xsse_RSQRTF: *p++ = 0x52; break;
3107         case Xsse_SQRTF:  *p++ = 0x51; break;
3108         case Xsse_SUBF:   *p++ = 0x5C; break;
3109         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3110         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3111         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3112         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3113         default: goto bad;
3114      }
3115      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64Fx2.dst)),
3116                       fake(vregNo(i->Xin.Sse64Fx2.src)) );
3117      if (xtra & 0x100)
3118         *p++ = toUChar(xtra & 0xFF);
3119      goto done;
3120
3121   case Xin_Sse32FLo:
3122      xtra = 0;
3123      *p++ = 0xF3;
3124      *p++ = 0x0F;
3125      switch (i->Xin.Sse32FLo.op) {
3126         case Xsse_ADDF:   *p++ = 0x58; break;
3127         case Xsse_DIVF:   *p++ = 0x5E; break;
3128         case Xsse_MAXF:   *p++ = 0x5F; break;
3129         case Xsse_MINF:   *p++ = 0x5D; break;
3130         case Xsse_MULF:   *p++ = 0x59; break;
3131         case Xsse_RCPF:   *p++ = 0x53; break;
3132         case Xsse_RSQRTF: *p++ = 0x52; break;
3133         case Xsse_SQRTF:  *p++ = 0x51; break;
3134         case Xsse_SUBF:   *p++ = 0x5C; break;
3135         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3136         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3137         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3138         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3139         default: goto bad;
3140      }
3141      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32FLo.dst)),
3142                       fake(vregNo(i->Xin.Sse32FLo.src)) );
3143      if (xtra & 0x100)
3144         *p++ = toUChar(xtra & 0xFF);
3145      goto done;
3146
3147   case Xin_Sse64FLo:
3148      xtra = 0;
3149      *p++ = 0xF2;
3150      *p++ = 0x0F;
3151      switch (i->Xin.Sse64FLo.op) {
3152         case Xsse_ADDF:   *p++ = 0x58; break;
3153         case Xsse_DIVF:   *p++ = 0x5E; break;
3154         case Xsse_MAXF:   *p++ = 0x5F; break;
3155         case Xsse_MINF:   *p++ = 0x5D; break;
3156         case Xsse_MULF:   *p++ = 0x59; break;
3157         case Xsse_RCPF:   *p++ = 0x53; break;
3158         case Xsse_RSQRTF: *p++ = 0x52; break;
3159         case Xsse_SQRTF:  *p++ = 0x51; break;
3160         case Xsse_SUBF:   *p++ = 0x5C; break;
3161         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3162         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3163         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3164         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3165         default: goto bad;
3166      }
3167      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64FLo.dst)),
3168                       fake(vregNo(i->Xin.Sse64FLo.src)) );
3169      if (xtra & 0x100)
3170         *p++ = toUChar(xtra & 0xFF);
3171      goto done;
3172
3173   case Xin_SseReRg:
3174#     define XX(_n) *p++ = (_n)
3175      switch (i->Xin.SseReRg.op) {
3176         case Xsse_MOV:     /*movups*/ XX(0x0F); XX(0x10); break;
3177         case Xsse_OR:                 XX(0x0F); XX(0x56); break;
3178         case Xsse_XOR:                XX(0x0F); XX(0x57); break;
3179         case Xsse_AND:                XX(0x0F); XX(0x54); break;
3180         case Xsse_PACKSSD:  XX(0x66); XX(0x0F); XX(0x6B); break;
3181         case Xsse_PACKSSW:  XX(0x66); XX(0x0F); XX(0x63); break;
3182         case Xsse_PACKUSW:  XX(0x66); XX(0x0F); XX(0x67); break;
3183         case Xsse_ADD8:     XX(0x66); XX(0x0F); XX(0xFC); break;
3184         case Xsse_ADD16:    XX(0x66); XX(0x0F); XX(0xFD); break;
3185         case Xsse_ADD32:    XX(0x66); XX(0x0F); XX(0xFE); break;
3186         case Xsse_ADD64:    XX(0x66); XX(0x0F); XX(0xD4); break;
3187         case Xsse_QADD8S:   XX(0x66); XX(0x0F); XX(0xEC); break;
3188         case Xsse_QADD16S:  XX(0x66); XX(0x0F); XX(0xED); break;
3189         case Xsse_QADD8U:   XX(0x66); XX(0x0F); XX(0xDC); break;
3190         case Xsse_QADD16U:  XX(0x66); XX(0x0F); XX(0xDD); break;
3191         case Xsse_AVG8U:    XX(0x66); XX(0x0F); XX(0xE0); break;
3192         case Xsse_AVG16U:   XX(0x66); XX(0x0F); XX(0xE3); break;
3193         case Xsse_CMPEQ8:   XX(0x66); XX(0x0F); XX(0x74); break;
3194         case Xsse_CMPEQ16:  XX(0x66); XX(0x0F); XX(0x75); break;
3195         case Xsse_CMPEQ32:  XX(0x66); XX(0x0F); XX(0x76); break;
3196         case Xsse_CMPGT8S:  XX(0x66); XX(0x0F); XX(0x64); break;
3197         case Xsse_CMPGT16S: XX(0x66); XX(0x0F); XX(0x65); break;
3198         case Xsse_CMPGT32S: XX(0x66); XX(0x0F); XX(0x66); break;
3199         case Xsse_MAX16S:   XX(0x66); XX(0x0F); XX(0xEE); break;
3200         case Xsse_MAX8U:    XX(0x66); XX(0x0F); XX(0xDE); break;
3201         case Xsse_MIN16S:   XX(0x66); XX(0x0F); XX(0xEA); break;
3202         case Xsse_MIN8U:    XX(0x66); XX(0x0F); XX(0xDA); break;
3203         case Xsse_MULHI16U: XX(0x66); XX(0x0F); XX(0xE4); break;
3204         case Xsse_MULHI16S: XX(0x66); XX(0x0F); XX(0xE5); break;
3205         case Xsse_MUL16:    XX(0x66); XX(0x0F); XX(0xD5); break;
3206         case Xsse_SHL16:    XX(0x66); XX(0x0F); XX(0xF1); break;
3207         case Xsse_SHL32:    XX(0x66); XX(0x0F); XX(0xF2); break;
3208         case Xsse_SHL64:    XX(0x66); XX(0x0F); XX(0xF3); break;
3209         case Xsse_SAR16:    XX(0x66); XX(0x0F); XX(0xE1); break;
3210         case Xsse_SAR32:    XX(0x66); XX(0x0F); XX(0xE2); break;
3211         case Xsse_SHR16:    XX(0x66); XX(0x0F); XX(0xD1); break;
3212         case Xsse_SHR32:    XX(0x66); XX(0x0F); XX(0xD2); break;
3213         case Xsse_SHR64:    XX(0x66); XX(0x0F); XX(0xD3); break;
3214         case Xsse_SUB8:     XX(0x66); XX(0x0F); XX(0xF8); break;
3215         case Xsse_SUB16:    XX(0x66); XX(0x0F); XX(0xF9); break;
3216         case Xsse_SUB32:    XX(0x66); XX(0x0F); XX(0xFA); break;
3217         case Xsse_SUB64:    XX(0x66); XX(0x0F); XX(0xFB); break;
3218         case Xsse_QSUB8S:   XX(0x66); XX(0x0F); XX(0xE8); break;
3219         case Xsse_QSUB16S:  XX(0x66); XX(0x0F); XX(0xE9); break;
3220         case Xsse_QSUB8U:   XX(0x66); XX(0x0F); XX(0xD8); break;
3221         case Xsse_QSUB16U:  XX(0x66); XX(0x0F); XX(0xD9); break;
3222         case Xsse_UNPCKHB:  XX(0x66); XX(0x0F); XX(0x68); break;
3223         case Xsse_UNPCKHW:  XX(0x66); XX(0x0F); XX(0x69); break;
3224         case Xsse_UNPCKHD:  XX(0x66); XX(0x0F); XX(0x6A); break;
3225         case Xsse_UNPCKHQ:  XX(0x66); XX(0x0F); XX(0x6D); break;
3226         case Xsse_UNPCKLB:  XX(0x66); XX(0x0F); XX(0x60); break;
3227         case Xsse_UNPCKLW:  XX(0x66); XX(0x0F); XX(0x61); break;
3228         case Xsse_UNPCKLD:  XX(0x66); XX(0x0F); XX(0x62); break;
3229         case Xsse_UNPCKLQ:  XX(0x66); XX(0x0F); XX(0x6C); break;
3230         default: goto bad;
3231      }
3232      p = doAMode_R(p, fake(vregNo(i->Xin.SseReRg.dst)),
3233                       fake(vregNo(i->Xin.SseReRg.src)) );
3234#     undef XX
3235      goto done;
3236
3237   case Xin_SseCMov:
3238      /* jmp fwds if !condition */
3239      *p++ = toUChar(0x70 + (i->Xin.SseCMov.cond ^ 1));
3240      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3241      ptmp = p;
3242
3243      /* movaps %src, %dst */
3244      *p++ = 0x0F;
3245      *p++ = 0x28;
3246      p = doAMode_R(p, fake(vregNo(i->Xin.SseCMov.dst)),
3247                       fake(vregNo(i->Xin.SseCMov.src)) );
3248
3249      /* Fill in the jump offset. */
3250      *(ptmp-1) = toUChar(p - ptmp);
3251      goto done;
3252
3253   case Xin_SseShuf:
3254      *p++ = 0x66;
3255      *p++ = 0x0F;
3256      *p++ = 0x70;
3257      p = doAMode_R(p, fake(vregNo(i->Xin.SseShuf.dst)),
3258                       fake(vregNo(i->Xin.SseShuf.src)) );
3259      *p++ = (UChar)(i->Xin.SseShuf.order);
3260      goto done;
3261
3262   case Xin_EvCheck: {
3263      /* We generate:
3264            (3 bytes)  decl 4(%ebp)    4 == offsetof(host_EvC_COUNTER)
3265            (2 bytes)  jns  nofail     expected taken
3266            (3 bytes)  jmp* 0(%ebp)    0 == offsetof(host_EvC_FAILADDR)
3267            nofail:
3268      */
3269      /* This is heavily asserted re instruction lengths.  It needs to
3270         be.  If we get given unexpected forms of .amCounter or
3271         .amFailAddr -- basically, anything that's not of the form
3272         uimm7(%ebp) -- they are likely to fail. */
3273      /* Note also that after the decl we must be very careful not to
3274         read the carry flag, else we get a partial flags stall.
3275         js/jns avoids that, though. */
3276      UChar* p0 = p;
3277      /* ---  decl 8(%ebp) --- */
3278      /* "fake(1)" because + there's no register in this encoding;
3279         instead the register + field is used as a sub opcode.  The
3280         encoding for "decl r/m32" + is FF /1, hence the fake(1). */
3281      *p++ = 0xFF;
3282      p = doAMode_M(p, fake(1), i->Xin.EvCheck.amCounter);
3283      vassert(p - p0 == 3);
3284      /* --- jns nofail --- */
3285      *p++ = 0x79;
3286      *p++ = 0x03; /* need to check this 0x03 after the next insn */
3287      vassert(p - p0 == 5);
3288      /* --- jmp* 0(%ebp) --- */
3289      /* The encoding is FF /4. */
3290      *p++ = 0xFF;
3291      p = doAMode_M(p, fake(4), i->Xin.EvCheck.amFailAddr);
3292      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3293      /* And crosscheck .. */
3294      vassert(evCheckSzB_X86() == 8);
3295      goto done;
3296   }
3297
3298   case Xin_ProfInc: {
3299      /* We generate   addl $1,NotKnownYet
3300                       adcl $0,NotKnownYet+4
3301         in the expectation that a later call to LibVEX_patchProfCtr
3302         will be used to fill in the immediate fields once the right
3303         value is known.
3304           83 05  00 00 00 00  01
3305           83 15  00 00 00 00  00
3306      */
3307      *p++ = 0x83; *p++ = 0x05;
3308      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3309      *p++ = 0x01;
3310      *p++ = 0x83; *p++ = 0x15;
3311      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3312      *p++ = 0x00;
3313      /* Tell the caller .. */
3314      vassert(!(*is_profInc));
3315      *is_profInc = True;
3316      goto done;
3317   }
3318
3319   default:
3320      goto bad;
3321   }
3322
3323  bad:
3324   ppX86Instr(i, mode64);
3325   vpanic("emit_X86Instr");
3326   /*NOTREACHED*/
3327
3328  done:
3329   vassert(p - &buf[0] <= 32);
3330   return p - &buf[0];
3331
3332#  undef fake
3333}
3334
3335
3336/* How big is an event check?  See case for Xin_EvCheck in
3337   emit_X86Instr just above.  That crosschecks what this returns, so
3338   we can tell if we're inconsistent. */
3339Int evCheckSzB_X86 ( void )
3340{
3341   return 8;
3342}
3343
3344
3345/* NB: what goes on here has to be very closely coordinated with the
3346   emitInstr case for XDirect, above. */
3347VexInvalRange chainXDirect_X86 ( void* place_to_chain,
3348                                 void* disp_cp_chain_me_EXPECTED,
3349                                 void* place_to_jump_to )
3350{
3351   /* What we're expecting to see is:
3352        movl $disp_cp_chain_me_EXPECTED, %edx
3353        call *%edx
3354      viz
3355        BA <4 bytes value == disp_cp_chain_me_EXPECTED>
3356        FF D2
3357   */
3358   UChar* p = (UChar*)place_to_chain;
3359   vassert(p[0] == 0xBA);
3360   vassert(*(UInt*)(&p[1]) == (UInt)Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
3361   vassert(p[5] == 0xFF);
3362   vassert(p[6] == 0xD2);
3363   /* And what we want to change it to is:
3364          jmp disp32   where disp32 is relative to the next insn
3365          ud2;
3366        viz
3367          E9 <4 bytes == disp32>
3368          0F 0B
3369      The replacement has the same length as the original.
3370   */
3371   /* This is the delta we need to put into a JMP d32 insn.  It's
3372      relative to the start of the next insn, hence the -5.  */
3373   Long delta = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
3374
3375   /* And make the modifications. */
3376   p[0] = 0xE9;
3377   p[1] = (delta >> 0) & 0xFF;
3378   p[2] = (delta >> 8) & 0xFF;
3379   p[3] = (delta >> 16) & 0xFF;
3380   p[4] = (delta >> 24) & 0xFF;
3381   p[5] = 0x0F; p[6]  = 0x0B;
3382   /* sanity check on the delta -- top 32 are all 0 or all 1 */
3383   delta >>= 32;
3384   vassert(delta == 0LL || delta == -1LL);
3385   VexInvalRange vir = { (HWord)place_to_chain, 7 };
3386   return vir;
3387}
3388
3389
3390/* NB: what goes on here has to be very closely coordinated with the
3391   emitInstr case for XDirect, above. */
3392VexInvalRange unchainXDirect_X86 ( void* place_to_unchain,
3393                                   void* place_to_jump_to_EXPECTED,
3394                                   void* disp_cp_chain_me )
3395{
3396   /* What we're expecting to see is:
3397          jmp d32
3398          ud2;
3399       viz
3400          E9 <4 bytes == disp32>
3401          0F 0B
3402   */
3403   UChar* p     = (UChar*)place_to_unchain;
3404   Bool   valid = False;
3405   if (p[0] == 0xE9
3406       && p[5]  == 0x0F && p[6]  == 0x0B) {
3407      /* Check the offset is right. */
3408      Int s32 = *(Int*)(&p[1]);
3409      if ((UChar*)p + 5 + s32 == (UChar*)place_to_jump_to_EXPECTED) {
3410         valid = True;
3411         if (0)
3412            vex_printf("QQQ unchainXDirect_X86: found valid\n");
3413      }
3414   }
3415   vassert(valid);
3416   /* And what we want to change it to is:
3417         movl $disp_cp_chain_me, %edx
3418         call *%edx
3419      viz
3420         BA <4 bytes value == disp_cp_chain_me_EXPECTED>
3421         FF D2
3422      So it's the same length (convenient, huh).
3423   */
3424   p[0] = 0xBA;
3425   *(UInt*)(&p[1]) = (UInt)Ptr_to_ULong(disp_cp_chain_me);
3426   p[5] = 0xFF;
3427   p[6] = 0xD2;
3428   VexInvalRange vir = { (HWord)place_to_unchain, 7 };
3429   return vir;
3430}
3431
3432
3433/* Patch the counter address into a profile inc point, as previously
3434   created by the Xin_ProfInc case for emit_X86Instr. */
3435VexInvalRange patchProfInc_X86 ( void*  place_to_patch,
3436                                 ULong* location_of_counter )
3437{
3438   vassert(sizeof(ULong*) == 4);
3439   UChar* p = (UChar*)place_to_patch;
3440   vassert(p[0] == 0x83);
3441   vassert(p[1] == 0x05);
3442   vassert(p[2] == 0x00);
3443   vassert(p[3] == 0x00);
3444   vassert(p[4] == 0x00);
3445   vassert(p[5] == 0x00);
3446   vassert(p[6] == 0x01);
3447   vassert(p[7] == 0x83);
3448   vassert(p[8] == 0x15);
3449   vassert(p[9] == 0x00);
3450   vassert(p[10] == 0x00);
3451   vassert(p[11] == 0x00);
3452   vassert(p[12] == 0x00);
3453   vassert(p[13] == 0x00);
3454   UInt imm32 = (UInt)Ptr_to_ULong(location_of_counter);
3455   p[2] = imm32 & 0xFF; imm32 >>= 8;
3456   p[3] = imm32 & 0xFF; imm32 >>= 8;
3457   p[4] = imm32 & 0xFF; imm32 >>= 8;
3458   p[5] = imm32 & 0xFF; imm32 >>= 8;
3459   imm32 = 4 + (UInt)Ptr_to_ULong(location_of_counter);
3460   p[9]  = imm32 & 0xFF; imm32 >>= 8;
3461   p[10] = imm32 & 0xFF; imm32 >>= 8;
3462   p[11] = imm32 & 0xFF; imm32 >>= 8;
3463   p[12] = imm32 & 0xFF; imm32 >>= 8;
3464   VexInvalRange vir = { (HWord)place_to_patch, 14 };
3465   return vir;
3466}
3467
3468
3469/*---------------------------------------------------------------*/
3470/*--- end                                     host_x86_defs.c ---*/
3471/*---------------------------------------------------------------*/
3472