1
2/*---------------------------------------------------------------*/
3/*--- begin                                   host_x86_defs.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2012 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex.h"
38#include "libvex_trc_values.h"
39
40#include "main_util.h"
41#include "host_generic_regs.h"
42#include "host_x86_defs.h"
43
44
45/* --------- Registers. --------- */
46
47void ppHRegX86 ( HReg reg )
48{
49   Int r;
50   static HChar* ireg32_names[8]
51     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" };
52   /* Be generic for all virtual regs. */
53   if (hregIsVirtual(reg)) {
54      ppHReg(reg);
55      return;
56   }
57   /* But specific for real regs. */
58   switch (hregClass(reg)) {
59      case HRcInt32:
60         r = hregNumber(reg);
61         vassert(r >= 0 && r < 8);
62         vex_printf("%s", ireg32_names[r]);
63         return;
64      case HRcFlt64:
65         r = hregNumber(reg);
66         vassert(r >= 0 && r < 6);
67         vex_printf("%%fake%d", r);
68         return;
69      case HRcVec128:
70         r = hregNumber(reg);
71         vassert(r >= 0 && r < 8);
72         vex_printf("%%xmm%d", r);
73         return;
74      default:
75         vpanic("ppHRegX86");
76   }
77}
78
79HReg hregX86_EAX ( void ) { return mkHReg(0, HRcInt32, False); }
80HReg hregX86_ECX ( void ) { return mkHReg(1, HRcInt32, False); }
81HReg hregX86_EDX ( void ) { return mkHReg(2, HRcInt32, False); }
82HReg hregX86_EBX ( void ) { return mkHReg(3, HRcInt32, False); }
83HReg hregX86_ESP ( void ) { return mkHReg(4, HRcInt32, False); }
84HReg hregX86_EBP ( void ) { return mkHReg(5, HRcInt32, False); }
85HReg hregX86_ESI ( void ) { return mkHReg(6, HRcInt32, False); }
86HReg hregX86_EDI ( void ) { return mkHReg(7, HRcInt32, False); }
87
88HReg hregX86_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
89HReg hregX86_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
90HReg hregX86_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
91HReg hregX86_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
92HReg hregX86_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
93HReg hregX86_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
94
95HReg hregX86_XMM0 ( void ) { return mkHReg(0, HRcVec128, False); }
96HReg hregX86_XMM1 ( void ) { return mkHReg(1, HRcVec128, False); }
97HReg hregX86_XMM2 ( void ) { return mkHReg(2, HRcVec128, False); }
98HReg hregX86_XMM3 ( void ) { return mkHReg(3, HRcVec128, False); }
99HReg hregX86_XMM4 ( void ) { return mkHReg(4, HRcVec128, False); }
100HReg hregX86_XMM5 ( void ) { return mkHReg(5, HRcVec128, False); }
101HReg hregX86_XMM6 ( void ) { return mkHReg(6, HRcVec128, False); }
102HReg hregX86_XMM7 ( void ) { return mkHReg(7, HRcVec128, False); }
103
104
105void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
106{
107   *nregs = 20;
108   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
109   (*arr)[0] = hregX86_EAX();
110   (*arr)[1] = hregX86_EBX();
111   (*arr)[2] = hregX86_ECX();
112   (*arr)[3] = hregX86_EDX();
113   (*arr)[4] = hregX86_ESI();
114   (*arr)[5] = hregX86_EDI();
115   (*arr)[6] = hregX86_FAKE0();
116   (*arr)[7] = hregX86_FAKE1();
117   (*arr)[8] = hregX86_FAKE2();
118   (*arr)[9] = hregX86_FAKE3();
119   (*arr)[10] = hregX86_FAKE4();
120   (*arr)[11] = hregX86_FAKE5();
121   (*arr)[12] = hregX86_XMM0();
122   (*arr)[13] = hregX86_XMM1();
123   (*arr)[14] = hregX86_XMM2();
124   (*arr)[15] = hregX86_XMM3();
125   (*arr)[16] = hregX86_XMM4();
126   (*arr)[17] = hregX86_XMM5();
127   (*arr)[18] = hregX86_XMM6();
128   (*arr)[19] = hregX86_XMM7();
129}
130
131
132/* --------- Condition codes, Intel encoding. --------- */
133
134HChar* showX86CondCode ( X86CondCode cond )
135{
136   switch (cond) {
137      case Xcc_O:      return "o";
138      case Xcc_NO:     return "no";
139      case Xcc_B:      return "b";
140      case Xcc_NB:     return "nb";
141      case Xcc_Z:      return "z";
142      case Xcc_NZ:     return "nz";
143      case Xcc_BE:     return "be";
144      case Xcc_NBE:    return "nbe";
145      case Xcc_S:      return "s";
146      case Xcc_NS:     return "ns";
147      case Xcc_P:      return "p";
148      case Xcc_NP:     return "np";
149      case Xcc_L:      return "l";
150      case Xcc_NL:     return "nl";
151      case Xcc_LE:     return "le";
152      case Xcc_NLE:    return "nle";
153      case Xcc_ALWAYS: return "ALWAYS";
154      default: vpanic("ppX86CondCode");
155   }
156}
157
158
159/* --------- X86AMode: memory address expressions. --------- */
160
161X86AMode* X86AMode_IR ( UInt imm32, HReg reg ) {
162   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
163   am->tag = Xam_IR;
164   am->Xam.IR.imm = imm32;
165   am->Xam.IR.reg = reg;
166   return am;
167}
168X86AMode* X86AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
169   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
170   am->tag = Xam_IRRS;
171   am->Xam.IRRS.imm = imm32;
172   am->Xam.IRRS.base = base;
173   am->Xam.IRRS.index = indEx;
174   am->Xam.IRRS.shift = shift;
175   vassert(shift >= 0 && shift <= 3);
176   return am;
177}
178
179X86AMode* dopyX86AMode ( X86AMode* am ) {
180   switch (am->tag) {
181      case Xam_IR:
182         return X86AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
183      case Xam_IRRS:
184         return X86AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
185                               am->Xam.IRRS.index, am->Xam.IRRS.shift );
186      default:
187         vpanic("dopyX86AMode");
188   }
189}
190
191void ppX86AMode ( X86AMode* am ) {
192   switch (am->tag) {
193      case Xam_IR:
194         if (am->Xam.IR.imm == 0)
195            vex_printf("(");
196         else
197            vex_printf("0x%x(", am->Xam.IR.imm);
198         ppHRegX86(am->Xam.IR.reg);
199         vex_printf(")");
200         return;
201      case Xam_IRRS:
202         vex_printf("0x%x(", am->Xam.IRRS.imm);
203         ppHRegX86(am->Xam.IRRS.base);
204         vex_printf(",");
205         ppHRegX86(am->Xam.IRRS.index);
206         vex_printf(",%d)", 1 << am->Xam.IRRS.shift);
207         return;
208      default:
209         vpanic("ppX86AMode");
210   }
211}
212
213static void addRegUsage_X86AMode ( HRegUsage* u, X86AMode* am ) {
214   switch (am->tag) {
215      case Xam_IR:
216         addHRegUse(u, HRmRead, am->Xam.IR.reg);
217         return;
218      case Xam_IRRS:
219         addHRegUse(u, HRmRead, am->Xam.IRRS.base);
220         addHRegUse(u, HRmRead, am->Xam.IRRS.index);
221         return;
222      default:
223         vpanic("addRegUsage_X86AMode");
224   }
225}
226
227static void mapRegs_X86AMode ( HRegRemap* m, X86AMode* am ) {
228   switch (am->tag) {
229      case Xam_IR:
230         am->Xam.IR.reg = lookupHRegRemap(m, am->Xam.IR.reg);
231         return;
232      case Xam_IRRS:
233         am->Xam.IRRS.base = lookupHRegRemap(m, am->Xam.IRRS.base);
234         am->Xam.IRRS.index = lookupHRegRemap(m, am->Xam.IRRS.index);
235         return;
236      default:
237         vpanic("mapRegs_X86AMode");
238   }
239}
240
241/* --------- Operand, which can be reg, immediate or memory. --------- */
242
243X86RMI* X86RMI_Imm ( UInt imm32 ) {
244   X86RMI* op         = LibVEX_Alloc(sizeof(X86RMI));
245   op->tag            = Xrmi_Imm;
246   op->Xrmi.Imm.imm32 = imm32;
247   return op;
248}
249X86RMI* X86RMI_Reg ( HReg reg ) {
250   X86RMI* op       = LibVEX_Alloc(sizeof(X86RMI));
251   op->tag          = Xrmi_Reg;
252   op->Xrmi.Reg.reg = reg;
253   return op;
254}
255X86RMI* X86RMI_Mem ( X86AMode* am ) {
256   X86RMI* op      = LibVEX_Alloc(sizeof(X86RMI));
257   op->tag         = Xrmi_Mem;
258   op->Xrmi.Mem.am = am;
259   return op;
260}
261
262void ppX86RMI ( X86RMI* op ) {
263   switch (op->tag) {
264      case Xrmi_Imm:
265         vex_printf("$0x%x", op->Xrmi.Imm.imm32);
266         return;
267      case Xrmi_Reg:
268         ppHRegX86(op->Xrmi.Reg.reg);
269         return;
270      case Xrmi_Mem:
271         ppX86AMode(op->Xrmi.Mem.am);
272         return;
273     default:
274         vpanic("ppX86RMI");
275   }
276}
277
278/* An X86RMI can only be used in a "read" context (what would it mean
279   to write or modify a literal?) and so we enumerate its registers
280   accordingly. */
281static void addRegUsage_X86RMI ( HRegUsage* u, X86RMI* op ) {
282   switch (op->tag) {
283      case Xrmi_Imm:
284         return;
285      case Xrmi_Reg:
286         addHRegUse(u, HRmRead, op->Xrmi.Reg.reg);
287         return;
288      case Xrmi_Mem:
289         addRegUsage_X86AMode(u, op->Xrmi.Mem.am);
290         return;
291      default:
292         vpanic("addRegUsage_X86RMI");
293   }
294}
295
296static void mapRegs_X86RMI ( HRegRemap* m, X86RMI* op ) {
297   switch (op->tag) {
298      case Xrmi_Imm:
299         return;
300      case Xrmi_Reg:
301         op->Xrmi.Reg.reg = lookupHRegRemap(m, op->Xrmi.Reg.reg);
302         return;
303      case Xrmi_Mem:
304         mapRegs_X86AMode(m, op->Xrmi.Mem.am);
305         return;
306      default:
307         vpanic("mapRegs_X86RMI");
308   }
309}
310
311
312/* --------- Operand, which can be reg or immediate only. --------- */
313
314X86RI* X86RI_Imm ( UInt imm32 ) {
315   X86RI* op         = LibVEX_Alloc(sizeof(X86RI));
316   op->tag           = Xri_Imm;
317   op->Xri.Imm.imm32 = imm32;
318   return op;
319}
320X86RI* X86RI_Reg ( HReg reg ) {
321   X86RI* op       = LibVEX_Alloc(sizeof(X86RI));
322   op->tag         = Xri_Reg;
323   op->Xri.Reg.reg = reg;
324   return op;
325}
326
327void ppX86RI ( X86RI* op ) {
328   switch (op->tag) {
329      case Xri_Imm:
330         vex_printf("$0x%x", op->Xri.Imm.imm32);
331         return;
332      case Xri_Reg:
333         ppHRegX86(op->Xri.Reg.reg);
334         return;
335     default:
336         vpanic("ppX86RI");
337   }
338}
339
340/* An X86RI can only be used in a "read" context (what would it mean
341   to write or modify a literal?) and so we enumerate its registers
342   accordingly. */
343static void addRegUsage_X86RI ( HRegUsage* u, X86RI* op ) {
344   switch (op->tag) {
345      case Xri_Imm:
346         return;
347      case Xri_Reg:
348         addHRegUse(u, HRmRead, op->Xri.Reg.reg);
349         return;
350      default:
351         vpanic("addRegUsage_X86RI");
352   }
353}
354
355static void mapRegs_X86RI ( HRegRemap* m, X86RI* op ) {
356   switch (op->tag) {
357      case Xri_Imm:
358         return;
359      case Xri_Reg:
360         op->Xri.Reg.reg = lookupHRegRemap(m, op->Xri.Reg.reg);
361         return;
362      default:
363         vpanic("mapRegs_X86RI");
364   }
365}
366
367
368/* --------- Operand, which can be reg or memory only. --------- */
369
370X86RM* X86RM_Reg ( HReg reg ) {
371   X86RM* op       = LibVEX_Alloc(sizeof(X86RM));
372   op->tag         = Xrm_Reg;
373   op->Xrm.Reg.reg = reg;
374   return op;
375}
376X86RM* X86RM_Mem ( X86AMode* am ) {
377   X86RM* op      = LibVEX_Alloc(sizeof(X86RM));
378   op->tag        = Xrm_Mem;
379   op->Xrm.Mem.am = am;
380   return op;
381}
382
383void ppX86RM ( X86RM* op ) {
384   switch (op->tag) {
385      case Xrm_Mem:
386         ppX86AMode(op->Xrm.Mem.am);
387         return;
388      case Xrm_Reg:
389         ppHRegX86(op->Xrm.Reg.reg);
390         return;
391     default:
392         vpanic("ppX86RM");
393   }
394}
395
396/* Because an X86RM can be both a source or destination operand, we
397   have to supply a mode -- pertaining to the operand as a whole --
398   indicating how it's being used. */
399static void addRegUsage_X86RM ( HRegUsage* u, X86RM* op, HRegMode mode ) {
400   switch (op->tag) {
401      case Xrm_Mem:
402         /* Memory is read, written or modified.  So we just want to
403            know the regs read by the amode. */
404         addRegUsage_X86AMode(u, op->Xrm.Mem.am);
405         return;
406      case Xrm_Reg:
407         /* reg is read, written or modified.  Add it in the
408            appropriate way. */
409         addHRegUse(u, mode, op->Xrm.Reg.reg);
410         return;
411     default:
412         vpanic("addRegUsage_X86RM");
413   }
414}
415
416static void mapRegs_X86RM ( HRegRemap* m, X86RM* op )
417{
418   switch (op->tag) {
419      case Xrm_Mem:
420         mapRegs_X86AMode(m, op->Xrm.Mem.am);
421         return;
422      case Xrm_Reg:
423         op->Xrm.Reg.reg = lookupHRegRemap(m, op->Xrm.Reg.reg);
424         return;
425     default:
426         vpanic("mapRegs_X86RM");
427   }
428}
429
430
431/* --------- Instructions. --------- */
432
433HChar* showX86UnaryOp ( X86UnaryOp op ) {
434   switch (op) {
435      case Xun_NOT: return "not";
436      case Xun_NEG: return "neg";
437      default: vpanic("showX86UnaryOp");
438   }
439}
440
441HChar* showX86AluOp ( X86AluOp op ) {
442   switch (op) {
443      case Xalu_MOV:  return "mov";
444      case Xalu_CMP:  return "cmp";
445      case Xalu_ADD:  return "add";
446      case Xalu_SUB:  return "sub";
447      case Xalu_ADC:  return "adc";
448      case Xalu_SBB:  return "sbb";
449      case Xalu_AND:  return "and";
450      case Xalu_OR:   return "or";
451      case Xalu_XOR:  return "xor";
452      case Xalu_MUL:  return "mul";
453      default: vpanic("showX86AluOp");
454   }
455}
456
457HChar* showX86ShiftOp ( X86ShiftOp op ) {
458   switch (op) {
459      case Xsh_SHL: return "shl";
460      case Xsh_SHR: return "shr";
461      case Xsh_SAR: return "sar";
462      default: vpanic("showX86ShiftOp");
463   }
464}
465
466HChar* showX86FpOp ( X86FpOp op ) {
467   switch (op) {
468      case Xfp_ADD:    return "add";
469      case Xfp_SUB:    return "sub";
470      case Xfp_MUL:    return "mul";
471      case Xfp_DIV:    return "div";
472      case Xfp_SCALE:  return "scale";
473      case Xfp_ATAN:   return "atan";
474      case Xfp_YL2X:   return "yl2x";
475      case Xfp_YL2XP1: return "yl2xp1";
476      case Xfp_PREM:   return "prem";
477      case Xfp_PREM1:  return "prem1";
478      case Xfp_SQRT:   return "sqrt";
479      case Xfp_ABS:    return "abs";
480      case Xfp_NEG:    return "chs";
481      case Xfp_MOV:    return "mov";
482      case Xfp_SIN:    return "sin";
483      case Xfp_COS:    return "cos";
484      case Xfp_TAN:    return "tan";
485      case Xfp_ROUND:  return "round";
486      case Xfp_2XM1:   return "2xm1";
487      default: vpanic("showX86FpOp");
488   }
489}
490
491HChar* showX86SseOp ( X86SseOp op ) {
492   switch (op) {
493      case Xsse_MOV:      return "mov(?!)";
494      case Xsse_ADDF:     return "add";
495      case Xsse_SUBF:     return "sub";
496      case Xsse_MULF:     return "mul";
497      case Xsse_DIVF:     return "div";
498      case Xsse_MAXF:     return "max";
499      case Xsse_MINF:     return "min";
500      case Xsse_CMPEQF:   return "cmpFeq";
501      case Xsse_CMPLTF:   return "cmpFlt";
502      case Xsse_CMPLEF:   return "cmpFle";
503      case Xsse_CMPUNF:   return "cmpFun";
504      case Xsse_RCPF:     return "rcp";
505      case Xsse_RSQRTF:   return "rsqrt";
506      case Xsse_SQRTF:    return "sqrt";
507      case Xsse_AND:      return "and";
508      case Xsse_OR:       return "or";
509      case Xsse_XOR:      return "xor";
510      case Xsse_ANDN:     return "andn";
511      case Xsse_ADD8:     return "paddb";
512      case Xsse_ADD16:    return "paddw";
513      case Xsse_ADD32:    return "paddd";
514      case Xsse_ADD64:    return "paddq";
515      case Xsse_QADD8U:   return "paddusb";
516      case Xsse_QADD16U:  return "paddusw";
517      case Xsse_QADD8S:   return "paddsb";
518      case Xsse_QADD16S:  return "paddsw";
519      case Xsse_SUB8:     return "psubb";
520      case Xsse_SUB16:    return "psubw";
521      case Xsse_SUB32:    return "psubd";
522      case Xsse_SUB64:    return "psubq";
523      case Xsse_QSUB8U:   return "psubusb";
524      case Xsse_QSUB16U:  return "psubusw";
525      case Xsse_QSUB8S:   return "psubsb";
526      case Xsse_QSUB16S:  return "psubsw";
527      case Xsse_MUL16:    return "pmullw";
528      case Xsse_MULHI16U: return "pmulhuw";
529      case Xsse_MULHI16S: return "pmulhw";
530      case Xsse_AVG8U:    return "pavgb";
531      case Xsse_AVG16U:   return "pavgw";
532      case Xsse_MAX16S:   return "pmaxw";
533      case Xsse_MAX8U:    return "pmaxub";
534      case Xsse_MIN16S:   return "pminw";
535      case Xsse_MIN8U:    return "pminub";
536      case Xsse_CMPEQ8:   return "pcmpeqb";
537      case Xsse_CMPEQ16:  return "pcmpeqw";
538      case Xsse_CMPEQ32:  return "pcmpeqd";
539      case Xsse_CMPGT8S:  return "pcmpgtb";
540      case Xsse_CMPGT16S: return "pcmpgtw";
541      case Xsse_CMPGT32S: return "pcmpgtd";
542      case Xsse_SHL16:    return "psllw";
543      case Xsse_SHL32:    return "pslld";
544      case Xsse_SHL64:    return "psllq";
545      case Xsse_SHR16:    return "psrlw";
546      case Xsse_SHR32:    return "psrld";
547      case Xsse_SHR64:    return "psrlq";
548      case Xsse_SAR16:    return "psraw";
549      case Xsse_SAR32:    return "psrad";
550      case Xsse_PACKSSD:  return "packssdw";
551      case Xsse_PACKSSW:  return "packsswb";
552      case Xsse_PACKUSW:  return "packuswb";
553      case Xsse_UNPCKHB:  return "punpckhb";
554      case Xsse_UNPCKHW:  return "punpckhw";
555      case Xsse_UNPCKHD:  return "punpckhd";
556      case Xsse_UNPCKHQ:  return "punpckhq";
557      case Xsse_UNPCKLB:  return "punpcklb";
558      case Xsse_UNPCKLW:  return "punpcklw";
559      case Xsse_UNPCKLD:  return "punpckld";
560      case Xsse_UNPCKLQ:  return "punpcklq";
561      default: vpanic("showX86SseOp");
562   }
563}
564
565X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
566   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
567   i->tag            = Xin_Alu32R;
568   i->Xin.Alu32R.op  = op;
569   i->Xin.Alu32R.src = src;
570   i->Xin.Alu32R.dst = dst;
571   return i;
572}
573X86Instr* X86Instr_Alu32M ( X86AluOp op, X86RI* src, X86AMode* dst ) {
574   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
575   i->tag            = Xin_Alu32M;
576   i->Xin.Alu32M.op  = op;
577   i->Xin.Alu32M.src = src;
578   i->Xin.Alu32M.dst = dst;
579   vassert(op != Xalu_MUL);
580   return i;
581}
582X86Instr* X86Instr_Sh32 ( X86ShiftOp op, UInt src, HReg dst ) {
583   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
584   i->tag          = Xin_Sh32;
585   i->Xin.Sh32.op  = op;
586   i->Xin.Sh32.src = src;
587   i->Xin.Sh32.dst = dst;
588   return i;
589}
590X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst ) {
591   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
592   i->tag              = Xin_Test32;
593   i->Xin.Test32.imm32 = imm32;
594   i->Xin.Test32.dst   = dst;
595   return i;
596}
597X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst ) {
598   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
599   i->tag             = Xin_Unary32;
600   i->Xin.Unary32.op  = op;
601   i->Xin.Unary32.dst = dst;
602   return i;
603}
604X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst ) {
605   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
606   i->tag             = Xin_Lea32;
607   i->Xin.Lea32.am    = am;
608   i->Xin.Lea32.dst   = dst;
609   return i;
610}
611X86Instr* X86Instr_MulL ( Bool syned, X86RM* src ) {
612   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
613   i->tag             = Xin_MulL;
614   i->Xin.MulL.syned  = syned;
615   i->Xin.MulL.src    = src;
616   return i;
617}
618X86Instr* X86Instr_Div ( Bool syned, X86RM* src ) {
619   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
620   i->tag           = Xin_Div;
621   i->Xin.Div.syned = syned;
622   i->Xin.Div.src   = src;
623   return i;
624}
625X86Instr* X86Instr_Sh3232  ( X86ShiftOp op, UInt amt, HReg src, HReg dst ) {
626   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
627   i->tag            = Xin_Sh3232;
628   i->Xin.Sh3232.op  = op;
629   i->Xin.Sh3232.amt = amt;
630   i->Xin.Sh3232.src = src;
631   i->Xin.Sh3232.dst = dst;
632   vassert(op == Xsh_SHL || op == Xsh_SHR);
633   return i;
634}
635X86Instr* X86Instr_Push( X86RMI* src ) {
636   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
637   i->tag          = Xin_Push;
638   i->Xin.Push.src = src;
639   return i;
640}
641X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms ) {
642   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
643   i->tag               = Xin_Call;
644   i->Xin.Call.cond     = cond;
645   i->Xin.Call.target   = target;
646   i->Xin.Call.regparms = regparms;
647   vassert(regparms >= 0 && regparms <= 3);
648   return i;
649}
650X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP,
651                             X86CondCode cond, Bool toFastEP ) {
652   X86Instr* i             = LibVEX_Alloc(sizeof(X86Instr));
653   i->tag                  = Xin_XDirect;
654   i->Xin.XDirect.dstGA    = dstGA;
655   i->Xin.XDirect.amEIP    = amEIP;
656   i->Xin.XDirect.cond     = cond;
657   i->Xin.XDirect.toFastEP = toFastEP;
658   return i;
659}
660X86Instr* X86Instr_XIndir ( HReg dstGA, X86AMode* amEIP,
661                            X86CondCode cond ) {
662   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
663   i->tag              = Xin_XIndir;
664   i->Xin.XIndir.dstGA = dstGA;
665   i->Xin.XIndir.amEIP = amEIP;
666   i->Xin.XIndir.cond  = cond;
667   return i;
668}
669X86Instr* X86Instr_XAssisted ( HReg dstGA, X86AMode* amEIP,
670                               X86CondCode cond, IRJumpKind jk ) {
671   X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
672   i->tag                 = Xin_XAssisted;
673   i->Xin.XAssisted.dstGA = dstGA;
674   i->Xin.XAssisted.amEIP = amEIP;
675   i->Xin.XAssisted.cond  = cond;
676   i->Xin.XAssisted.jk    = jk;
677   return i;
678}
679X86Instr* X86Instr_CMov32  ( X86CondCode cond, X86RM* src, HReg dst ) {
680   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
681   i->tag             = Xin_CMov32;
682   i->Xin.CMov32.cond = cond;
683   i->Xin.CMov32.src  = src;
684   i->Xin.CMov32.dst  = dst;
685   vassert(cond != Xcc_ALWAYS);
686   return i;
687}
688X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
689                            X86AMode* src, HReg dst ) {
690   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
691   i->tag                = Xin_LoadEX;
692   i->Xin.LoadEX.szSmall = szSmall;
693   i->Xin.LoadEX.syned   = syned;
694   i->Xin.LoadEX.src     = src;
695   i->Xin.LoadEX.dst     = dst;
696   vassert(szSmall == 1 || szSmall == 2);
697   return i;
698}
699X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
700   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
701   i->tag           = Xin_Store;
702   i->Xin.Store.sz  = sz;
703   i->Xin.Store.src = src;
704   i->Xin.Store.dst = dst;
705   vassert(sz == 1 || sz == 2);
706   return i;
707}
708X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst ) {
709   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
710   i->tag            = Xin_Set32;
711   i->Xin.Set32.cond = cond;
712   i->Xin.Set32.dst  = dst;
713   return i;
714}
715X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst ) {
716   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
717   i->tag               = Xin_Bsfr32;
718   i->Xin.Bsfr32.isFwds = isFwds;
719   i->Xin.Bsfr32.src    = src;
720   i->Xin.Bsfr32.dst    = dst;
721   return i;
722}
723X86Instr* X86Instr_MFence ( UInt hwcaps ) {
724   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
725   i->tag               = Xin_MFence;
726   i->Xin.MFence.hwcaps = hwcaps;
727   vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_SSE1
728                            |VEX_HWCAPS_X86_SSE2
729                            |VEX_HWCAPS_X86_SSE3
730                            |VEX_HWCAPS_X86_LZCNT)));
731   return i;
732}
733X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz ) {
734   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
735   i->tag           = Xin_ACAS;
736   i->Xin.ACAS.addr = addr;
737   i->Xin.ACAS.sz   = sz;
738   vassert(sz == 4 || sz == 2 || sz == 1);
739   return i;
740}
741X86Instr* X86Instr_DACAS ( X86AMode* addr ) {
742   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
743   i->tag            = Xin_DACAS;
744   i->Xin.DACAS.addr = addr;
745   return i;
746}
747
748X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
749   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
750   i->tag             = Xin_FpUnary;
751   i->Xin.FpUnary.op  = op;
752   i->Xin.FpUnary.src = src;
753   i->Xin.FpUnary.dst = dst;
754   return i;
755}
756X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
757   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
758   i->tag               = Xin_FpBinary;
759   i->Xin.FpBinary.op   = op;
760   i->Xin.FpBinary.srcL = srcL;
761   i->Xin.FpBinary.srcR = srcR;
762   i->Xin.FpBinary.dst  = dst;
763   return i;
764}
765X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
766   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
767   i->tag               = Xin_FpLdSt;
768   i->Xin.FpLdSt.isLoad = isLoad;
769   i->Xin.FpLdSt.sz     = sz;
770   i->Xin.FpLdSt.reg    = reg;
771   i->Xin.FpLdSt.addr   = addr;
772   vassert(sz == 4 || sz == 8 || sz == 10);
773   return i;
774}
775X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
776                             HReg reg, X86AMode* addr ) {
777   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
778   i->tag                = Xin_FpLdStI;
779   i->Xin.FpLdStI.isLoad = isLoad;
780   i->Xin.FpLdStI.sz     = sz;
781   i->Xin.FpLdStI.reg    = reg;
782   i->Xin.FpLdStI.addr   = addr;
783   vassert(sz == 2 || sz == 4 || sz == 8);
784   return i;
785}
786X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst ) {
787   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
788   i->tag              = Xin_Fp64to32;
789   i->Xin.Fp64to32.src = src;
790   i->Xin.Fp64to32.dst = dst;
791   return i;
792}
793X86Instr* X86Instr_FpCMov ( X86CondCode cond, HReg src, HReg dst ) {
794   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
795   i->tag             = Xin_FpCMov;
796   i->Xin.FpCMov.cond = cond;
797   i->Xin.FpCMov.src  = src;
798   i->Xin.FpCMov.dst  = dst;
799   vassert(cond != Xcc_ALWAYS);
800   return i;
801}
802X86Instr* X86Instr_FpLdCW ( X86AMode* addr ) {
803   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
804   i->tag               = Xin_FpLdCW;
805   i->Xin.FpLdCW.addr   = addr;
806   return i;
807}
808X86Instr* X86Instr_FpStSW_AX ( void ) {
809   X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
810   i->tag      = Xin_FpStSW_AX;
811   return i;
812}
813X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst ) {
814   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
815   i->tag            = Xin_FpCmp;
816   i->Xin.FpCmp.srcL = srcL;
817   i->Xin.FpCmp.srcR = srcR;
818   i->Xin.FpCmp.dst  = dst;
819   return i;
820}
821X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
822   X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
823   i->tag                 = Xin_SseConst;
824   i->Xin.SseConst.con    = con;
825   i->Xin.SseConst.dst    = dst;
826   vassert(hregClass(dst) == HRcVec128);
827   return i;
828}
829X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg reg, X86AMode* addr ) {
830   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
831   i->tag                = Xin_SseLdSt;
832   i->Xin.SseLdSt.isLoad = isLoad;
833   i->Xin.SseLdSt.reg    = reg;
834   i->Xin.SseLdSt.addr   = addr;
835   return i;
836}
837X86Instr* X86Instr_SseLdzLO  ( Int sz, HReg reg, X86AMode* addr )
838{
839   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
840   i->tag                = Xin_SseLdzLO;
841   i->Xin.SseLdzLO.sz    = toUChar(sz);
842   i->Xin.SseLdzLO.reg   = reg;
843   i->Xin.SseLdzLO.addr  = addr;
844   vassert(sz == 4 || sz == 8);
845   return i;
846}
847X86Instr* X86Instr_Sse32Fx4 ( X86SseOp op, HReg src, HReg dst ) {
848   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
849   i->tag              = Xin_Sse32Fx4;
850   i->Xin.Sse32Fx4.op  = op;
851   i->Xin.Sse32Fx4.src = src;
852   i->Xin.Sse32Fx4.dst = dst;
853   vassert(op != Xsse_MOV);
854   return i;
855}
856X86Instr* X86Instr_Sse32FLo ( X86SseOp op, HReg src, HReg dst ) {
857   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
858   i->tag              = Xin_Sse32FLo;
859   i->Xin.Sse32FLo.op  = op;
860   i->Xin.Sse32FLo.src = src;
861   i->Xin.Sse32FLo.dst = dst;
862   vassert(op != Xsse_MOV);
863   return i;
864}
865X86Instr* X86Instr_Sse64Fx2 ( X86SseOp op, HReg src, HReg dst ) {
866   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
867   i->tag              = Xin_Sse64Fx2;
868   i->Xin.Sse64Fx2.op  = op;
869   i->Xin.Sse64Fx2.src = src;
870   i->Xin.Sse64Fx2.dst = dst;
871   vassert(op != Xsse_MOV);
872   return i;
873}
874X86Instr* X86Instr_Sse64FLo ( X86SseOp op, HReg src, HReg dst ) {
875   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
876   i->tag              = Xin_Sse64FLo;
877   i->Xin.Sse64FLo.op  = op;
878   i->Xin.Sse64FLo.src = src;
879   i->Xin.Sse64FLo.dst = dst;
880   vassert(op != Xsse_MOV);
881   return i;
882}
883X86Instr* X86Instr_SseReRg ( X86SseOp op, HReg re, HReg rg ) {
884   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
885   i->tag             = Xin_SseReRg;
886   i->Xin.SseReRg.op  = op;
887   i->Xin.SseReRg.src = re;
888   i->Xin.SseReRg.dst = rg;
889   return i;
890}
891X86Instr* X86Instr_SseCMov ( X86CondCode cond, HReg src, HReg dst ) {
892   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
893   i->tag              = Xin_SseCMov;
894   i->Xin.SseCMov.cond = cond;
895   i->Xin.SseCMov.src  = src;
896   i->Xin.SseCMov.dst  = dst;
897   vassert(cond != Xcc_ALWAYS);
898   return i;
899}
900X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst ) {
901   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
902   i->tag               = Xin_SseShuf;
903   i->Xin.SseShuf.order = order;
904   i->Xin.SseShuf.src   = src;
905   i->Xin.SseShuf.dst   = dst;
906   vassert(order >= 0 && order <= 0xFF);
907   return i;
908}
909X86Instr* X86Instr_EvCheck ( X86AMode* amCounter,
910                             X86AMode* amFailAddr ) {
911   X86Instr* i               = LibVEX_Alloc(sizeof(X86Instr));
912   i->tag                    = Xin_EvCheck;
913   i->Xin.EvCheck.amCounter  = amCounter;
914   i->Xin.EvCheck.amFailAddr = amFailAddr;
915   return i;
916}
917X86Instr* X86Instr_ProfInc ( void ) {
918   X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
919   i->tag      = Xin_ProfInc;
920   return i;
921}
922
923void ppX86Instr ( X86Instr* i, Bool mode64 ) {
924   vassert(mode64 == False);
925   switch (i->tag) {
926      case Xin_Alu32R:
927         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32R.op));
928         ppX86RMI(i->Xin.Alu32R.src);
929         vex_printf(",");
930         ppHRegX86(i->Xin.Alu32R.dst);
931         return;
932      case Xin_Alu32M:
933         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32M.op));
934         ppX86RI(i->Xin.Alu32M.src);
935         vex_printf(",");
936         ppX86AMode(i->Xin.Alu32M.dst);
937         return;
938      case Xin_Sh32:
939         vex_printf("%sl ", showX86ShiftOp(i->Xin.Sh32.op));
940         if (i->Xin.Sh32.src == 0)
941           vex_printf("%%cl,");
942         else
943            vex_printf("$%d,", (Int)i->Xin.Sh32.src);
944         ppHRegX86(i->Xin.Sh32.dst);
945         return;
946      case Xin_Test32:
947         vex_printf("testl $%d,", (Int)i->Xin.Test32.imm32);
948         ppX86RM(i->Xin.Test32.dst);
949         return;
950      case Xin_Unary32:
951         vex_printf("%sl ", showX86UnaryOp(i->Xin.Unary32.op));
952         ppHRegX86(i->Xin.Unary32.dst);
953         return;
954      case Xin_Lea32:
955         vex_printf("leal ");
956         ppX86AMode(i->Xin.Lea32.am);
957         vex_printf(",");
958         ppHRegX86(i->Xin.Lea32.dst);
959         return;
960      case Xin_MulL:
961         vex_printf("%cmull ", i->Xin.MulL.syned ? 's' : 'u');
962         ppX86RM(i->Xin.MulL.src);
963         return;
964      case Xin_Div:
965         vex_printf("%cdivl ", i->Xin.Div.syned ? 's' : 'u');
966         ppX86RM(i->Xin.Div.src);
967         return;
968      case Xin_Sh3232:
969         vex_printf("%sdl ", showX86ShiftOp(i->Xin.Sh3232.op));
970         if (i->Xin.Sh3232.amt == 0)
971           vex_printf(" %%cl,");
972         else
973            vex_printf(" $%d,", (Int)i->Xin.Sh3232.amt);
974         ppHRegX86(i->Xin.Sh3232.src);
975         vex_printf(",");
976         ppHRegX86(i->Xin.Sh3232.dst);
977         return;
978      case Xin_Push:
979         vex_printf("pushl ");
980         ppX86RMI(i->Xin.Push.src);
981         return;
982      case Xin_Call:
983         vex_printf("call%s[%d] ",
984                    i->Xin.Call.cond==Xcc_ALWAYS
985                       ? "" : showX86CondCode(i->Xin.Call.cond),
986                    i->Xin.Call.regparms);
987         vex_printf("0x%x", i->Xin.Call.target);
988         break;
989      case Xin_XDirect:
990         vex_printf("(xDirect) ");
991         vex_printf("if (%%eflags.%s) { ",
992                    showX86CondCode(i->Xin.XDirect.cond));
993         vex_printf("movl $0x%x,", i->Xin.XDirect.dstGA);
994         ppX86AMode(i->Xin.XDirect.amEIP);
995         vex_printf("; ");
996         vex_printf("movl $disp_cp_chain_me_to_%sEP,%%edx; call *%%edx }",
997                    i->Xin.XDirect.toFastEP ? "fast" : "slow");
998         return;
999      case Xin_XIndir:
1000         vex_printf("(xIndir) ");
1001         vex_printf("if (%%eflags.%s) { movl ",
1002                    showX86CondCode(i->Xin.XIndir.cond));
1003         ppHRegX86(i->Xin.XIndir.dstGA);
1004         vex_printf(",");
1005         ppX86AMode(i->Xin.XIndir.amEIP);
1006         vex_printf("; movl $disp_indir,%%edx; jmp *%%edx }");
1007         return;
1008      case Xin_XAssisted:
1009         vex_printf("(xAssisted) ");
1010         vex_printf("if (%%eflags.%s) { ",
1011                    showX86CondCode(i->Xin.XAssisted.cond));
1012         vex_printf("movl ");
1013         ppHRegX86(i->Xin.XAssisted.dstGA);
1014         vex_printf(",");
1015         ppX86AMode(i->Xin.XAssisted.amEIP);
1016         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%ebp",
1017                    (Int)i->Xin.XAssisted.jk);
1018         vex_printf("; movl $disp_assisted,%%edx; jmp *%%edx }");
1019         return;
1020      case Xin_CMov32:
1021         vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
1022         ppX86RM(i->Xin.CMov32.src);
1023         vex_printf(",");
1024         ppHRegX86(i->Xin.CMov32.dst);
1025         return;
1026      case Xin_LoadEX:
1027         vex_printf("mov%c%cl ",
1028                    i->Xin.LoadEX.syned ? 's' : 'z',
1029                    i->Xin.LoadEX.szSmall==1 ? 'b' : 'w');
1030         ppX86AMode(i->Xin.LoadEX.src);
1031         vex_printf(",");
1032         ppHRegX86(i->Xin.LoadEX.dst);
1033         return;
1034      case Xin_Store:
1035         vex_printf("mov%c ", i->Xin.Store.sz==1 ? 'b' : 'w');
1036         ppHRegX86(i->Xin.Store.src);
1037         vex_printf(",");
1038         ppX86AMode(i->Xin.Store.dst);
1039         return;
1040      case Xin_Set32:
1041         vex_printf("setl%s ", showX86CondCode(i->Xin.Set32.cond));
1042         ppHRegX86(i->Xin.Set32.dst);
1043         return;
1044      case Xin_Bsfr32:
1045         vex_printf("bs%cl ", i->Xin.Bsfr32.isFwds ? 'f' : 'r');
1046         ppHRegX86(i->Xin.Bsfr32.src);
1047         vex_printf(",");
1048         ppHRegX86(i->Xin.Bsfr32.dst);
1049         return;
1050      case Xin_MFence:
1051         vex_printf("mfence(%s)",
1052                    LibVEX_ppVexHwCaps(VexArchX86,i->Xin.MFence.hwcaps));
1053         return;
1054      case Xin_ACAS:
1055         vex_printf("lock cmpxchg%c ",
1056                     i->Xin.ACAS.sz==1 ? 'b'
1057                                       : i->Xin.ACAS.sz==2 ? 'w' : 'l');
1058         vex_printf("{%%eax->%%ebx},");
1059         ppX86AMode(i->Xin.ACAS.addr);
1060         return;
1061      case Xin_DACAS:
1062         vex_printf("lock cmpxchg8b {%%edx:%%eax->%%ecx:%%ebx},");
1063         ppX86AMode(i->Xin.DACAS.addr);
1064         return;
1065      case Xin_FpUnary:
1066         vex_printf("g%sD ", showX86FpOp(i->Xin.FpUnary.op));
1067         ppHRegX86(i->Xin.FpUnary.src);
1068         vex_printf(",");
1069         ppHRegX86(i->Xin.FpUnary.dst);
1070         break;
1071      case Xin_FpBinary:
1072         vex_printf("g%sD ", showX86FpOp(i->Xin.FpBinary.op));
1073         ppHRegX86(i->Xin.FpBinary.srcL);
1074         vex_printf(",");
1075         ppHRegX86(i->Xin.FpBinary.srcR);
1076         vex_printf(",");
1077         ppHRegX86(i->Xin.FpBinary.dst);
1078         break;
1079      case Xin_FpLdSt:
1080         if (i->Xin.FpLdSt.isLoad) {
1081            vex_printf("gld%c " ,  i->Xin.FpLdSt.sz==10 ? 'T'
1082                                   : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1083            ppX86AMode(i->Xin.FpLdSt.addr);
1084            vex_printf(", ");
1085            ppHRegX86(i->Xin.FpLdSt.reg);
1086         } else {
1087            vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
1088                                  : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1089            ppHRegX86(i->Xin.FpLdSt.reg);
1090            vex_printf(", ");
1091            ppX86AMode(i->Xin.FpLdSt.addr);
1092         }
1093         return;
1094      case Xin_FpLdStI:
1095         if (i->Xin.FpLdStI.isLoad) {
1096            vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1097                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1098            ppX86AMode(i->Xin.FpLdStI.addr);
1099            vex_printf(", ");
1100            ppHRegX86(i->Xin.FpLdStI.reg);
1101         } else {
1102            vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1103                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1104            ppHRegX86(i->Xin.FpLdStI.reg);
1105            vex_printf(", ");
1106            ppX86AMode(i->Xin.FpLdStI.addr);
1107         }
1108         return;
1109      case Xin_Fp64to32:
1110         vex_printf("gdtof ");
1111         ppHRegX86(i->Xin.Fp64to32.src);
1112         vex_printf(",");
1113         ppHRegX86(i->Xin.Fp64to32.dst);
1114         return;
1115      case Xin_FpCMov:
1116         vex_printf("gcmov%s ", showX86CondCode(i->Xin.FpCMov.cond));
1117         ppHRegX86(i->Xin.FpCMov.src);
1118         vex_printf(",");
1119         ppHRegX86(i->Xin.FpCMov.dst);
1120         return;
1121      case Xin_FpLdCW:
1122         vex_printf("fldcw ");
1123         ppX86AMode(i->Xin.FpLdCW.addr);
1124         return;
1125      case Xin_FpStSW_AX:
1126         vex_printf("fstsw %%ax");
1127         return;
1128      case Xin_FpCmp:
1129         vex_printf("gcmp ");
1130         ppHRegX86(i->Xin.FpCmp.srcL);
1131         vex_printf(",");
1132         ppHRegX86(i->Xin.FpCmp.srcR);
1133         vex_printf(",");
1134         ppHRegX86(i->Xin.FpCmp.dst);
1135         break;
1136      case Xin_SseConst:
1137         vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
1138         ppHRegX86(i->Xin.SseConst.dst);
1139         break;
1140      case Xin_SseLdSt:
1141         vex_printf("movups ");
1142         if (i->Xin.SseLdSt.isLoad) {
1143            ppX86AMode(i->Xin.SseLdSt.addr);
1144            vex_printf(",");
1145            ppHRegX86(i->Xin.SseLdSt.reg);
1146         } else {
1147            ppHRegX86(i->Xin.SseLdSt.reg);
1148            vex_printf(",");
1149            ppX86AMode(i->Xin.SseLdSt.addr);
1150         }
1151         return;
1152      case Xin_SseLdzLO:
1153         vex_printf("movs%s ", i->Xin.SseLdzLO.sz==4 ? "s" : "d");
1154         ppX86AMode(i->Xin.SseLdzLO.addr);
1155         vex_printf(",");
1156         ppHRegX86(i->Xin.SseLdzLO.reg);
1157         return;
1158      case Xin_Sse32Fx4:
1159         vex_printf("%sps ", showX86SseOp(i->Xin.Sse32Fx4.op));
1160         ppHRegX86(i->Xin.Sse32Fx4.src);
1161         vex_printf(",");
1162         ppHRegX86(i->Xin.Sse32Fx4.dst);
1163         return;
1164      case Xin_Sse32FLo:
1165         vex_printf("%sss ", showX86SseOp(i->Xin.Sse32FLo.op));
1166         ppHRegX86(i->Xin.Sse32FLo.src);
1167         vex_printf(",");
1168         ppHRegX86(i->Xin.Sse32FLo.dst);
1169         return;
1170      case Xin_Sse64Fx2:
1171         vex_printf("%spd ", showX86SseOp(i->Xin.Sse64Fx2.op));
1172         ppHRegX86(i->Xin.Sse64Fx2.src);
1173         vex_printf(",");
1174         ppHRegX86(i->Xin.Sse64Fx2.dst);
1175         return;
1176      case Xin_Sse64FLo:
1177         vex_printf("%ssd ", showX86SseOp(i->Xin.Sse64FLo.op));
1178         ppHRegX86(i->Xin.Sse64FLo.src);
1179         vex_printf(",");
1180         ppHRegX86(i->Xin.Sse64FLo.dst);
1181         return;
1182      case Xin_SseReRg:
1183         vex_printf("%s ", showX86SseOp(i->Xin.SseReRg.op));
1184         ppHRegX86(i->Xin.SseReRg.src);
1185         vex_printf(",");
1186         ppHRegX86(i->Xin.SseReRg.dst);
1187         return;
1188      case Xin_SseCMov:
1189         vex_printf("cmov%s ", showX86CondCode(i->Xin.SseCMov.cond));
1190         ppHRegX86(i->Xin.SseCMov.src);
1191         vex_printf(",");
1192         ppHRegX86(i->Xin.SseCMov.dst);
1193         return;
1194      case Xin_SseShuf:
1195         vex_printf("pshufd $0x%x,", i->Xin.SseShuf.order);
1196         ppHRegX86(i->Xin.SseShuf.src);
1197         vex_printf(",");
1198         ppHRegX86(i->Xin.SseShuf.dst);
1199         return;
1200      case Xin_EvCheck:
1201         vex_printf("(evCheck) decl ");
1202         ppX86AMode(i->Xin.EvCheck.amCounter);
1203         vex_printf("; jns nofail; jmp *");
1204         ppX86AMode(i->Xin.EvCheck.amFailAddr);
1205         vex_printf("; nofail:");
1206         return;
1207      case Xin_ProfInc:
1208         vex_printf("(profInc) addl $1,NotKnownYet; "
1209                    "adcl $0,NotKnownYet+4");
1210         return;
1211      default:
1212         vpanic("ppX86Instr");
1213   }
1214}
1215
1216/* --------- Helpers for register allocation. --------- */
1217
1218void getRegUsage_X86Instr (HRegUsage* u, X86Instr* i, Bool mode64)
1219{
1220   Bool unary;
1221   vassert(mode64 == False);
1222   initHRegUsage(u);
1223   switch (i->tag) {
1224      case Xin_Alu32R:
1225         addRegUsage_X86RMI(u, i->Xin.Alu32R.src);
1226         if (i->Xin.Alu32R.op == Xalu_MOV) {
1227            addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst);
1228            return;
1229         }
1230         if (i->Xin.Alu32R.op == Xalu_CMP) {
1231            addHRegUse(u, HRmRead, i->Xin.Alu32R.dst);
1232            return;
1233         }
1234         addHRegUse(u, HRmModify, i->Xin.Alu32R.dst);
1235         return;
1236      case Xin_Alu32M:
1237         addRegUsage_X86RI(u, i->Xin.Alu32M.src);
1238         addRegUsage_X86AMode(u, i->Xin.Alu32M.dst);
1239         return;
1240      case Xin_Sh32:
1241         addHRegUse(u, HRmModify, i->Xin.Sh32.dst);
1242         if (i->Xin.Sh32.src == 0)
1243            addHRegUse(u, HRmRead, hregX86_ECX());
1244         return;
1245      case Xin_Test32:
1246         addRegUsage_X86RM(u, i->Xin.Test32.dst, HRmRead);
1247         return;
1248      case Xin_Unary32:
1249         addHRegUse(u, HRmModify, i->Xin.Unary32.dst);
1250         return;
1251      case Xin_Lea32:
1252         addRegUsage_X86AMode(u, i->Xin.Lea32.am);
1253         addHRegUse(u, HRmWrite, i->Xin.Lea32.dst);
1254         return;
1255      case Xin_MulL:
1256         addRegUsage_X86RM(u, i->Xin.MulL.src, HRmRead);
1257         addHRegUse(u, HRmModify, hregX86_EAX());
1258         addHRegUse(u, HRmWrite, hregX86_EDX());
1259         return;
1260      case Xin_Div:
1261         addRegUsage_X86RM(u, i->Xin.Div.src, HRmRead);
1262         addHRegUse(u, HRmModify, hregX86_EAX());
1263         addHRegUse(u, HRmModify, hregX86_EDX());
1264         return;
1265      case Xin_Sh3232:
1266         addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
1267         addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
1268         if (i->Xin.Sh3232.amt == 0)
1269            addHRegUse(u, HRmRead, hregX86_ECX());
1270         return;
1271      case Xin_Push:
1272         addRegUsage_X86RMI(u, i->Xin.Push.src);
1273         addHRegUse(u, HRmModify, hregX86_ESP());
1274         return;
1275      case Xin_Call:
1276         /* This is a bit subtle. */
1277         /* First off, claim it trashes all the caller-saved regs
1278            which fall within the register allocator's jurisdiction.
1279            These I believe to be %eax %ecx %edx and all the xmm
1280            registers. */
1281         addHRegUse(u, HRmWrite, hregX86_EAX());
1282         addHRegUse(u, HRmWrite, hregX86_ECX());
1283         addHRegUse(u, HRmWrite, hregX86_EDX());
1284         addHRegUse(u, HRmWrite, hregX86_XMM0());
1285         addHRegUse(u, HRmWrite, hregX86_XMM1());
1286         addHRegUse(u, HRmWrite, hregX86_XMM2());
1287         addHRegUse(u, HRmWrite, hregX86_XMM3());
1288         addHRegUse(u, HRmWrite, hregX86_XMM4());
1289         addHRegUse(u, HRmWrite, hregX86_XMM5());
1290         addHRegUse(u, HRmWrite, hregX86_XMM6());
1291         addHRegUse(u, HRmWrite, hregX86_XMM7());
1292         /* Now we have to state any parameter-carrying registers
1293            which might be read.  This depends on the regparmness. */
1294         switch (i->Xin.Call.regparms) {
1295            case 3: addHRegUse(u, HRmRead, hregX86_ECX()); /*fallthru*/
1296            case 2: addHRegUse(u, HRmRead, hregX86_EDX()); /*fallthru*/
1297            case 1: addHRegUse(u, HRmRead, hregX86_EAX()); break;
1298            case 0: break;
1299            default: vpanic("getRegUsage_X86Instr:Call:regparms");
1300         }
1301         /* Finally, there is the issue that the insn trashes a
1302            register because the literal target address has to be
1303            loaded into a register.  Fortunately, for the 0/1/2
1304            regparm case, we can use EAX, EDX and ECX respectively, so
1305            this does not cause any further damage.  For the 3-regparm
1306            case, we'll have to choose another register arbitrarily --
1307            since A, D and C are used for parameters -- and so we might
1308            as well choose EDI. */
1309         if (i->Xin.Call.regparms == 3)
1310            addHRegUse(u, HRmWrite, hregX86_EDI());
1311         /* Upshot of this is that the assembler really must observe
1312            the here-stated convention of which register to use as an
1313            address temporary, depending on the regparmness: 0==EAX,
1314            1==EDX, 2==ECX, 3==EDI. */
1315         return;
1316      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
1317         conditionally exit the block.  Hence we only need to list (1)
1318         the registers that they read, and (2) the registers that they
1319         write in the case where the block is not exited.  (2) is
1320         empty, hence only (1) is relevant here. */
1321      case Xin_XDirect:
1322         addRegUsage_X86AMode(u, i->Xin.XDirect.amEIP);
1323         return;
1324      case Xin_XIndir:
1325         addHRegUse(u, HRmRead, i->Xin.XIndir.dstGA);
1326         addRegUsage_X86AMode(u, i->Xin.XIndir.amEIP);
1327         return;
1328      case Xin_XAssisted:
1329         addHRegUse(u, HRmRead, i->Xin.XAssisted.dstGA);
1330         addRegUsage_X86AMode(u, i->Xin.XAssisted.amEIP);
1331         return;
1332      case Xin_CMov32:
1333         addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
1334         addHRegUse(u, HRmModify, i->Xin.CMov32.dst);
1335         return;
1336      case Xin_LoadEX:
1337         addRegUsage_X86AMode(u, i->Xin.LoadEX.src);
1338         addHRegUse(u, HRmWrite, i->Xin.LoadEX.dst);
1339         return;
1340      case Xin_Store:
1341         addHRegUse(u, HRmRead, i->Xin.Store.src);
1342         addRegUsage_X86AMode(u, i->Xin.Store.dst);
1343         return;
1344      case Xin_Set32:
1345         addHRegUse(u, HRmWrite, i->Xin.Set32.dst);
1346         return;
1347      case Xin_Bsfr32:
1348         addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
1349         addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
1350         return;
1351      case Xin_MFence:
1352         return;
1353      case Xin_ACAS:
1354         addRegUsage_X86AMode(u, i->Xin.ACAS.addr);
1355         addHRegUse(u, HRmRead, hregX86_EBX());
1356         addHRegUse(u, HRmModify, hregX86_EAX());
1357         return;
1358      case Xin_DACAS:
1359         addRegUsage_X86AMode(u, i->Xin.DACAS.addr);
1360         addHRegUse(u, HRmRead, hregX86_ECX());
1361         addHRegUse(u, HRmRead, hregX86_EBX());
1362         addHRegUse(u, HRmModify, hregX86_EDX());
1363         addHRegUse(u, HRmModify, hregX86_EAX());
1364         return;
1365      case Xin_FpUnary:
1366         addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
1367         addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
1368         return;
1369      case Xin_FpBinary:
1370         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
1371         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
1372         addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
1373         return;
1374      case Xin_FpLdSt:
1375         addRegUsage_X86AMode(u, i->Xin.FpLdSt.addr);
1376         addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
1377                       i->Xin.FpLdSt.reg);
1378         return;
1379      case Xin_FpLdStI:
1380         addRegUsage_X86AMode(u, i->Xin.FpLdStI.addr);
1381         addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
1382                       i->Xin.FpLdStI.reg);
1383         return;
1384      case Xin_Fp64to32:
1385         addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
1386         addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
1387         return;
1388      case Xin_FpCMov:
1389         addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
1390         addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
1391         return;
1392      case Xin_FpLdCW:
1393         addRegUsage_X86AMode(u, i->Xin.FpLdCW.addr);
1394         return;
1395      case Xin_FpStSW_AX:
1396         addHRegUse(u, HRmWrite, hregX86_EAX());
1397         return;
1398      case Xin_FpCmp:
1399         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcL);
1400         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcR);
1401         addHRegUse(u, HRmWrite, i->Xin.FpCmp.dst);
1402         addHRegUse(u, HRmWrite, hregX86_EAX());
1403         return;
1404      case Xin_SseLdSt:
1405         addRegUsage_X86AMode(u, i->Xin.SseLdSt.addr);
1406         addHRegUse(u, i->Xin.SseLdSt.isLoad ? HRmWrite : HRmRead,
1407                       i->Xin.SseLdSt.reg);
1408         return;
1409      case Xin_SseLdzLO:
1410         addRegUsage_X86AMode(u, i->Xin.SseLdzLO.addr);
1411         addHRegUse(u, HRmWrite, i->Xin.SseLdzLO.reg);
1412         return;
1413      case Xin_SseConst:
1414         addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
1415         return;
1416      case Xin_Sse32Fx4:
1417         vassert(i->Xin.Sse32Fx4.op != Xsse_MOV);
1418         unary = toBool( i->Xin.Sse32Fx4.op == Xsse_RCPF
1419                         || i->Xin.Sse32Fx4.op == Xsse_RSQRTF
1420                         || i->Xin.Sse32Fx4.op == Xsse_SQRTF );
1421         addHRegUse(u, HRmRead, i->Xin.Sse32Fx4.src);
1422         addHRegUse(u, unary ? HRmWrite : HRmModify,
1423                       i->Xin.Sse32Fx4.dst);
1424         return;
1425      case Xin_Sse32FLo:
1426         vassert(i->Xin.Sse32FLo.op != Xsse_MOV);
1427         unary = toBool( i->Xin.Sse32FLo.op == Xsse_RCPF
1428                         || i->Xin.Sse32FLo.op == Xsse_RSQRTF
1429                         || i->Xin.Sse32FLo.op == Xsse_SQRTF );
1430         addHRegUse(u, HRmRead, i->Xin.Sse32FLo.src);
1431         addHRegUse(u, unary ? HRmWrite : HRmModify,
1432                       i->Xin.Sse32FLo.dst);
1433         return;
1434      case Xin_Sse64Fx2:
1435         vassert(i->Xin.Sse64Fx2.op != Xsse_MOV);
1436         unary = toBool( i->Xin.Sse64Fx2.op == Xsse_RCPF
1437                         || i->Xin.Sse64Fx2.op == Xsse_RSQRTF
1438                         || i->Xin.Sse64Fx2.op == Xsse_SQRTF );
1439         addHRegUse(u, HRmRead, i->Xin.Sse64Fx2.src);
1440         addHRegUse(u, unary ? HRmWrite : HRmModify,
1441                       i->Xin.Sse64Fx2.dst);
1442         return;
1443      case Xin_Sse64FLo:
1444         vassert(i->Xin.Sse64FLo.op != Xsse_MOV);
1445         unary = toBool( i->Xin.Sse64FLo.op == Xsse_RCPF
1446                         || i->Xin.Sse64FLo.op == Xsse_RSQRTF
1447                         || i->Xin.Sse64FLo.op == Xsse_SQRTF );
1448         addHRegUse(u, HRmRead, i->Xin.Sse64FLo.src);
1449         addHRegUse(u, unary ? HRmWrite : HRmModify,
1450                       i->Xin.Sse64FLo.dst);
1451         return;
1452      case Xin_SseReRg:
1453         if (i->Xin.SseReRg.op == Xsse_XOR
1454             && i->Xin.SseReRg.src == i->Xin.SseReRg.dst) {
1455            /* reg-alloc needs to understand 'xor r,r' as a write of r */
1456            /* (as opposed to a rite of passage :-) */
1457            addHRegUse(u, HRmWrite, i->Xin.SseReRg.dst);
1458         } else {
1459            addHRegUse(u, HRmRead, i->Xin.SseReRg.src);
1460            addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV
1461                             ? HRmWrite : HRmModify,
1462                          i->Xin.SseReRg.dst);
1463         }
1464         return;
1465      case Xin_SseCMov:
1466         addHRegUse(u, HRmRead,   i->Xin.SseCMov.src);
1467         addHRegUse(u, HRmModify, i->Xin.SseCMov.dst);
1468         return;
1469      case Xin_SseShuf:
1470         addHRegUse(u, HRmRead,  i->Xin.SseShuf.src);
1471         addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
1472         return;
1473      case Xin_EvCheck:
1474         /* We expect both amodes only to mention %ebp, so this is in
1475            fact pointless, since %ebp isn't allocatable, but anyway.. */
1476         addRegUsage_X86AMode(u, i->Xin.EvCheck.amCounter);
1477         addRegUsage_X86AMode(u, i->Xin.EvCheck.amFailAddr);
1478         return;
1479      case Xin_ProfInc:
1480         /* does not use any registers. */
1481         return;
1482      default:
1483         ppX86Instr(i, False);
1484         vpanic("getRegUsage_X86Instr");
1485   }
1486}
1487
1488/* local helper */
1489static void mapReg( HRegRemap* m, HReg* r )
1490{
1491   *r = lookupHRegRemap(m, *r);
1492}
1493
1494void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 )
1495{
1496   vassert(mode64 == False);
1497   switch (i->tag) {
1498      case Xin_Alu32R:
1499         mapRegs_X86RMI(m, i->Xin.Alu32R.src);
1500         mapReg(m, &i->Xin.Alu32R.dst);
1501         return;
1502      case Xin_Alu32M:
1503         mapRegs_X86RI(m, i->Xin.Alu32M.src);
1504         mapRegs_X86AMode(m, i->Xin.Alu32M.dst);
1505         return;
1506      case Xin_Sh32:
1507         mapReg(m, &i->Xin.Sh32.dst);
1508         return;
1509      case Xin_Test32:
1510         mapRegs_X86RM(m, i->Xin.Test32.dst);
1511         return;
1512      case Xin_Unary32:
1513         mapReg(m, &i->Xin.Unary32.dst);
1514         return;
1515      case Xin_Lea32:
1516         mapRegs_X86AMode(m, i->Xin.Lea32.am);
1517         mapReg(m, &i->Xin.Lea32.dst);
1518         return;
1519      case Xin_MulL:
1520         mapRegs_X86RM(m, i->Xin.MulL.src);
1521         return;
1522      case Xin_Div:
1523         mapRegs_X86RM(m, i->Xin.Div.src);
1524         return;
1525      case Xin_Sh3232:
1526         mapReg(m, &i->Xin.Sh3232.src);
1527         mapReg(m, &i->Xin.Sh3232.dst);
1528         return;
1529      case Xin_Push:
1530         mapRegs_X86RMI(m, i->Xin.Push.src);
1531         return;
1532      case Xin_Call:
1533         return;
1534      case Xin_XDirect:
1535         mapRegs_X86AMode(m, i->Xin.XDirect.amEIP);
1536         return;
1537      case Xin_XIndir:
1538         mapReg(m, &i->Xin.XIndir.dstGA);
1539         mapRegs_X86AMode(m, i->Xin.XIndir.amEIP);
1540         return;
1541      case Xin_XAssisted:
1542         mapReg(m, &i->Xin.XAssisted.dstGA);
1543         mapRegs_X86AMode(m, i->Xin.XAssisted.amEIP);
1544         return;
1545      case Xin_CMov32:
1546         mapRegs_X86RM(m, i->Xin.CMov32.src);
1547         mapReg(m, &i->Xin.CMov32.dst);
1548         return;
1549      case Xin_LoadEX:
1550         mapRegs_X86AMode(m, i->Xin.LoadEX.src);
1551         mapReg(m, &i->Xin.LoadEX.dst);
1552         return;
1553      case Xin_Store:
1554         mapReg(m, &i->Xin.Store.src);
1555         mapRegs_X86AMode(m, i->Xin.Store.dst);
1556         return;
1557      case Xin_Set32:
1558         mapReg(m, &i->Xin.Set32.dst);
1559         return;
1560      case Xin_Bsfr32:
1561         mapReg(m, &i->Xin.Bsfr32.src);
1562         mapReg(m, &i->Xin.Bsfr32.dst);
1563         return;
1564      case Xin_MFence:
1565         return;
1566      case Xin_ACAS:
1567         mapRegs_X86AMode(m, i->Xin.ACAS.addr);
1568         return;
1569      case Xin_DACAS:
1570         mapRegs_X86AMode(m, i->Xin.DACAS.addr);
1571         return;
1572      case Xin_FpUnary:
1573         mapReg(m, &i->Xin.FpUnary.src);
1574         mapReg(m, &i->Xin.FpUnary.dst);
1575         return;
1576      case Xin_FpBinary:
1577         mapReg(m, &i->Xin.FpBinary.srcL);
1578         mapReg(m, &i->Xin.FpBinary.srcR);
1579         mapReg(m, &i->Xin.FpBinary.dst);
1580         return;
1581      case Xin_FpLdSt:
1582         mapRegs_X86AMode(m, i->Xin.FpLdSt.addr);
1583         mapReg(m, &i->Xin.FpLdSt.reg);
1584         return;
1585      case Xin_FpLdStI:
1586         mapRegs_X86AMode(m, i->Xin.FpLdStI.addr);
1587         mapReg(m, &i->Xin.FpLdStI.reg);
1588         return;
1589      case Xin_Fp64to32:
1590         mapReg(m, &i->Xin.Fp64to32.src);
1591         mapReg(m, &i->Xin.Fp64to32.dst);
1592         return;
1593      case Xin_FpCMov:
1594         mapReg(m, &i->Xin.FpCMov.src);
1595         mapReg(m, &i->Xin.FpCMov.dst);
1596         return;
1597      case Xin_FpLdCW:
1598         mapRegs_X86AMode(m, i->Xin.FpLdCW.addr);
1599         return;
1600      case Xin_FpStSW_AX:
1601         return;
1602      case Xin_FpCmp:
1603         mapReg(m, &i->Xin.FpCmp.srcL);
1604         mapReg(m, &i->Xin.FpCmp.srcR);
1605         mapReg(m, &i->Xin.FpCmp.dst);
1606         return;
1607      case Xin_SseConst:
1608         mapReg(m, &i->Xin.SseConst.dst);
1609         return;
1610      case Xin_SseLdSt:
1611         mapReg(m, &i->Xin.SseLdSt.reg);
1612         mapRegs_X86AMode(m, i->Xin.SseLdSt.addr);
1613         break;
1614      case Xin_SseLdzLO:
1615         mapReg(m, &i->Xin.SseLdzLO.reg);
1616         mapRegs_X86AMode(m, i->Xin.SseLdzLO.addr);
1617         break;
1618      case Xin_Sse32Fx4:
1619         mapReg(m, &i->Xin.Sse32Fx4.src);
1620         mapReg(m, &i->Xin.Sse32Fx4.dst);
1621         return;
1622      case Xin_Sse32FLo:
1623         mapReg(m, &i->Xin.Sse32FLo.src);
1624         mapReg(m, &i->Xin.Sse32FLo.dst);
1625         return;
1626      case Xin_Sse64Fx2:
1627         mapReg(m, &i->Xin.Sse64Fx2.src);
1628         mapReg(m, &i->Xin.Sse64Fx2.dst);
1629         return;
1630      case Xin_Sse64FLo:
1631         mapReg(m, &i->Xin.Sse64FLo.src);
1632         mapReg(m, &i->Xin.Sse64FLo.dst);
1633         return;
1634      case Xin_SseReRg:
1635         mapReg(m, &i->Xin.SseReRg.src);
1636         mapReg(m, &i->Xin.SseReRg.dst);
1637         return;
1638      case Xin_SseCMov:
1639         mapReg(m, &i->Xin.SseCMov.src);
1640         mapReg(m, &i->Xin.SseCMov.dst);
1641         return;
1642      case Xin_SseShuf:
1643         mapReg(m, &i->Xin.SseShuf.src);
1644         mapReg(m, &i->Xin.SseShuf.dst);
1645         return;
1646      case Xin_EvCheck:
1647         /* We expect both amodes only to mention %ebp, so this is in
1648            fact pointless, since %ebp isn't allocatable, but anyway.. */
1649         mapRegs_X86AMode(m, i->Xin.EvCheck.amCounter);
1650         mapRegs_X86AMode(m, i->Xin.EvCheck.amFailAddr);
1651         return;
1652      case Xin_ProfInc:
1653         /* does not use any registers. */
1654         return;
1655
1656      default:
1657         ppX86Instr(i, mode64);
1658         vpanic("mapRegs_X86Instr");
1659   }
1660}
1661
1662/* Figure out if i represents a reg-reg move, and if so assign the
1663   source and destination to *src and *dst.  If in doubt say No.  Used
1664   by the register allocator to do move coalescing.
1665*/
1666Bool isMove_X86Instr ( X86Instr* i, HReg* src, HReg* dst )
1667{
1668   /* Moves between integer regs */
1669   if (i->tag == Xin_Alu32R) {
1670      if (i->Xin.Alu32R.op != Xalu_MOV)
1671         return False;
1672      if (i->Xin.Alu32R.src->tag != Xrmi_Reg)
1673         return False;
1674      *src = i->Xin.Alu32R.src->Xrmi.Reg.reg;
1675      *dst = i->Xin.Alu32R.dst;
1676      return True;
1677   }
1678   /* Moves between FP regs */
1679   if (i->tag == Xin_FpUnary) {
1680      if (i->Xin.FpUnary.op != Xfp_MOV)
1681         return False;
1682      *src = i->Xin.FpUnary.src;
1683      *dst = i->Xin.FpUnary.dst;
1684      return True;
1685   }
1686   if (i->tag == Xin_SseReRg) {
1687      if (i->Xin.SseReRg.op != Xsse_MOV)
1688         return False;
1689      *src = i->Xin.SseReRg.src;
1690      *dst = i->Xin.SseReRg.dst;
1691      return True;
1692   }
1693   return False;
1694}
1695
1696
1697/* Generate x86 spill/reload instructions under the direction of the
1698   register allocator.  Note it's critical these don't write the
1699   condition codes. */
1700
1701void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1702                    HReg rreg, Int offsetB, Bool mode64 )
1703{
1704   X86AMode* am;
1705   vassert(offsetB >= 0);
1706   vassert(!hregIsVirtual(rreg));
1707   vassert(mode64 == False);
1708   *i1 = *i2 = NULL;
1709   am = X86AMode_IR(offsetB, hregX86_EBP());
1710   switch (hregClass(rreg)) {
1711      case HRcInt32:
1712         *i1 = X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
1713         return;
1714      case HRcFlt64:
1715         *i1 = X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
1716         return;
1717      case HRcVec128:
1718         *i1 = X86Instr_SseLdSt ( False/*store*/, rreg, am );
1719         return;
1720      default:
1721         ppHRegClass(hregClass(rreg));
1722         vpanic("genSpill_X86: unimplemented regclass");
1723   }
1724}
1725
1726void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1727                     HReg rreg, Int offsetB, Bool mode64 )
1728{
1729   X86AMode* am;
1730   vassert(offsetB >= 0);
1731   vassert(!hregIsVirtual(rreg));
1732   vassert(mode64 == False);
1733   *i1 = *i2 = NULL;
1734   am = X86AMode_IR(offsetB, hregX86_EBP());
1735   switch (hregClass(rreg)) {
1736      case HRcInt32:
1737         *i1 = X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
1738         return;
1739      case HRcFlt64:
1740         *i1 = X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
1741         return;
1742      case HRcVec128:
1743         *i1 = X86Instr_SseLdSt ( True/*load*/, rreg, am );
1744         return;
1745      default:
1746         ppHRegClass(hregClass(rreg));
1747         vpanic("genReload_X86: unimplemented regclass");
1748   }
1749}
1750
1751/* The given instruction reads the specified vreg exactly once, and
1752   that vreg is currently located at the given spill offset.  If
1753   possible, return a variant of the instruction to one which instead
1754   references the spill slot directly. */
1755
1756X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
1757{
1758   vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
1759
1760   /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
1761      Convert to: src=RMI_Mem, dst=Reg
1762   */
1763   if (i->tag == Xin_Alu32R
1764       && (i->Xin.Alu32R.op == Xalu_MOV || i->Xin.Alu32R.op == Xalu_OR
1765           || i->Xin.Alu32R.op == Xalu_XOR)
1766       && i->Xin.Alu32R.src->tag == Xrmi_Reg
1767       && i->Xin.Alu32R.src->Xrmi.Reg.reg == vreg) {
1768      vassert(i->Xin.Alu32R.dst != vreg);
1769      return X86Instr_Alu32R(
1770                i->Xin.Alu32R.op,
1771                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP())),
1772                i->Xin.Alu32R.dst
1773             );
1774   }
1775
1776   /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
1777      Convert to: src=RI_Imm, dst=Mem
1778   */
1779   if (i->tag == Xin_Alu32R
1780       && (i->Xin.Alu32R.op == Xalu_CMP)
1781       && i->Xin.Alu32R.src->tag == Xrmi_Imm
1782       && i->Xin.Alu32R.dst == vreg) {
1783      return X86Instr_Alu32M(
1784                i->Xin.Alu32R.op,
1785		X86RI_Imm( i->Xin.Alu32R.src->Xrmi.Imm.imm32 ),
1786                X86AMode_IR( spill_off, hregX86_EBP())
1787             );
1788   }
1789
1790   /* Deal with form: Push(RMI_Reg)
1791      Convert to: Push(RMI_Mem)
1792   */
1793   if (i->tag == Xin_Push
1794       && i->Xin.Push.src->tag == Xrmi_Reg
1795       && i->Xin.Push.src->Xrmi.Reg.reg == vreg) {
1796      return X86Instr_Push(
1797                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP()))
1798             );
1799   }
1800
1801   /* Deal with form: CMov32(src=RM_Reg, dst) where vreg == src
1802      Convert to CMov32(RM_Mem, dst) */
1803   if (i->tag == Xin_CMov32
1804       && i->Xin.CMov32.src->tag == Xrm_Reg
1805       && i->Xin.CMov32.src->Xrm.Reg.reg == vreg) {
1806      vassert(i->Xin.CMov32.dst != vreg);
1807      return X86Instr_CMov32(
1808                i->Xin.CMov32.cond,
1809                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() )),
1810                i->Xin.CMov32.dst
1811             );
1812   }
1813
1814   /* Deal with form: Test32(imm,RM_Reg vreg) -> Test32(imm,amode) */
1815   if (i->tag == Xin_Test32
1816       && i->Xin.Test32.dst->tag == Xrm_Reg
1817       && i->Xin.Test32.dst->Xrm.Reg.reg == vreg) {
1818      return X86Instr_Test32(
1819                i->Xin.Test32.imm32,
1820                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() ) )
1821             );
1822   }
1823
1824   return NULL;
1825}
1826
1827
1828/* --------- The x86 assembler (bleh.) --------- */
1829
1830static UChar iregNo ( HReg r )
1831{
1832   UInt n;
1833   vassert(hregClass(r) == HRcInt32);
1834   vassert(!hregIsVirtual(r));
1835   n = hregNumber(r);
1836   vassert(n <= 7);
1837   return toUChar(n);
1838}
1839
1840static UInt fregNo ( HReg r )
1841{
1842   UInt n;
1843   vassert(hregClass(r) == HRcFlt64);
1844   vassert(!hregIsVirtual(r));
1845   n = hregNumber(r);
1846   vassert(n <= 5);
1847   return n;
1848}
1849
1850static UInt vregNo ( HReg r )
1851{
1852   UInt n;
1853   vassert(hregClass(r) == HRcVec128);
1854   vassert(!hregIsVirtual(r));
1855   n = hregNumber(r);
1856   vassert(n <= 7);
1857   return n;
1858}
1859
1860static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
1861{
1862   return toUChar( ((mod & 3) << 6)
1863                   | ((reg & 7) << 3)
1864                   | (regmem & 7) );
1865}
1866
1867static UChar mkSIB ( Int shift, Int regindex, Int regbase )
1868{
1869   return toUChar( ((shift & 3) << 6)
1870                   | ((regindex & 7) << 3)
1871                   | (regbase & 7) );
1872}
1873
1874static UChar* emit32 ( UChar* p, UInt w32 )
1875{
1876   *p++ = toUChar( w32        & 0x000000FF);
1877   *p++ = toUChar((w32 >>  8) & 0x000000FF);
1878   *p++ = toUChar((w32 >> 16) & 0x000000FF);
1879   *p++ = toUChar((w32 >> 24) & 0x000000FF);
1880   return p;
1881}
1882
1883/* Does a sign-extend of the lowest 8 bits give
1884   the original number? */
1885static Bool fits8bits ( UInt w32 )
1886{
1887   Int i32 = (Int)w32;
1888   return toBool(i32 == ((i32 << 24) >> 24));
1889}
1890
1891
1892/* Forming mod-reg-rm bytes and scale-index-base bytes.
1893
1894     greg,  0(ereg)    |  ereg != ESP && ereg != EBP
1895                       =  00 greg ereg
1896
1897     greg,  d8(ereg)   |  ereg != ESP
1898                       =  01 greg ereg, d8
1899
1900     greg,  d32(ereg)  |  ereg != ESP
1901                       =  10 greg ereg, d32
1902
1903     greg,  d8(%esp)   =  01 greg 100, 0x24, d8
1904
1905     -----------------------------------------------
1906
1907     greg,  d8(base,index,scale)
1908               |  index != ESP
1909               =  01 greg 100, scale index base, d8
1910
1911     greg,  d32(base,index,scale)
1912               |  index != ESP
1913               =  10 greg 100, scale index base, d32
1914*/
1915static UChar* doAMode_M ( UChar* p, HReg greg, X86AMode* am )
1916{
1917   if (am->tag == Xam_IR) {
1918      if (am->Xam.IR.imm == 0
1919          && am->Xam.IR.reg != hregX86_ESP()
1920          && am->Xam.IR.reg != hregX86_EBP() ) {
1921         *p++ = mkModRegRM(0, iregNo(greg), iregNo(am->Xam.IR.reg));
1922         return p;
1923      }
1924      if (fits8bits(am->Xam.IR.imm)
1925          && am->Xam.IR.reg != hregX86_ESP()) {
1926         *p++ = mkModRegRM(1, iregNo(greg), iregNo(am->Xam.IR.reg));
1927         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1928         return p;
1929      }
1930      if (am->Xam.IR.reg != hregX86_ESP()) {
1931         *p++ = mkModRegRM(2, iregNo(greg), iregNo(am->Xam.IR.reg));
1932         p = emit32(p, am->Xam.IR.imm);
1933         return p;
1934      }
1935      if (am->Xam.IR.reg == hregX86_ESP()
1936          && fits8bits(am->Xam.IR.imm)) {
1937 	 *p++ = mkModRegRM(1, iregNo(greg), 4);
1938         *p++ = 0x24;
1939         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1940         return p;
1941      }
1942      ppX86AMode(am);
1943      vpanic("doAMode_M: can't emit amode IR");
1944      /*NOTREACHED*/
1945   }
1946   if (am->tag == Xam_IRRS) {
1947      if (fits8bits(am->Xam.IRRS.imm)
1948          && am->Xam.IRRS.index != hregX86_ESP()) {
1949         *p++ = mkModRegRM(1, iregNo(greg), 4);
1950         *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
1951                                          am->Xam.IRRS.base);
1952         *p++ = toUChar(am->Xam.IRRS.imm & 0xFF);
1953         return p;
1954      }
1955      if (am->Xam.IRRS.index != hregX86_ESP()) {
1956         *p++ = mkModRegRM(2, iregNo(greg), 4);
1957         *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
1958                                          am->Xam.IRRS.base);
1959         p = emit32(p, am->Xam.IRRS.imm);
1960         return p;
1961      }
1962      ppX86AMode(am);
1963      vpanic("doAMode_M: can't emit amode IRRS");
1964      /*NOTREACHED*/
1965   }
1966   vpanic("doAMode_M: unknown amode");
1967   /*NOTREACHED*/
1968}
1969
1970
1971/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
1972static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
1973{
1974   *p++ = mkModRegRM(3, iregNo(greg), iregNo(ereg));
1975   return p;
1976}
1977
1978
1979/* Emit ffree %st(7) */
1980static UChar* do_ffree_st7 ( UChar* p )
1981{
1982   *p++ = 0xDD;
1983   *p++ = 0xC7;
1984   return p;
1985}
1986
1987/* Emit fstp %st(i), 1 <= i <= 7 */
1988static UChar* do_fstp_st ( UChar* p, Int i )
1989{
1990   vassert(1 <= i && i <= 7);
1991   *p++ = 0xDD;
1992   *p++ = toUChar(0xD8+i);
1993   return p;
1994}
1995
1996/* Emit fld %st(i), 0 <= i <= 6 */
1997static UChar* do_fld_st ( UChar* p, Int i )
1998{
1999   vassert(0 <= i && i <= 6);
2000   *p++ = 0xD9;
2001   *p++ = toUChar(0xC0+i);
2002   return p;
2003}
2004
2005/* Emit f<op> %st(0) */
2006static UChar* do_fop1_st ( UChar* p, X86FpOp op )
2007{
2008   switch (op) {
2009      case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
2010      case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
2011      case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
2012      case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
2013      case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
2014      case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
2015      case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
2016      case Xfp_MOV:    break;
2017      case Xfp_TAN:    p = do_ffree_st7(p); /* since fptan pushes 1.0 */
2018                       *p++ = 0xD9; *p++ = 0xF2; /* fptan */
2019                       *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
2020                       break;
2021      default: vpanic("do_fop1_st: unknown op");
2022   }
2023   return p;
2024}
2025
2026/* Emit f<op> %st(i), 1 <= i <= 5 */
2027static UChar* do_fop2_st ( UChar* p, X86FpOp op, Int i )
2028{
2029#  define fake(_n) mkHReg((_n), HRcInt32, False)
2030   Int subopc;
2031   switch (op) {
2032      case Xfp_ADD: subopc = 0; break;
2033      case Xfp_SUB: subopc = 4; break;
2034      case Xfp_MUL: subopc = 1; break;
2035      case Xfp_DIV: subopc = 6; break;
2036      default: vpanic("do_fop2_st: unknown op");
2037   }
2038   *p++ = 0xD8;
2039   p    = doAMode_R(p, fake(subopc), fake(i));
2040   return p;
2041#  undef fake
2042}
2043
2044/* Push a 32-bit word on the stack.  The word depends on tags[3:0];
2045each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
2046*/
2047static UChar* push_word_from_tags ( UChar* p, UShort tags )
2048{
2049   UInt w;
2050   vassert(0 == (tags & ~0xF));
2051   if (tags == 0) {
2052      /* pushl $0x00000000 */
2053      *p++ = 0x6A;
2054      *p++ = 0x00;
2055   }
2056   else
2057   /* pushl $0xFFFFFFFF */
2058   if (tags == 0xF) {
2059      *p++ = 0x6A;
2060      *p++ = 0xFF;
2061   } else {
2062      vassert(0); /* awaiting test case */
2063      w = 0;
2064      if (tags & 1) w |= 0x000000FF;
2065      if (tags & 2) w |= 0x0000FF00;
2066      if (tags & 4) w |= 0x00FF0000;
2067      if (tags & 8) w |= 0xFF000000;
2068      *p++ = 0x68;
2069      p = emit32(p, w);
2070   }
2071   return p;
2072}
2073
2074/* Emit an instruction into buf and return the number of bytes used.
2075   Note that buf is not the insn's final place, and therefore it is
2076   imperative to emit position-independent code.  If the emitted
2077   instruction was a profiler inc, set *is_profInc to True, else
2078   leave it unchanged. */
2079
2080Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc,
2081                    UChar* buf, Int nbuf, X86Instr* i,
2082                    Bool mode64,
2083                    void* disp_cp_chain_me_to_slowEP,
2084                    void* disp_cp_chain_me_to_fastEP,
2085                    void* disp_cp_xindir,
2086                    void* disp_cp_xassisted )
2087{
2088   UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2089
2090   UInt   xtra;
2091   UChar* p = &buf[0];
2092   UChar* ptmp;
2093   vassert(nbuf >= 32);
2094   vassert(mode64 == False);
2095
2096   /* Wrap an integer as a int register, for use assembling
2097      GrpN insns, in which the greg field is used as a sub-opcode
2098      and does not really contain a register. */
2099#  define fake(_n) mkHReg((_n), HRcInt32, False)
2100
2101   /* vex_printf("asm  ");ppX86Instr(i, mode64); vex_printf("\n"); */
2102
2103   switch (i->tag) {
2104
2105   case Xin_Alu32R:
2106      /* Deal specially with MOV */
2107      if (i->Xin.Alu32R.op == Xalu_MOV) {
2108         switch (i->Xin.Alu32R.src->tag) {
2109            case Xrmi_Imm:
2110               *p++ = toUChar(0xB8 + iregNo(i->Xin.Alu32R.dst));
2111               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2112               goto done;
2113            case Xrmi_Reg:
2114               *p++ = 0x89;
2115               p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2116                                i->Xin.Alu32R.dst);
2117               goto done;
2118            case Xrmi_Mem:
2119               *p++ = 0x8B;
2120               p = doAMode_M(p, i->Xin.Alu32R.dst,
2121                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2122               goto done;
2123            default:
2124               goto bad;
2125         }
2126      }
2127      /* MUL */
2128      if (i->Xin.Alu32R.op == Xalu_MUL) {
2129         switch (i->Xin.Alu32R.src->tag) {
2130            case Xrmi_Reg:
2131               *p++ = 0x0F;
2132               *p++ = 0xAF;
2133               p = doAMode_R(p, i->Xin.Alu32R.dst,
2134                                i->Xin.Alu32R.src->Xrmi.Reg.reg);
2135               goto done;
2136            case Xrmi_Mem:
2137               *p++ = 0x0F;
2138               *p++ = 0xAF;
2139               p = doAMode_M(p, i->Xin.Alu32R.dst,
2140                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2141               goto done;
2142            case Xrmi_Imm:
2143               if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2144                  *p++ = 0x6B;
2145                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2146                  *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2147               } else {
2148                  *p++ = 0x69;
2149                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2150                  p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2151               }
2152               goto done;
2153            default:
2154               goto bad;
2155         }
2156      }
2157      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2158      opc = opc_rr = subopc_imm = opc_imma = 0;
2159      switch (i->Xin.Alu32R.op) {
2160         case Xalu_ADC: opc = 0x13; opc_rr = 0x11;
2161                        subopc_imm = 2; opc_imma = 0x15; break;
2162         case Xalu_ADD: opc = 0x03; opc_rr = 0x01;
2163                        subopc_imm = 0; opc_imma = 0x05; break;
2164         case Xalu_SUB: opc = 0x2B; opc_rr = 0x29;
2165                        subopc_imm = 5; opc_imma = 0x2D; break;
2166         case Xalu_SBB: opc = 0x1B; opc_rr = 0x19;
2167                        subopc_imm = 3; opc_imma = 0x1D; break;
2168         case Xalu_AND: opc = 0x23; opc_rr = 0x21;
2169                        subopc_imm = 4; opc_imma = 0x25; break;
2170         case Xalu_XOR: opc = 0x33; opc_rr = 0x31;
2171                        subopc_imm = 6; opc_imma = 0x35; break;
2172         case Xalu_OR:  opc = 0x0B; opc_rr = 0x09;
2173                        subopc_imm = 1; opc_imma = 0x0D; break;
2174         case Xalu_CMP: opc = 0x3B; opc_rr = 0x39;
2175                        subopc_imm = 7; opc_imma = 0x3D; break;
2176         default: goto bad;
2177      }
2178      switch (i->Xin.Alu32R.src->tag) {
2179         case Xrmi_Imm:
2180            if (i->Xin.Alu32R.dst == hregX86_EAX()
2181                && !fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2182               *p++ = toUChar(opc_imma);
2183               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2184            } else
2185            if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2186               *p++ = 0x83;
2187               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2188               *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2189            } else {
2190               *p++ = 0x81;
2191               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2192               p    = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2193            }
2194            goto done;
2195         case Xrmi_Reg:
2196            *p++ = toUChar(opc_rr);
2197            p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2198                             i->Xin.Alu32R.dst);
2199            goto done;
2200         case Xrmi_Mem:
2201            *p++ = toUChar(opc);
2202            p = doAMode_M(p, i->Xin.Alu32R.dst,
2203                             i->Xin.Alu32R.src->Xrmi.Mem.am);
2204            goto done;
2205         default:
2206            goto bad;
2207      }
2208      break;
2209
2210   case Xin_Alu32M:
2211      /* Deal specially with MOV */
2212      if (i->Xin.Alu32M.op == Xalu_MOV) {
2213         switch (i->Xin.Alu32M.src->tag) {
2214            case Xri_Reg:
2215               *p++ = 0x89;
2216               p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2217                                i->Xin.Alu32M.dst);
2218               goto done;
2219            case Xri_Imm:
2220               *p++ = 0xC7;
2221               p = doAMode_M(p, fake(0), i->Xin.Alu32M.dst);
2222               p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2223               goto done;
2224            default:
2225               goto bad;
2226         }
2227      }
2228      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
2229         allowed here. */
2230      opc = subopc_imm = opc_imma = 0;
2231      switch (i->Xin.Alu32M.op) {
2232         case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
2233         case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
2234         case Xalu_CMP: opc = 0x39; subopc_imm = 7; break;
2235         default: goto bad;
2236      }
2237      switch (i->Xin.Alu32M.src->tag) {
2238         case Xri_Reg:
2239            *p++ = toUChar(opc);
2240            p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2241                             i->Xin.Alu32M.dst);
2242            goto done;
2243         case Xri_Imm:
2244            if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
2245               *p++ = 0x83;
2246               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2247               *p++ = toUChar(0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32);
2248               goto done;
2249            } else {
2250               *p++ = 0x81;
2251               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2252               p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2253               goto done;
2254            }
2255         default:
2256            goto bad;
2257      }
2258      break;
2259
2260   case Xin_Sh32:
2261      opc_cl = opc_imm = subopc = 0;
2262      switch (i->Xin.Sh32.op) {
2263         case Xsh_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2264         case Xsh_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2265         case Xsh_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2266         default: goto bad;
2267      }
2268      if (i->Xin.Sh32.src == 0) {
2269         *p++ = toUChar(opc_cl);
2270         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2271      } else {
2272         *p++ = toUChar(opc_imm);
2273         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2274         *p++ = (UChar)(i->Xin.Sh32.src);
2275      }
2276      goto done;
2277
2278   case Xin_Test32:
2279      if (i->Xin.Test32.dst->tag == Xrm_Reg) {
2280         /* testl $imm32, %reg */
2281         *p++ = 0xF7;
2282         p = doAMode_R(p, fake(0), i->Xin.Test32.dst->Xrm.Reg.reg);
2283         p = emit32(p, i->Xin.Test32.imm32);
2284         goto done;
2285      } else {
2286         /* testl $imm32, amode */
2287         *p++ = 0xF7;
2288         p = doAMode_M(p, fake(0), i->Xin.Test32.dst->Xrm.Mem.am);
2289         p = emit32(p, i->Xin.Test32.imm32);
2290         goto done;
2291      }
2292
2293   case Xin_Unary32:
2294      if (i->Xin.Unary32.op == Xun_NOT) {
2295         *p++ = 0xF7;
2296         p = doAMode_R(p, fake(2), i->Xin.Unary32.dst);
2297         goto done;
2298      }
2299      if (i->Xin.Unary32.op == Xun_NEG) {
2300         *p++ = 0xF7;
2301         p = doAMode_R(p, fake(3), i->Xin.Unary32.dst);
2302         goto done;
2303      }
2304      break;
2305
2306   case Xin_Lea32:
2307      *p++ = 0x8D;
2308      p = doAMode_M(p, i->Xin.Lea32.dst, i->Xin.Lea32.am);
2309      goto done;
2310
2311   case Xin_MulL:
2312      subopc = i->Xin.MulL.syned ? 5 : 4;
2313      *p++ = 0xF7;
2314      switch (i->Xin.MulL.src->tag)  {
2315         case Xrm_Mem:
2316            p = doAMode_M(p, fake(subopc),
2317                             i->Xin.MulL.src->Xrm.Mem.am);
2318            goto done;
2319         case Xrm_Reg:
2320            p = doAMode_R(p, fake(subopc),
2321                             i->Xin.MulL.src->Xrm.Reg.reg);
2322            goto done;
2323         default:
2324            goto bad;
2325      }
2326      break;
2327
2328   case Xin_Div:
2329      subopc = i->Xin.Div.syned ? 7 : 6;
2330      *p++ = 0xF7;
2331      switch (i->Xin.Div.src->tag)  {
2332         case Xrm_Mem:
2333            p = doAMode_M(p, fake(subopc),
2334                             i->Xin.Div.src->Xrm.Mem.am);
2335            goto done;
2336         case Xrm_Reg:
2337            p = doAMode_R(p, fake(subopc),
2338                             i->Xin.Div.src->Xrm.Reg.reg);
2339            goto done;
2340         default:
2341            goto bad;
2342      }
2343      break;
2344
2345   case Xin_Sh3232:
2346      vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
2347      if (i->Xin.Sh3232.amt == 0) {
2348         /* shldl/shrdl by %cl */
2349         *p++ = 0x0F;
2350         if (i->Xin.Sh3232.op == Xsh_SHL) {
2351            *p++ = 0xA5;
2352         } else {
2353            *p++ = 0xAD;
2354         }
2355         p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
2356         goto done;
2357      }
2358      break;
2359
2360   case Xin_Push:
2361      switch (i->Xin.Push.src->tag) {
2362         case Xrmi_Mem:
2363            *p++ = 0xFF;
2364            p = doAMode_M(p, fake(6), i->Xin.Push.src->Xrmi.Mem.am);
2365            goto done;
2366         case Xrmi_Imm:
2367            *p++ = 0x68;
2368            p = emit32(p, i->Xin.Push.src->Xrmi.Imm.imm32);
2369            goto done;
2370         case Xrmi_Reg:
2371            *p++ = toUChar(0x50 + iregNo(i->Xin.Push.src->Xrmi.Reg.reg));
2372            goto done;
2373        default:
2374            goto bad;
2375      }
2376
2377   case Xin_Call:
2378      /* See detailed comment for Xin_Call in getRegUsage_X86Instr above
2379         for explanation of this. */
2380      switch (i->Xin.Call.regparms) {
2381         case 0: irno = iregNo(hregX86_EAX()); break;
2382         case 1: irno = iregNo(hregX86_EDX()); break;
2383         case 2: irno = iregNo(hregX86_ECX()); break;
2384         case 3: irno = iregNo(hregX86_EDI()); break;
2385         default: vpanic(" emit_X86Instr:call:regparms");
2386      }
2387      /* jump over the following two insns if the condition does not
2388         hold */
2389      if (i->Xin.Call.cond != Xcc_ALWAYS) {
2390         *p++ = toUChar(0x70 + (0xF & (i->Xin.Call.cond ^ 1)));
2391         *p++ = 0x07; /* 7 bytes in the next two insns */
2392      }
2393      /* movl $target, %tmp */
2394      *p++ = toUChar(0xB8 + irno);
2395      p = emit32(p, i->Xin.Call.target);
2396      /* call *%tmp */
2397      *p++ = 0xFF;
2398      *p++ = toUChar(0xD0 + irno);
2399      goto done;
2400
2401   case Xin_XDirect: {
2402      /* NB: what goes on here has to be very closely coordinated with the
2403         chainXDirect_X86 and unchainXDirect_X86 below. */
2404      /* We're generating chain-me requests here, so we need to be
2405         sure this is actually allowed -- no-redir translations can't
2406         use chain-me's.  Hence: */
2407      vassert(disp_cp_chain_me_to_slowEP != NULL);
2408      vassert(disp_cp_chain_me_to_fastEP != NULL);
2409
2410      /* Use ptmp for backpatching conditional jumps. */
2411      ptmp = NULL;
2412
2413      /* First off, if this is conditional, create a conditional
2414         jump over the rest of it. */
2415      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
2416         /* jmp fwds if !condition */
2417         *p++ = toUChar(0x70 + (0xF & (i->Xin.XDirect.cond ^ 1)));
2418         ptmp = p; /* fill in this bit later */
2419         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2420      }
2421
2422      /* Update the guest EIP. */
2423      /* movl $dstGA, amEIP */
2424      *p++ = 0xC7;
2425      p    = doAMode_M(p, fake(0), i->Xin.XDirect.amEIP);
2426      p    = emit32(p, i->Xin.XDirect.dstGA);
2427
2428      /* --- FIRST PATCHABLE BYTE follows --- */
2429      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
2430         to) backs up the return address, so as to find the address of
2431         the first patchable byte.  So: don't change the length of the
2432         two instructions below. */
2433      /* movl $disp_cp_chain_me_to_{slow,fast}EP,%edx; */
2434      *p++ = 0xBA;
2435      void* disp_cp_chain_me
2436               = i->Xin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
2437                                         : disp_cp_chain_me_to_slowEP;
2438      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_chain_me));
2439      /* call *%edx */
2440      *p++ = 0xFF;
2441      *p++ = 0xD2;
2442      /* --- END of PATCHABLE BYTES --- */
2443
2444      /* Fix up the conditional jump, if there was one. */
2445      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
2446         Int delta = p - ptmp;
2447         vassert(delta > 0 && delta < 40);
2448         *ptmp = toUChar(delta-1);
2449      }
2450      goto done;
2451   }
2452
2453   case Xin_XIndir: {
2454      /* We're generating transfers that could lead indirectly to a
2455         chain-me, so we need to be sure this is actually allowed --
2456         no-redir translations are not allowed to reach normal
2457         translations without going through the scheduler.  That means
2458         no XDirects or XIndirs out from no-redir translations.
2459         Hence: */
2460      vassert(disp_cp_xindir != NULL);
2461
2462      /* Use ptmp for backpatching conditional jumps. */
2463      ptmp = NULL;
2464
2465      /* First off, if this is conditional, create a conditional
2466         jump over the rest of it. */
2467      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
2468         /* jmp fwds if !condition */
2469         *p++ = toUChar(0x70 + (0xF & (i->Xin.XIndir.cond ^ 1)));
2470         ptmp = p; /* fill in this bit later */
2471         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2472      }
2473
2474      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
2475      *p++ = 0x89;
2476      p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
2477
2478      /* movl $disp_indir, %edx */
2479      *p++ = 0xBA;
2480      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
2481      /* jmp *%edx */
2482      *p++ = 0xFF;
2483      *p++ = 0xE2;
2484
2485      /* Fix up the conditional jump, if there was one. */
2486      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
2487         Int delta = p - ptmp;
2488         vassert(delta > 0 && delta < 40);
2489         *ptmp = toUChar(delta-1);
2490      }
2491      goto done;
2492   }
2493
2494   case Xin_XAssisted: {
2495      /* Use ptmp for backpatching conditional jumps. */
2496      ptmp = NULL;
2497
2498      /* First off, if this is conditional, create a conditional
2499         jump over the rest of it. */
2500      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
2501         /* jmp fwds if !condition */
2502         *p++ = toUChar(0x70 + (0xF & (i->Xin.XAssisted.cond ^ 1)));
2503         ptmp = p; /* fill in this bit later */
2504         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2505      }
2506
2507      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
2508      *p++ = 0x89;
2509      p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
2510      /* movl $magic_number, %ebp. */
2511      UInt trcval = 0;
2512      switch (i->Xin.XAssisted.jk) {
2513         case Ijk_ClientReq:    trcval = VEX_TRC_JMP_CLIENTREQ;    break;
2514         case Ijk_Sys_syscall:  trcval = VEX_TRC_JMP_SYS_SYSCALL;  break;
2515         case Ijk_Sys_int128:   trcval = VEX_TRC_JMP_SYS_INT128;   break;
2516         case Ijk_Sys_int129:   trcval = VEX_TRC_JMP_SYS_INT129;   break;
2517         case Ijk_Sys_int130:   trcval = VEX_TRC_JMP_SYS_INT130;   break;
2518         case Ijk_Sys_sysenter: trcval = VEX_TRC_JMP_SYS_SYSENTER; break;
2519         case Ijk_Yield:        trcval = VEX_TRC_JMP_YIELD;        break;
2520         case Ijk_EmWarn:       trcval = VEX_TRC_JMP_EMWARN;       break;
2521         case Ijk_MapFail:      trcval = VEX_TRC_JMP_MAPFAIL;      break;
2522         case Ijk_NoDecode:     trcval = VEX_TRC_JMP_NODECODE;     break;
2523         case Ijk_TInval:       trcval = VEX_TRC_JMP_TINVAL;       break;
2524         case Ijk_NoRedir:      trcval = VEX_TRC_JMP_NOREDIR;      break;
2525         case Ijk_SigTRAP:      trcval = VEX_TRC_JMP_SIGTRAP;      break;
2526         case Ijk_SigSEGV:      trcval = VEX_TRC_JMP_SIGSEGV;      break;
2527         case Ijk_Boring:       trcval = VEX_TRC_JMP_BORING;       break;
2528         /* We don't expect to see the following being assisted. */
2529         case Ijk_Ret:
2530         case Ijk_Call:
2531         /* fallthrough */
2532         default:
2533            ppIRJumpKind(i->Xin.XAssisted.jk);
2534            vpanic("emit_X86Instr.Xin_XAssisted: unexpected jump kind");
2535      }
2536      vassert(trcval != 0);
2537      *p++ = 0xBD;
2538      p = emit32(p, trcval);
2539
2540      /* movl $disp_indir, %edx */
2541      *p++ = 0xBA;
2542      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xassisted));
2543      /* jmp *%edx */
2544      *p++ = 0xFF;
2545      *p++ = 0xE2;
2546
2547      /* Fix up the conditional jump, if there was one. */
2548      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
2549         Int delta = p - ptmp;
2550         vassert(delta > 0 && delta < 40);
2551         *ptmp = toUChar(delta-1);
2552      }
2553      goto done;
2554   }
2555
2556   case Xin_CMov32:
2557      vassert(i->Xin.CMov32.cond != Xcc_ALWAYS);
2558
2559      /* This generates cmov, which is illegal on P54/P55. */
2560      /*
2561      *p++ = 0x0F;
2562      *p++ = toUChar(0x40 + (0xF & i->Xin.CMov32.cond));
2563      if (i->Xin.CMov32.src->tag == Xrm_Reg) {
2564         p = doAMode_R(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Reg.reg);
2565         goto done;
2566      }
2567      if (i->Xin.CMov32.src->tag == Xrm_Mem) {
2568         p = doAMode_M(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Mem.am);
2569         goto done;
2570      }
2571      */
2572
2573      /* Alternative version which works on any x86 variant. */
2574      /* jmp fwds if !condition */
2575      *p++ = toUChar(0x70 + (i->Xin.CMov32.cond ^ 1));
2576      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2577      ptmp = p;
2578
2579      switch (i->Xin.CMov32.src->tag) {
2580         case Xrm_Reg:
2581            /* Big sigh.  This is movl E -> G ... */
2582            *p++ = 0x89;
2583            p = doAMode_R(p, i->Xin.CMov32.src->Xrm.Reg.reg,
2584                             i->Xin.CMov32.dst);
2585
2586            break;
2587         case Xrm_Mem:
2588            /* ... whereas this is movl G -> E.  That's why the args
2589               to doAMode_R appear to be the wrong way round in the
2590               Xrm_Reg case. */
2591            *p++ = 0x8B;
2592            p = doAMode_M(p, i->Xin.CMov32.dst,
2593                             i->Xin.CMov32.src->Xrm.Mem.am);
2594            break;
2595         default:
2596            goto bad;
2597      }
2598      /* Fill in the jump offset. */
2599      *(ptmp-1) = toUChar(p - ptmp);
2600      goto done;
2601
2602      break;
2603
2604   case Xin_LoadEX:
2605      if (i->Xin.LoadEX.szSmall == 1 && !i->Xin.LoadEX.syned) {
2606         /* movzbl */
2607         *p++ = 0x0F;
2608         *p++ = 0xB6;
2609         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2610         goto done;
2611      }
2612      if (i->Xin.LoadEX.szSmall == 2 && !i->Xin.LoadEX.syned) {
2613         /* movzwl */
2614         *p++ = 0x0F;
2615         *p++ = 0xB7;
2616         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2617         goto done;
2618      }
2619      if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
2620         /* movsbl */
2621         *p++ = 0x0F;
2622         *p++ = 0xBE;
2623         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2624         goto done;
2625      }
2626      break;
2627
2628   case Xin_Set32:
2629      /* Make the destination register be 1 or 0, depending on whether
2630         the relevant condition holds.  We have to dodge and weave
2631         when the destination is %esi or %edi as we cannot directly
2632         emit the native 'setb %reg' for those.  Further complication:
2633         the top 24 bits of the destination should be forced to zero,
2634         but doing 'xor %r,%r' kills the flag(s) we are about to read.
2635         Sigh.  So start off my moving $0 into the dest. */
2636
2637      /* Do we need to swap in %eax? */
2638      if (iregNo(i->Xin.Set32.dst) >= 4) {
2639         /* xchg %eax, %dst */
2640         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2641         /* movl $0, %eax */
2642         *p++ =toUChar(0xB8 + iregNo(hregX86_EAX()));
2643         p = emit32(p, 0);
2644         /* setb lo8(%eax) */
2645         *p++ = 0x0F;
2646         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2647         p = doAMode_R(p, fake(0), hregX86_EAX());
2648         /* xchg %eax, %dst */
2649         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2650      } else {
2651         /* movl $0, %dst */
2652         *p++ = toUChar(0xB8 + iregNo(i->Xin.Set32.dst));
2653         p = emit32(p, 0);
2654         /* setb lo8(%dst) */
2655         *p++ = 0x0F;
2656         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2657         p = doAMode_R(p, fake(0), i->Xin.Set32.dst);
2658      }
2659      goto done;
2660
2661   case Xin_Bsfr32:
2662      *p++ = 0x0F;
2663      if (i->Xin.Bsfr32.isFwds) {
2664         *p++ = 0xBC;
2665      } else {
2666         *p++ = 0xBD;
2667      }
2668      p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
2669      goto done;
2670
2671   case Xin_MFence:
2672      /* see comment in hdefs.h re this insn */
2673      if (0) vex_printf("EMIT FENCE\n");
2674      if (i->Xin.MFence.hwcaps & (VEX_HWCAPS_X86_SSE3
2675                                  |VEX_HWCAPS_X86_SSE2)) {
2676         /* mfence */
2677         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
2678         goto done;
2679      }
2680      if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_SSE1) {
2681         /* sfence */
2682         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
2683         /* lock addl $0,0(%esp) */
2684         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2685         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2686         goto done;
2687      }
2688      if (i->Xin.MFence.hwcaps == 0/*baseline, no SSE*/) {
2689         /* lock addl $0,0(%esp) */
2690         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2691         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2692         goto done;
2693      }
2694      vpanic("emit_X86Instr:mfence:hwcaps");
2695      /*NOTREACHED*/
2696      break;
2697
2698   case Xin_ACAS:
2699      /* lock */
2700      *p++ = 0xF0;
2701      /* cmpxchg{b,w,l} %ebx,mem.  Expected-value in %eax, new value
2702         in %ebx.  The new-value register is hardwired to be %ebx
2703         since letting it be any integer register gives the problem
2704         that %sil and %dil are unaddressible on x86 and hence we
2705         would have to resort to the same kind of trickery as with
2706         byte-sized Xin.Store, just below.  Given that this isn't
2707         performance critical, it is simpler just to force the
2708         register operand to %ebx (could equally be %ecx or %edx).
2709         (Although %ebx is more consistent with cmpxchg8b.) */
2710      if (i->Xin.ACAS.sz == 2) *p++ = 0x66;
2711      *p++ = 0x0F;
2712      if (i->Xin.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
2713      p = doAMode_M(p, hregX86_EBX(), i->Xin.ACAS.addr);
2714      goto done;
2715
2716   case Xin_DACAS:
2717      /* lock */
2718      *p++ = 0xF0;
2719      /* cmpxchg8b m64.  Expected-value in %edx:%eax, new value
2720         in %ecx:%ebx.  All 4 regs are hardwired in the ISA, so
2721         aren't encoded in the insn. */
2722      *p++ = 0x0F;
2723      *p++ = 0xC7;
2724      p = doAMode_M(p, fake(1), i->Xin.DACAS.addr);
2725      goto done;
2726
2727   case Xin_Store:
2728      if (i->Xin.Store.sz == 2) {
2729         /* This case, at least, is simple, given that we can
2730            reference the low 16 bits of any integer register. */
2731         *p++ = 0x66;
2732         *p++ = 0x89;
2733         p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2734         goto done;
2735      }
2736
2737      if (i->Xin.Store.sz == 1) {
2738         /* We have to do complex dodging and weaving if src is not
2739            the low 8 bits of %eax/%ebx/%ecx/%edx. */
2740         if (iregNo(i->Xin.Store.src) < 4) {
2741            /* we're OK, can do it directly */
2742            *p++ = 0x88;
2743            p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2744           goto done;
2745         } else {
2746            /* Bleh.  This means the source is %edi or %esi.  Since
2747               the address mode can only mention three registers, at
2748               least one of %eax/%ebx/%ecx/%edx must be available to
2749               temporarily swap the source into, so the store can
2750               happen.  So we have to look at the regs mentioned
2751               in the amode. */
2752            HReg swap = INVALID_HREG;
2753            HReg  eax = hregX86_EAX(), ebx = hregX86_EBX(),
2754                  ecx = hregX86_ECX(), edx = hregX86_EDX();
2755            Bool a_ok = True, b_ok = True, c_ok = True, d_ok = True;
2756            HRegUsage u;
2757            Int j;
2758            initHRegUsage(&u);
2759            addRegUsage_X86AMode(&u,  i->Xin.Store.dst);
2760            for (j = 0; j < u.n_used; j++) {
2761               HReg r = u.hreg[j];
2762               if (r == eax) a_ok = False;
2763               if (r == ebx) b_ok = False;
2764               if (r == ecx) c_ok = False;
2765               if (r == edx) d_ok = False;
2766            }
2767            if (a_ok) swap = eax;
2768            if (b_ok) swap = ebx;
2769            if (c_ok) swap = ecx;
2770            if (d_ok) swap = edx;
2771            vassert(swap != INVALID_HREG);
2772            /* xchgl %source, %swap. Could do better if swap is %eax. */
2773            *p++ = 0x87;
2774            p = doAMode_R(p, i->Xin.Store.src, swap);
2775            /* movb lo8{%swap}, (dst) */
2776            *p++ = 0x88;
2777            p = doAMode_M(p, swap, i->Xin.Store.dst);
2778            /* xchgl %source, %swap. Could do better if swap is %eax. */
2779            *p++ = 0x87;
2780            p = doAMode_R(p, i->Xin.Store.src, swap);
2781            goto done;
2782         }
2783      } /* if (i->Xin.Store.sz == 1) */
2784      break;
2785
2786   case Xin_FpUnary:
2787      /* gop %src, %dst
2788         --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
2789      */
2790      p = do_ffree_st7(p);
2791      p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
2792      p = do_fop1_st(p, i->Xin.FpUnary.op);
2793      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
2794      goto done;
2795
2796   case Xin_FpBinary:
2797      if (i->Xin.FpBinary.op == Xfp_YL2X
2798          || i->Xin.FpBinary.op == Xfp_YL2XP1) {
2799         /* Have to do this specially. */
2800         /* ffree %st7 ; fld %st(srcL) ;
2801            ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
2802         p = do_ffree_st7(p);
2803         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2804         p = do_ffree_st7(p);
2805         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2806         *p++ = 0xD9;
2807         *p++ = toUChar(i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9);
2808         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2809         goto done;
2810      }
2811      if (i->Xin.FpBinary.op == Xfp_ATAN) {
2812         /* Have to do this specially. */
2813         /* ffree %st7 ; fld %st(srcL) ;
2814            ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
2815         p = do_ffree_st7(p);
2816         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2817         p = do_ffree_st7(p);
2818         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2819         *p++ = 0xD9; *p++ = 0xF3;
2820         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2821         goto done;
2822      }
2823      if (i->Xin.FpBinary.op == Xfp_PREM
2824          || i->Xin.FpBinary.op == Xfp_PREM1
2825          || i->Xin.FpBinary.op == Xfp_SCALE) {
2826         /* Have to do this specially. */
2827         /* ffree %st7 ; fld %st(srcR) ;
2828            ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
2829            fincstp ; ffree %st7 */
2830         p = do_ffree_st7(p);
2831         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
2832         p = do_ffree_st7(p);
2833         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
2834         *p++ = 0xD9;
2835         switch (i->Xin.FpBinary.op) {
2836            case Xfp_PREM: *p++ = 0xF8; break;
2837            case Xfp_PREM1: *p++ = 0xF5; break;
2838            case Xfp_SCALE: *p++ =  0xFD; break;
2839            default: vpanic("emitX86Instr(FpBinary,PREM/PREM1/SCALE)");
2840         }
2841         p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
2842         *p++ = 0xD9; *p++ = 0xF7;
2843         p = do_ffree_st7(p);
2844         goto done;
2845      }
2846      /* General case */
2847      /* gop %srcL, %srcR, %dst
2848         --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
2849      */
2850      p = do_ffree_st7(p);
2851      p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2852      p = do_fop2_st(p, i->Xin.FpBinary.op,
2853                        1+hregNumber(i->Xin.FpBinary.srcR));
2854      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2855      goto done;
2856
2857   case Xin_FpLdSt:
2858      if (i->Xin.FpLdSt.isLoad) {
2859         /* Load from memory into %fakeN.
2860            --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
2861         */
2862         p = do_ffree_st7(p);
2863         switch (i->Xin.FpLdSt.sz) {
2864            case 4:
2865               *p++ = 0xD9;
2866               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2867               break;
2868            case 8:
2869               *p++ = 0xDD;
2870               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2871               break;
2872            case 10:
2873               *p++ = 0xDB;
2874               p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
2875               break;
2876            default:
2877               vpanic("emitX86Instr(FpLdSt,load)");
2878         }
2879         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
2880         goto done;
2881      } else {
2882         /* Store from %fakeN into memory.
2883            --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
2884	 */
2885         p = do_ffree_st7(p);
2886         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
2887         switch (i->Xin.FpLdSt.sz) {
2888            case 4:
2889               *p++ = 0xD9;
2890               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2891               break;
2892            case 8:
2893               *p++ = 0xDD;
2894               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2895               break;
2896            case 10:
2897               *p++ = 0xDB;
2898               p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
2899               break;
2900            default:
2901               vpanic("emitX86Instr(FpLdSt,store)");
2902         }
2903         goto done;
2904      }
2905      break;
2906
2907   case Xin_FpLdStI:
2908      if (i->Xin.FpLdStI.isLoad) {
2909         /* Load from memory into %fakeN, converting from an int.
2910            --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
2911         */
2912         switch (i->Xin.FpLdStI.sz) {
2913            case 8:  opc = 0xDF; subopc_imm = 5; break;
2914            case 4:  opc = 0xDB; subopc_imm = 0; break;
2915            case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
2916            default: vpanic("emitX86Instr(Xin_FpLdStI-load)");
2917         }
2918         p = do_ffree_st7(p);
2919         *p++ = toUChar(opc);
2920         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2921         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
2922         goto done;
2923      } else {
2924         /* Store from %fakeN into memory, converting to an int.
2925            --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
2926	 */
2927         switch (i->Xin.FpLdStI.sz) {
2928            case 8:  opc = 0xDF; subopc_imm = 7; break;
2929            case 4:  opc = 0xDB; subopc_imm = 3; break;
2930            case 2:  opc = 0xDF; subopc_imm = 3; break;
2931            default: vpanic("emitX86Instr(Xin_FpLdStI-store)");
2932         }
2933         p = do_ffree_st7(p);
2934         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
2935         *p++ = toUChar(opc);
2936         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2937         goto done;
2938      }
2939      break;
2940
2941   case Xin_Fp64to32:
2942      /* ffree %st7 ; fld %st(src) */
2943      p = do_ffree_st7(p);
2944      p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
2945      /* subl $4, %esp */
2946      *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
2947      /* fstps (%esp) */
2948      *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
2949      /* flds (%esp) */
2950      *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
2951      /* addl $4, %esp */
2952      *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
2953      /* fstp %st(1+dst) */
2954      p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
2955      goto done;
2956
2957   case Xin_FpCMov:
2958      /* jmp fwds if !condition */
2959      *p++ = toUChar(0x70 + (i->Xin.FpCMov.cond ^ 1));
2960      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2961      ptmp = p;
2962
2963      /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
2964      p = do_ffree_st7(p);
2965      p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
2966      p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
2967
2968      /* Fill in the jump offset. */
2969      *(ptmp-1) = toUChar(p - ptmp);
2970      goto done;
2971
2972   case Xin_FpLdCW:
2973      *p++ = 0xD9;
2974      p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdCW.addr);
2975      goto done;
2976
2977   case Xin_FpStSW_AX:
2978      /* note, this emits fnstsw %ax, not fstsw %ax */
2979      *p++ = 0xDF;
2980      *p++ = 0xE0;
2981      goto done;
2982
2983   case Xin_FpCmp:
2984      /* gcmp %fL, %fR, %dst
2985         -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
2986            fnstsw %ax ; movl %eax, %dst
2987      */
2988      /* ffree %st7 */
2989      p = do_ffree_st7(p);
2990      /* fpush %fL */
2991      p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
2992      /* fucomp %(fR+1) */
2993      *p++ = 0xDD;
2994      *p++ = toUChar(0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR))));
2995      /* fnstsw %ax */
2996      *p++ = 0xDF;
2997      *p++ = 0xE0;
2998      /*  movl %eax, %dst */
2999      *p++ = 0x89;
3000      p = doAMode_R(p, hregX86_EAX(), i->Xin.FpCmp.dst);
3001      goto done;
3002
3003   case Xin_SseConst: {
3004      UShort con = i->Xin.SseConst.con;
3005      p = push_word_from_tags(p, toUShort((con >> 12) & 0xF));
3006      p = push_word_from_tags(p, toUShort((con >> 8) & 0xF));
3007      p = push_word_from_tags(p, toUShort((con >> 4) & 0xF));
3008      p = push_word_from_tags(p, toUShort(con & 0xF));
3009      /* movl (%esp), %xmm-dst */
3010      *p++ = 0x0F;
3011      *p++ = 0x10;
3012      *p++ = toUChar(0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst)));
3013      *p++ = 0x24;
3014      /* addl $16, %esp */
3015      *p++ = 0x83;
3016      *p++ = 0xC4;
3017      *p++ = 0x10;
3018      goto done;
3019   }
3020
3021   case Xin_SseLdSt:
3022      *p++ = 0x0F;
3023      *p++ = toUChar(i->Xin.SseLdSt.isLoad ? 0x10 : 0x11);
3024      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdSt.reg)), i->Xin.SseLdSt.addr);
3025      goto done;
3026
3027   case Xin_SseLdzLO:
3028      vassert(i->Xin.SseLdzLO.sz == 4 || i->Xin.SseLdzLO.sz == 8);
3029      /* movs[sd] amode, %xmm-dst */
3030      *p++ = toUChar(i->Xin.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3031      *p++ = 0x0F;
3032      *p++ = 0x10;
3033      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdzLO.reg)),
3034                       i->Xin.SseLdzLO.addr);
3035      goto done;
3036
3037   case Xin_Sse32Fx4:
3038      xtra = 0;
3039      *p++ = 0x0F;
3040      switch (i->Xin.Sse32Fx4.op) {
3041         case Xsse_ADDF:   *p++ = 0x58; break;
3042         case Xsse_DIVF:   *p++ = 0x5E; break;
3043         case Xsse_MAXF:   *p++ = 0x5F; break;
3044         case Xsse_MINF:   *p++ = 0x5D; break;
3045         case Xsse_MULF:   *p++ = 0x59; break;
3046         case Xsse_RCPF:   *p++ = 0x53; break;
3047         case Xsse_RSQRTF: *p++ = 0x52; break;
3048         case Xsse_SQRTF:  *p++ = 0x51; break;
3049         case Xsse_SUBF:   *p++ = 0x5C; break;
3050         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3051         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3052         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3053         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3054         default: goto bad;
3055      }
3056      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32Fx4.dst)),
3057                       fake(vregNo(i->Xin.Sse32Fx4.src)) );
3058      if (xtra & 0x100)
3059         *p++ = toUChar(xtra & 0xFF);
3060      goto done;
3061
3062   case Xin_Sse64Fx2:
3063      xtra = 0;
3064      *p++ = 0x66;
3065      *p++ = 0x0F;
3066      switch (i->Xin.Sse64Fx2.op) {
3067         case Xsse_ADDF:   *p++ = 0x58; break;
3068         case Xsse_DIVF:   *p++ = 0x5E; break;
3069         case Xsse_MAXF:   *p++ = 0x5F; break;
3070         case Xsse_MINF:   *p++ = 0x5D; break;
3071         case Xsse_MULF:   *p++ = 0x59; break;
3072         case Xsse_RCPF:   *p++ = 0x53; break;
3073         case Xsse_RSQRTF: *p++ = 0x52; break;
3074         case Xsse_SQRTF:  *p++ = 0x51; break;
3075         case Xsse_SUBF:   *p++ = 0x5C; break;
3076         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3077         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3078         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3079         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3080         default: goto bad;
3081      }
3082      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64Fx2.dst)),
3083                       fake(vregNo(i->Xin.Sse64Fx2.src)) );
3084      if (xtra & 0x100)
3085         *p++ = toUChar(xtra & 0xFF);
3086      goto done;
3087
3088   case Xin_Sse32FLo:
3089      xtra = 0;
3090      *p++ = 0xF3;
3091      *p++ = 0x0F;
3092      switch (i->Xin.Sse32FLo.op) {
3093         case Xsse_ADDF:   *p++ = 0x58; break;
3094         case Xsse_DIVF:   *p++ = 0x5E; break;
3095         case Xsse_MAXF:   *p++ = 0x5F; break;
3096         case Xsse_MINF:   *p++ = 0x5D; break;
3097         case Xsse_MULF:   *p++ = 0x59; break;
3098         case Xsse_RCPF:   *p++ = 0x53; break;
3099         case Xsse_RSQRTF: *p++ = 0x52; break;
3100         case Xsse_SQRTF:  *p++ = 0x51; break;
3101         case Xsse_SUBF:   *p++ = 0x5C; break;
3102         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3103         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3104         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3105         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3106         default: goto bad;
3107      }
3108      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32FLo.dst)),
3109                       fake(vregNo(i->Xin.Sse32FLo.src)) );
3110      if (xtra & 0x100)
3111         *p++ = toUChar(xtra & 0xFF);
3112      goto done;
3113
3114   case Xin_Sse64FLo:
3115      xtra = 0;
3116      *p++ = 0xF2;
3117      *p++ = 0x0F;
3118      switch (i->Xin.Sse64FLo.op) {
3119         case Xsse_ADDF:   *p++ = 0x58; break;
3120         case Xsse_DIVF:   *p++ = 0x5E; break;
3121         case Xsse_MAXF:   *p++ = 0x5F; break;
3122         case Xsse_MINF:   *p++ = 0x5D; break;
3123         case Xsse_MULF:   *p++ = 0x59; break;
3124         case Xsse_RCPF:   *p++ = 0x53; break;
3125         case Xsse_RSQRTF: *p++ = 0x52; break;
3126         case Xsse_SQRTF:  *p++ = 0x51; break;
3127         case Xsse_SUBF:   *p++ = 0x5C; break;
3128         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3129         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3130         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3131         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3132         default: goto bad;
3133      }
3134      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64FLo.dst)),
3135                       fake(vregNo(i->Xin.Sse64FLo.src)) );
3136      if (xtra & 0x100)
3137         *p++ = toUChar(xtra & 0xFF);
3138      goto done;
3139
3140   case Xin_SseReRg:
3141#     define XX(_n) *p++ = (_n)
3142      switch (i->Xin.SseReRg.op) {
3143         case Xsse_MOV:     /*movups*/ XX(0x0F); XX(0x10); break;
3144         case Xsse_OR:                 XX(0x0F); XX(0x56); break;
3145         case Xsse_XOR:                XX(0x0F); XX(0x57); break;
3146         case Xsse_AND:                XX(0x0F); XX(0x54); break;
3147         case Xsse_PACKSSD:  XX(0x66); XX(0x0F); XX(0x6B); break;
3148         case Xsse_PACKSSW:  XX(0x66); XX(0x0F); XX(0x63); break;
3149         case Xsse_PACKUSW:  XX(0x66); XX(0x0F); XX(0x67); break;
3150         case Xsse_ADD8:     XX(0x66); XX(0x0F); XX(0xFC); break;
3151         case Xsse_ADD16:    XX(0x66); XX(0x0F); XX(0xFD); break;
3152         case Xsse_ADD32:    XX(0x66); XX(0x0F); XX(0xFE); break;
3153         case Xsse_ADD64:    XX(0x66); XX(0x0F); XX(0xD4); break;
3154         case Xsse_QADD8S:   XX(0x66); XX(0x0F); XX(0xEC); break;
3155         case Xsse_QADD16S:  XX(0x66); XX(0x0F); XX(0xED); break;
3156         case Xsse_QADD8U:   XX(0x66); XX(0x0F); XX(0xDC); break;
3157         case Xsse_QADD16U:  XX(0x66); XX(0x0F); XX(0xDD); break;
3158         case Xsse_AVG8U:    XX(0x66); XX(0x0F); XX(0xE0); break;
3159         case Xsse_AVG16U:   XX(0x66); XX(0x0F); XX(0xE3); break;
3160         case Xsse_CMPEQ8:   XX(0x66); XX(0x0F); XX(0x74); break;
3161         case Xsse_CMPEQ16:  XX(0x66); XX(0x0F); XX(0x75); break;
3162         case Xsse_CMPEQ32:  XX(0x66); XX(0x0F); XX(0x76); break;
3163         case Xsse_CMPGT8S:  XX(0x66); XX(0x0F); XX(0x64); break;
3164         case Xsse_CMPGT16S: XX(0x66); XX(0x0F); XX(0x65); break;
3165         case Xsse_CMPGT32S: XX(0x66); XX(0x0F); XX(0x66); break;
3166         case Xsse_MAX16S:   XX(0x66); XX(0x0F); XX(0xEE); break;
3167         case Xsse_MAX8U:    XX(0x66); XX(0x0F); XX(0xDE); break;
3168         case Xsse_MIN16S:   XX(0x66); XX(0x0F); XX(0xEA); break;
3169         case Xsse_MIN8U:    XX(0x66); XX(0x0F); XX(0xDA); break;
3170         case Xsse_MULHI16U: XX(0x66); XX(0x0F); XX(0xE4); break;
3171         case Xsse_MULHI16S: XX(0x66); XX(0x0F); XX(0xE5); break;
3172         case Xsse_MUL16:    XX(0x66); XX(0x0F); XX(0xD5); break;
3173         case Xsse_SHL16:    XX(0x66); XX(0x0F); XX(0xF1); break;
3174         case Xsse_SHL32:    XX(0x66); XX(0x0F); XX(0xF2); break;
3175         case Xsse_SHL64:    XX(0x66); XX(0x0F); XX(0xF3); break;
3176         case Xsse_SAR16:    XX(0x66); XX(0x0F); XX(0xE1); break;
3177         case Xsse_SAR32:    XX(0x66); XX(0x0F); XX(0xE2); break;
3178         case Xsse_SHR16:    XX(0x66); XX(0x0F); XX(0xD1); break;
3179         case Xsse_SHR32:    XX(0x66); XX(0x0F); XX(0xD2); break;
3180         case Xsse_SHR64:    XX(0x66); XX(0x0F); XX(0xD3); break;
3181         case Xsse_SUB8:     XX(0x66); XX(0x0F); XX(0xF8); break;
3182         case Xsse_SUB16:    XX(0x66); XX(0x0F); XX(0xF9); break;
3183         case Xsse_SUB32:    XX(0x66); XX(0x0F); XX(0xFA); break;
3184         case Xsse_SUB64:    XX(0x66); XX(0x0F); XX(0xFB); break;
3185         case Xsse_QSUB8S:   XX(0x66); XX(0x0F); XX(0xE8); break;
3186         case Xsse_QSUB16S:  XX(0x66); XX(0x0F); XX(0xE9); break;
3187         case Xsse_QSUB8U:   XX(0x66); XX(0x0F); XX(0xD8); break;
3188         case Xsse_QSUB16U:  XX(0x66); XX(0x0F); XX(0xD9); break;
3189         case Xsse_UNPCKHB:  XX(0x66); XX(0x0F); XX(0x68); break;
3190         case Xsse_UNPCKHW:  XX(0x66); XX(0x0F); XX(0x69); break;
3191         case Xsse_UNPCKHD:  XX(0x66); XX(0x0F); XX(0x6A); break;
3192         case Xsse_UNPCKHQ:  XX(0x66); XX(0x0F); XX(0x6D); break;
3193         case Xsse_UNPCKLB:  XX(0x66); XX(0x0F); XX(0x60); break;
3194         case Xsse_UNPCKLW:  XX(0x66); XX(0x0F); XX(0x61); break;
3195         case Xsse_UNPCKLD:  XX(0x66); XX(0x0F); XX(0x62); break;
3196         case Xsse_UNPCKLQ:  XX(0x66); XX(0x0F); XX(0x6C); break;
3197         default: goto bad;
3198      }
3199      p = doAMode_R(p, fake(vregNo(i->Xin.SseReRg.dst)),
3200                       fake(vregNo(i->Xin.SseReRg.src)) );
3201#     undef XX
3202      goto done;
3203
3204   case Xin_SseCMov:
3205      /* jmp fwds if !condition */
3206      *p++ = toUChar(0x70 + (i->Xin.SseCMov.cond ^ 1));
3207      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3208      ptmp = p;
3209
3210      /* movaps %src, %dst */
3211      *p++ = 0x0F;
3212      *p++ = 0x28;
3213      p = doAMode_R(p, fake(vregNo(i->Xin.SseCMov.dst)),
3214                       fake(vregNo(i->Xin.SseCMov.src)) );
3215
3216      /* Fill in the jump offset. */
3217      *(ptmp-1) = toUChar(p - ptmp);
3218      goto done;
3219
3220   case Xin_SseShuf:
3221      *p++ = 0x66;
3222      *p++ = 0x0F;
3223      *p++ = 0x70;
3224      p = doAMode_R(p, fake(vregNo(i->Xin.SseShuf.dst)),
3225                       fake(vregNo(i->Xin.SseShuf.src)) );
3226      *p++ = (UChar)(i->Xin.SseShuf.order);
3227      goto done;
3228
3229   case Xin_EvCheck: {
3230      /* We generate:
3231            (3 bytes)  decl 4(%ebp)    4 == offsetof(host_EvC_COUNTER)
3232            (2 bytes)  jns  nofail     expected taken
3233            (3 bytes)  jmp* 0(%ebp)    0 == offsetof(host_EvC_FAILADDR)
3234            nofail:
3235      */
3236      /* This is heavily asserted re instruction lengths.  It needs to
3237         be.  If we get given unexpected forms of .amCounter or
3238         .amFailAddr -- basically, anything that's not of the form
3239         uimm7(%ebp) -- they are likely to fail. */
3240      /* Note also that after the decl we must be very careful not to
3241         read the carry flag, else we get a partial flags stall.
3242         js/jns avoids that, though. */
3243      UChar* p0 = p;
3244      /* ---  decl 8(%ebp) --- */
3245      /* "fake(1)" because + there's no register in this encoding;
3246         instead the register + field is used as a sub opcode.  The
3247         encoding for "decl r/m32" + is FF /1, hence the fake(1). */
3248      *p++ = 0xFF;
3249      p = doAMode_M(p, fake(1), i->Xin.EvCheck.amCounter);
3250      vassert(p - p0 == 3);
3251      /* --- jns nofail --- */
3252      *p++ = 0x79;
3253      *p++ = 0x03; /* need to check this 0x03 after the next insn */
3254      vassert(p - p0 == 5);
3255      /* --- jmp* 0(%ebp) --- */
3256      /* The encoding is FF /4. */
3257      *p++ = 0xFF;
3258      p = doAMode_M(p, fake(4), i->Xin.EvCheck.amFailAddr);
3259      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3260      /* And crosscheck .. */
3261      vassert(evCheckSzB_X86() == 8);
3262      goto done;
3263   }
3264
3265   case Xin_ProfInc: {
3266      /* We generate   addl $1,NotKnownYet
3267                       adcl $0,NotKnownYet+4
3268         in the expectation that a later call to LibVEX_patchProfCtr
3269         will be used to fill in the immediate fields once the right
3270         value is known.
3271           83 05  00 00 00 00  01
3272           83 15  00 00 00 00  00
3273      */
3274      *p++ = 0x83; *p++ = 0x05;
3275      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3276      *p++ = 0x01;
3277      *p++ = 0x83; *p++ = 0x15;
3278      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3279      *p++ = 0x00;
3280      /* Tell the caller .. */
3281      vassert(!(*is_profInc));
3282      *is_profInc = True;
3283      goto done;
3284   }
3285
3286   default:
3287      goto bad;
3288   }
3289
3290  bad:
3291   ppX86Instr(i, mode64);
3292   vpanic("emit_X86Instr");
3293   /*NOTREACHED*/
3294
3295  done:
3296   vassert(p - &buf[0] <= 32);
3297   return p - &buf[0];
3298
3299#  undef fake
3300}
3301
3302
3303/* How big is an event check?  See case for Xin_EvCheck in
3304   emit_X86Instr just above.  That crosschecks what this returns, so
3305   we can tell if we're inconsistent. */
3306Int evCheckSzB_X86 ( void )
3307{
3308   return 8;
3309}
3310
3311
3312/* NB: what goes on here has to be very closely coordinated with the
3313   emitInstr case for XDirect, above. */
3314VexInvalRange chainXDirect_X86 ( void* place_to_chain,
3315                                 void* disp_cp_chain_me_EXPECTED,
3316                                 void* place_to_jump_to )
3317{
3318   /* What we're expecting to see is:
3319        movl $disp_cp_chain_me_EXPECTED, %edx
3320        call *%edx
3321      viz
3322        BA <4 bytes value == disp_cp_chain_me_EXPECTED>
3323        FF D2
3324   */
3325   UChar* p = (UChar*)place_to_chain;
3326   vassert(p[0] == 0xBA);
3327   vassert(*(UInt*)(&p[1]) == (UInt)Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
3328   vassert(p[5] == 0xFF);
3329   vassert(p[6] == 0xD2);
3330   /* And what we want to change it to is:
3331          jmp disp32   where disp32 is relative to the next insn
3332          ud2;
3333        viz
3334          E9 <4 bytes == disp32>
3335          0F 0B
3336      The replacement has the same length as the original.
3337   */
3338   /* This is the delta we need to put into a JMP d32 insn.  It's
3339      relative to the start of the next insn, hence the -5.  */
3340   Long delta = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
3341
3342   /* And make the modifications. */
3343   p[0] = 0xE9;
3344   p[1] = (delta >> 0) & 0xFF;
3345   p[2] = (delta >> 8) & 0xFF;
3346   p[3] = (delta >> 16) & 0xFF;
3347   p[4] = (delta >> 24) & 0xFF;
3348   p[5] = 0x0F; p[6]  = 0x0B;
3349   /* sanity check on the delta -- top 32 are all 0 or all 1 */
3350   delta >>= 32;
3351   vassert(delta == 0LL || delta == -1LL);
3352   VexInvalRange vir = {0, 0};
3353   return vir;
3354}
3355
3356
3357/* NB: what goes on here has to be very closely coordinated with the
3358   emitInstr case for XDirect, above. */
3359VexInvalRange unchainXDirect_X86 ( void* place_to_unchain,
3360                                   void* place_to_jump_to_EXPECTED,
3361                                   void* disp_cp_chain_me )
3362{
3363   /* What we're expecting to see is:
3364          jmp d32
3365          ud2;
3366       viz
3367          E9 <4 bytes == disp32>
3368          0F 0B
3369   */
3370   UChar* p     = (UChar*)place_to_unchain;
3371   Bool   valid = False;
3372   if (p[0] == 0xE9
3373       && p[5]  == 0x0F && p[6]  == 0x0B) {
3374      /* Check the offset is right. */
3375      Int s32 = *(Int*)(&p[1]);
3376      if ((UChar*)p + 5 + s32 == (UChar*)place_to_jump_to_EXPECTED) {
3377         valid = True;
3378         if (0)
3379            vex_printf("QQQ unchainXDirect_X86: found valid\n");
3380      }
3381   }
3382   vassert(valid);
3383   /* And what we want to change it to is:
3384         movl $disp_cp_chain_me, %edx
3385         call *%edx
3386      viz
3387         BA <4 bytes value == disp_cp_chain_me_EXPECTED>
3388         FF D2
3389      So it's the same length (convenient, huh).
3390   */
3391   p[0] = 0xBA;
3392   *(UInt*)(&p[1]) = (UInt)Ptr_to_ULong(disp_cp_chain_me);
3393   p[5] = 0xFF;
3394   p[6] = 0xD2;
3395   VexInvalRange vir = {0, 0};
3396   return vir;
3397}
3398
3399
3400/* Patch the counter address into a profile inc point, as previously
3401   created by the Xin_ProfInc case for emit_X86Instr. */
3402VexInvalRange patchProfInc_X86 ( void*  place_to_patch,
3403                                 ULong* location_of_counter )
3404{
3405   vassert(sizeof(ULong*) == 4);
3406   UChar* p = (UChar*)place_to_patch;
3407   vassert(p[0] == 0x83);
3408   vassert(p[1] == 0x05);
3409   vassert(p[2] == 0x00);
3410   vassert(p[3] == 0x00);
3411   vassert(p[4] == 0x00);
3412   vassert(p[5] == 0x00);
3413   vassert(p[6] == 0x01);
3414   vassert(p[7] == 0x83);
3415   vassert(p[8] == 0x15);
3416   vassert(p[9] == 0x00);
3417   vassert(p[10] == 0x00);
3418   vassert(p[11] == 0x00);
3419   vassert(p[12] == 0x00);
3420   vassert(p[13] == 0x00);
3421   UInt imm32 = (UInt)Ptr_to_ULong(location_of_counter);
3422   p[2] = imm32 & 0xFF; imm32 >>= 8;
3423   p[3] = imm32 & 0xFF; imm32 >>= 8;
3424   p[4] = imm32 & 0xFF; imm32 >>= 8;
3425   p[5] = imm32 & 0xFF; imm32 >>= 8;
3426   imm32 = 4 + (UInt)Ptr_to_ULong(location_of_counter);
3427   p[9]  = imm32 & 0xFF; imm32 >>= 8;
3428   p[10] = imm32 & 0xFF; imm32 >>= 8;
3429   p[11] = imm32 & 0xFF; imm32 >>= 8;
3430   p[12] = imm32 & 0xFF; imm32 >>= 8;
3431   VexInvalRange vir = {0, 0};
3432   return vir;
3433}
3434
3435
3436/*---------------------------------------------------------------*/
3437/*--- end                                     host_x86_defs.c ---*/
3438/*---------------------------------------------------------------*/
3439