
/*---------------------------------------------------------------*/
/*--- begin                                   host_x86_defs.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex.h"
#include "libvex_trc_values.h"

#include "main_util.h"
#include "host_generic_regs.h"
#include "host_x86_defs.h"


/* --------- Registers. --------- */

void ppHRegX86 ( HReg reg )
{
   Int r;
   static HChar* ireg32_names[8]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt32:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 8);
         vex_printf("%s", ireg32_names[r]);
         return;
      case HRcFlt64:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 6);
         vex_printf("%%fake%d", r);
         return;
      case HRcVec128:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 8);
         vex_printf("%%xmm%d", r);
         return;
      default:
         vpanic("ppHRegX86");
   }
}

HReg hregX86_EAX ( void ) { return mkHReg(0, HRcInt32, False); }
HReg hregX86_ECX ( void ) { return mkHReg(1, HRcInt32, False); }
HReg hregX86_EDX ( void ) { return mkHReg(2, HRcInt32, False); }
HReg hregX86_EBX ( void ) { return mkHReg(3, HRcInt32, False); }
HReg hregX86_ESP ( void ) { return mkHReg(4, HRcInt32, False); }
HReg hregX86_EBP ( void ) { return mkHReg(5, HRcInt32, False); }
HReg hregX86_ESI ( void ) { return mkHReg(6, HRcInt32, False); }
HReg hregX86_EDI ( void ) { return mkHReg(7, HRcInt32, False); }

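/* The x87 registers are not directly visible here.  Instead we model
   six flat 64-bit FP registers, %fake0 .. %fake5, and leave it to
   the instruction emitter to map these onto real x87 stack slots
   when code is finally generated (an assumption; the mapping itself
   is not done in this file). */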
HReg hregX86_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
HReg hregX86_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
HReg hregX86_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
HReg hregX86_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
HReg hregX86_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
HReg hregX86_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }

HReg hregX86_XMM0 ( void ) { return mkHReg(0, HRcVec128, False); }
HReg hregX86_XMM1 ( void ) { return mkHReg(1, HRcVec128, False); }
HReg hregX86_XMM2 ( void ) { return mkHReg(2, HRcVec128, False); }
HReg hregX86_XMM3 ( void ) { return mkHReg(3, HRcVec128, False); }
HReg hregX86_XMM4 ( void ) { return mkHReg(4, HRcVec128, False); }
HReg hregX86_XMM5 ( void ) { return mkHReg(5, HRcVec128, False); }
HReg hregX86_XMM6 ( void ) { return mkHReg(6, HRcVec128, False); }
HReg hregX86_XMM7 ( void ) { return mkHReg(7, HRcVec128, False); }


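/* Note that %esp and %ebp are deliberately left out of the set
   below.  %esp is the host stack pointer, and %ebp appears to be
   reserved as the guest-state pointer -- the spill/reload helpers at
   the end of this file address spill slots relative to it -- so
   neither can be handed to the allocator. */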
void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
{
   *nregs = 20;
   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
   (*arr)[0] = hregX86_EAX();
   (*arr)[1] = hregX86_EBX();
   (*arr)[2] = hregX86_ECX();
   (*arr)[3] = hregX86_EDX();
   (*arr)[4] = hregX86_ESI();
   (*arr)[5] = hregX86_EDI();
   (*arr)[6] = hregX86_FAKE0();
   (*arr)[7] = hregX86_FAKE1();
   (*arr)[8] = hregX86_FAKE2();
   (*arr)[9] = hregX86_FAKE3();
   (*arr)[10] = hregX86_FAKE4();
   (*arr)[11] = hregX86_FAKE5();
   (*arr)[12] = hregX86_XMM0();
   (*arr)[13] = hregX86_XMM1();
   (*arr)[14] = hregX86_XMM2();
   (*arr)[15] = hregX86_XMM3();
   (*arr)[16] = hregX86_XMM4();
   (*arr)[17] = hregX86_XMM5();
   (*arr)[18] = hregX86_XMM6();
   (*arr)[19] = hregX86_XMM7();
}


/* --------- Condition codes, Intel encoding. --------- */

HChar* showX86CondCode ( X86CondCode cond )
{
   switch (cond) {
      case Xcc_O:      return "o";
      case Xcc_NO:     return "no";
      case Xcc_B:      return "b";
      case Xcc_NB:     return "nb";
      case Xcc_Z:      return "z";
      case Xcc_NZ:     return "nz";
      case Xcc_BE:     return "be";
      case Xcc_NBE:    return "nbe";
      case Xcc_S:      return "s";
      case Xcc_NS:     return "ns";
      case Xcc_P:      return "p";
      case Xcc_NP:     return "np";
      case Xcc_L:      return "l";
      case Xcc_NL:     return "nl";
      case Xcc_LE:     return "le";
      case Xcc_NLE:    return "nle";
      case Xcc_ALWAYS: return "ALWAYS";
      default: vpanic("showX86CondCode");
   }
}


/* --------- X86AMode: memory address expressions. --------- */

X86AMode* X86AMode_IR ( UInt imm32, HReg reg ) {
   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
   am->tag = Xam_IR;
   am->Xam.IR.imm = imm32;
   am->Xam.IR.reg = reg;
   return am;
}
X86AMode* X86AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
   am->tag = Xam_IRRS;
   am->Xam.IRRS.imm = imm32;
   am->Xam.IRRS.base = base;
   am->Xam.IRRS.index = indEx;
   am->Xam.IRRS.shift = shift;
   vassert(shift >= 0 && shift <= 3);
   return am;
}
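/* For instance (illustrative only), the amode 0x18(%ebp,%esi,4)
   would be built as X86AMode_IRRS(0x18, hregX86_EBP(), hregX86_ESI(), 2):
   'shift' is the log2 of the index scale, which is why ppX86AMode
   below prints 1 << shift. */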

X86AMode* dopyX86AMode ( X86AMode* am ) {
   switch (am->tag) {
      case Xam_IR:
         return X86AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
      case Xam_IRRS:
         return X86AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
                               am->Xam.IRRS.index, am->Xam.IRRS.shift );
      default:
         vpanic("dopyX86AMode");
   }
}

void ppX86AMode ( X86AMode* am ) {
   switch (am->tag) {
      case Xam_IR:
         if (am->Xam.IR.imm == 0)
            vex_printf("(");
         else
            vex_printf("0x%x(", am->Xam.IR.imm);
         ppHRegX86(am->Xam.IR.reg);
         vex_printf(")");
         return;
      case Xam_IRRS:
         vex_printf("0x%x(", am->Xam.IRRS.imm);
         ppHRegX86(am->Xam.IRRS.base);
         vex_printf(",");
         ppHRegX86(am->Xam.IRRS.index);
         vex_printf(",%d)", 1 << am->Xam.IRRS.shift);
         return;
      default:
         vpanic("ppX86AMode");
   }
}

static void addRegUsage_X86AMode ( HRegUsage* u, X86AMode* am ) {
   switch (am->tag) {
      case Xam_IR:
         addHRegUse(u, HRmRead, am->Xam.IR.reg);
         return;
      case Xam_IRRS:
         addHRegUse(u, HRmRead, am->Xam.IRRS.base);
         addHRegUse(u, HRmRead, am->Xam.IRRS.index);
         return;
      default:
         vpanic("addRegUsage_X86AMode");
   }
}

static void mapRegs_X86AMode ( HRegRemap* m, X86AMode* am ) {
   switch (am->tag) {
      case Xam_IR:
         am->Xam.IR.reg = lookupHRegRemap(m, am->Xam.IR.reg);
         return;
      case Xam_IRRS:
         am->Xam.IRRS.base = lookupHRegRemap(m, am->Xam.IRRS.base);
         am->Xam.IRRS.index = lookupHRegRemap(m, am->Xam.IRRS.index);
         return;
      default:
         vpanic("mapRegs_X86AMode");
   }
}

/* --------- Operand, which can be reg, immediate or memory. --------- */

X86RMI* X86RMI_Imm ( UInt imm32 ) {
   X86RMI* op         = LibVEX_Alloc(sizeof(X86RMI));
   op->tag            = Xrmi_Imm;
   op->Xrmi.Imm.imm32 = imm32;
   return op;
}
X86RMI* X86RMI_Reg ( HReg reg ) {
   X86RMI* op       = LibVEX_Alloc(sizeof(X86RMI));
   op->tag          = Xrmi_Reg;
   op->Xrmi.Reg.reg = reg;
   return op;
}
X86RMI* X86RMI_Mem ( X86AMode* am ) {
   X86RMI* op      = LibVEX_Alloc(sizeof(X86RMI));
   op->tag         = Xrmi_Mem;
   op->Xrmi.Mem.am = am;
   return op;
}

void ppX86RMI ( X86RMI* op ) {
   switch (op->tag) {
      case Xrmi_Imm:
         vex_printf("$0x%x", op->Xrmi.Imm.imm32);
         return;
      case Xrmi_Reg:
         ppHRegX86(op->Xrmi.Reg.reg);
         return;
      case Xrmi_Mem:
         ppX86AMode(op->Xrmi.Mem.am);
         return;
      default:
         vpanic("ppX86RMI");
   }
}

/* An X86RMI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_X86RMI ( HRegUsage* u, X86RMI* op ) {
   switch (op->tag) {
      case Xrmi_Imm:
         return;
      case Xrmi_Reg:
         addHRegUse(u, HRmRead, op->Xrmi.Reg.reg);
         return;
      case Xrmi_Mem:
         addRegUsage_X86AMode(u, op->Xrmi.Mem.am);
         return;
      default:
         vpanic("addRegUsage_X86RMI");
   }
}

static void mapRegs_X86RMI ( HRegRemap* m, X86RMI* op ) {
   switch (op->tag) {
      case Xrmi_Imm:
         return;
      case Xrmi_Reg:
         op->Xrmi.Reg.reg = lookupHRegRemap(m, op->Xrmi.Reg.reg);
         return;
      case Xrmi_Mem:
         mapRegs_X86AMode(m, op->Xrmi.Mem.am);
         return;
      default:
         vpanic("mapRegs_X86RMI");
   }
}


/* --------- Operand, which can be reg or immediate only. --------- */

X86RI* X86RI_Imm ( UInt imm32 ) {
   X86RI* op         = LibVEX_Alloc(sizeof(X86RI));
   op->tag           = Xri_Imm;
   op->Xri.Imm.imm32 = imm32;
   return op;
}
X86RI* X86RI_Reg ( HReg reg ) {
   X86RI* op       = LibVEX_Alloc(sizeof(X86RI));
   op->tag         = Xri_Reg;
   op->Xri.Reg.reg = reg;
   return op;
}

void ppX86RI ( X86RI* op ) {
   switch (op->tag) {
      case Xri_Imm:
         vex_printf("$0x%x", op->Xri.Imm.imm32);
         return;
      case Xri_Reg:
         ppHRegX86(op->Xri.Reg.reg);
         return;
      default:
         vpanic("ppX86RI");
   }
}

/* An X86RI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_X86RI ( HRegUsage* u, X86RI* op ) {
   switch (op->tag) {
      case Xri_Imm:
         return;
      case Xri_Reg:
         addHRegUse(u, HRmRead, op->Xri.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_X86RI");
   }
}

static void mapRegs_X86RI ( HRegRemap* m, X86RI* op ) {
   switch (op->tag) {
      case Xri_Imm:
         return;
      case Xri_Reg:
         op->Xri.Reg.reg = lookupHRegRemap(m, op->Xri.Reg.reg);
         return;
      default:
         vpanic("mapRegs_X86RI");
   }
}


/* --------- Operand, which can be reg or memory only. --------- */

X86RM* X86RM_Reg ( HReg reg ) {
   X86RM* op       = LibVEX_Alloc(sizeof(X86RM));
   op->tag         = Xrm_Reg;
   op->Xrm.Reg.reg = reg;
   return op;
}
X86RM* X86RM_Mem ( X86AMode* am ) {
   X86RM* op      = LibVEX_Alloc(sizeof(X86RM));
   op->tag        = Xrm_Mem;
   op->Xrm.Mem.am = am;
   return op;
}

void ppX86RM ( X86RM* op ) {
   switch (op->tag) {
      case Xrm_Mem:
         ppX86AMode(op->Xrm.Mem.am);
         return;
      case Xrm_Reg:
         ppHRegX86(op->Xrm.Reg.reg);
         return;
      default:
         vpanic("ppX86RM");
   }
}

/* Because an X86RM can be both a source or destination operand, we
   have to supply a mode -- pertaining to the operand as a whole --
   indicating how it's being used. */
static void addRegUsage_X86RM ( HRegUsage* u, X86RM* op, HRegMode mode ) {
   switch (op->tag) {
      case Xrm_Mem:
         /* Memory is read, written or modified.  So we just want to
            know the regs read by the amode. */
         addRegUsage_X86AMode(u, op->Xrm.Mem.am);
         return;
      case Xrm_Reg:
         /* reg is read, written or modified.  Add it in the
            appropriate way. */
         addHRegUse(u, mode, op->Xrm.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_X86RM");
   }
}

static void mapRegs_X86RM ( HRegRemap* m, X86RM* op )
{
   switch (op->tag) {
      case Xrm_Mem:
         mapRegs_X86AMode(m, op->Xrm.Mem.am);
         return;
      case Xrm_Reg:
         op->Xrm.Reg.reg = lookupHRegRemap(m, op->Xrm.Reg.reg);
         return;
      default:
         vpanic("mapRegs_X86RM");
   }
}


/* --------- Instructions. --------- */

HChar* showX86UnaryOp ( X86UnaryOp op ) {
   switch (op) {
      case Xun_NOT: return "not";
      case Xun_NEG: return "neg";
      default: vpanic("showX86UnaryOp");
   }
}

HChar* showX86AluOp ( X86AluOp op ) {
   switch (op) {
      case Xalu_MOV:  return "mov";
      case Xalu_CMP:  return "cmp";
      case Xalu_ADD:  return "add";
      case Xalu_SUB:  return "sub";
      case Xalu_ADC:  return "adc";
      case Xalu_SBB:  return "sbb";
      case Xalu_AND:  return "and";
      case Xalu_OR:   return "or";
      case Xalu_XOR:  return "xor";
      case Xalu_MUL:  return "mul";
      default: vpanic("showX86AluOp");
   }
}

HChar* showX86ShiftOp ( X86ShiftOp op ) {
   switch (op) {
      case Xsh_SHL: return "shl";
      case Xsh_SHR: return "shr";
      case Xsh_SAR: return "sar";
      default: vpanic("showX86ShiftOp");
   }
}

HChar* showX86FpOp ( X86FpOp op ) {
   switch (op) {
      case Xfp_ADD:    return "add";
      case Xfp_SUB:    return "sub";
      case Xfp_MUL:    return "mul";
      case Xfp_DIV:    return "div";
      case Xfp_SCALE:  return "scale";
      case Xfp_ATAN:   return "atan";
      case Xfp_YL2X:   return "yl2x";
      case Xfp_YL2XP1: return "yl2xp1";
      case Xfp_PREM:   return "prem";
      case Xfp_PREM1:  return "prem1";
      case Xfp_SQRT:   return "sqrt";
      case Xfp_ABS:    return "abs";
      case Xfp_NEG:    return "chs";
      case Xfp_MOV:    return "mov";
      case Xfp_SIN:    return "sin";
      case Xfp_COS:    return "cos";
      case Xfp_TAN:    return "tan";
      case Xfp_ROUND:  return "round";
      case Xfp_2XM1:   return "2xm1";
      default: vpanic("showX86FpOp");
   }
}

HChar* showX86SseOp ( X86SseOp op ) {
   switch (op) {
      case Xsse_MOV:      return "mov(?!)";
      case Xsse_ADDF:     return "add";
      case Xsse_SUBF:     return "sub";
      case Xsse_MULF:     return "mul";
      case Xsse_DIVF:     return "div";
      case Xsse_MAXF:     return "max";
      case Xsse_MINF:     return "min";
      case Xsse_CMPEQF:   return "cmpFeq";
      case Xsse_CMPLTF:   return "cmpFlt";
      case Xsse_CMPLEF:   return "cmpFle";
      case Xsse_CMPUNF:   return "cmpFun";
      case Xsse_RCPF:     return "rcp";
      case Xsse_RSQRTF:   return "rsqrt";
      case Xsse_SQRTF:    return "sqrt";
      case Xsse_AND:      return "and";
      case Xsse_OR:       return "or";
      case Xsse_XOR:      return "xor";
      case Xsse_ANDN:     return "andn";
      case Xsse_ADD8:     return "paddb";
      case Xsse_ADD16:    return "paddw";
      case Xsse_ADD32:    return "paddd";
      case Xsse_ADD64:    return "paddq";
      case Xsse_QADD8U:   return "paddusb";
      case Xsse_QADD16U:  return "paddusw";
      case Xsse_QADD8S:   return "paddsb";
      case Xsse_QADD16S:  return "paddsw";
      case Xsse_SUB8:     return "psubb";
      case Xsse_SUB16:    return "psubw";
      case Xsse_SUB32:    return "psubd";
      case Xsse_SUB64:    return "psubq";
      case Xsse_QSUB8U:   return "psubusb";
      case Xsse_QSUB16U:  return "psubusw";
      case Xsse_QSUB8S:   return "psubsb";
      case Xsse_QSUB16S:  return "psubsw";
      case Xsse_MUL16:    return "pmullw";
      case Xsse_MULHI16U: return "pmulhuw";
      case Xsse_MULHI16S: return "pmulhw";
      case Xsse_AVG8U:    return "pavgb";
      case Xsse_AVG16U:   return "pavgw";
      case Xsse_MAX16S:   return "pmaxw";
      case Xsse_MAX8U:    return "pmaxub";
      case Xsse_MIN16S:   return "pminw";
      case Xsse_MIN8U:    return "pminub";
      case Xsse_CMPEQ8:   return "pcmpeqb";
      case Xsse_CMPEQ16:  return "pcmpeqw";
      case Xsse_CMPEQ32:  return "pcmpeqd";
      case Xsse_CMPGT8S:  return "pcmpgtb";
      case Xsse_CMPGT16S: return "pcmpgtw";
      case Xsse_CMPGT32S: return "pcmpgtd";
      case Xsse_SHL16:    return "psllw";
      case Xsse_SHL32:    return "pslld";
      case Xsse_SHL64:    return "psllq";
      case Xsse_SHR16:    return "psrlw";
      case Xsse_SHR32:    return "psrld";
      case Xsse_SHR64:    return "psrlq";
      case Xsse_SAR16:    return "psraw";
      case Xsse_SAR32:    return "psrad";
      case Xsse_PACKSSD:  return "packssdw";
      case Xsse_PACKSSW:  return "packsswb";
      case Xsse_PACKUSW:  return "packuswb";
      case Xsse_UNPCKHB:  return "punpckhb";
      case Xsse_UNPCKHW:  return "punpckhw";
      case Xsse_UNPCKHD:  return "punpckhd";
      case Xsse_UNPCKHQ:  return "punpckhq";
      case Xsse_UNPCKLB:  return "punpcklb";
      case Xsse_UNPCKLW:  return "punpcklw";
      case Xsse_UNPCKLD:  return "punpckld";
      case Xsse_UNPCKLQ:  return "punpcklq";
      default: vpanic("showX86SseOp");
   }
}

X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Alu32R;
   i->Xin.Alu32R.op  = op;
   i->Xin.Alu32R.src = src;
   i->Xin.Alu32R.dst = dst;
   return i;
}
X86Instr* X86Instr_Alu32M ( X86AluOp op, X86RI* src, X86AMode* dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Alu32M;
   i->Xin.Alu32M.op  = op;
   i->Xin.Alu32M.src = src;
   i->Xin.Alu32M.dst = dst;
   vassert(op != Xalu_MUL);
   return i;
}
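/* In Sh32, a 'src' (shift amount) of zero does not mean "shift by
   zero"; by convention it means the amount is in %cl.  Both
   ppX86Instr and getRegUsage_X86Instr below rely on this. */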
X86Instr* X86Instr_Sh32 ( X86ShiftOp op, UInt src, HReg dst ) {
   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
   i->tag          = Xin_Sh32;
   i->Xin.Sh32.op  = op;
   i->Xin.Sh32.src = src;
   i->Xin.Sh32.dst = dst;
   return i;
}
X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Test32;
   i->Xin.Test32.imm32 = imm32;
   i->Xin.Test32.dst   = dst;
   return i;
}
X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_Unary32;
   i->Xin.Unary32.op  = op;
   i->Xin.Unary32.dst = dst;
   return i;
}
X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_Lea32;
   i->Xin.Lea32.am    = am;
   i->Xin.Lea32.dst   = dst;
   return i;
}
X86Instr* X86Instr_MulL ( Bool syned, X86RM* src ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_MulL;
   i->Xin.MulL.syned  = syned;
   i->Xin.MulL.src    = src;
   return i;
}
X86Instr* X86Instr_Div ( Bool syned, X86RM* src ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_Div;
   i->Xin.Div.syned = syned;
   i->Xin.Div.src   = src;
   return i;
}
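/* As with Sh32 above, an 'amt' of zero in Sh3232 means the shift
   amount is taken from %cl. */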
X86Instr* X86Instr_Sh3232  ( X86ShiftOp op, UInt amt, HReg src, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Sh3232;
   i->Xin.Sh3232.op  = op;
   i->Xin.Sh3232.amt = amt;
   i->Xin.Sh3232.src = src;
   i->Xin.Sh3232.dst = dst;
   vassert(op == Xsh_SHL || op == Xsh_SHR);
   return i;
}
X86Instr* X86Instr_Push( X86RMI* src ) {
   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
   i->tag          = Xin_Push;
   i->Xin.Push.src = src;
   return i;
}
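/* 'regparms' is the number of arguments passed in registers rather
   than on the stack -- the first three go in %eax, %edx and %ecx in
   that order (presumably the gcc "regparm" convention; see the
   parameter-usage logic in getRegUsage_X86Instr below). */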
X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_Call;
   i->Xin.Call.cond     = cond;
   i->Xin.Call.target   = target;
   i->Xin.Call.regparms = regparms;
   vassert(regparms >= 0 && regparms <= 3);
   return i;
}
X86Instr* X86Instr_Goto ( IRJumpKind jk, X86CondCode cond, X86RI* dst ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_Goto;
   i->Xin.Goto.cond = cond;
   i->Xin.Goto.dst  = dst;
   i->Xin.Goto.jk   = jk;
   return i;
}
X86Instr* X86Instr_CMov32  ( X86CondCode cond, X86RM* src, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_CMov32;
   i->Xin.CMov32.cond = cond;
   i->Xin.CMov32.src  = src;
   i->Xin.CMov32.dst  = dst;
   vassert(cond != Xcc_ALWAYS);
   return i;
}
X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
                            X86AMode* src, HReg dst ) {
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_LoadEX;
   i->Xin.LoadEX.szSmall = szSmall;
   i->Xin.LoadEX.syned   = syned;
   i->Xin.LoadEX.src     = src;
   i->Xin.LoadEX.dst     = dst;
   vassert(szSmall == 1 || szSmall == 2);
   return i;
}
X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_Store;
   i->Xin.Store.sz  = sz;
   i->Xin.Store.src = src;
   i->Xin.Store.dst = dst;
   vassert(sz == 1 || sz == 2);
   return i;
}
X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Set32;
   i->Xin.Set32.cond = cond;
   i->Xin.Set32.dst  = dst;
   return i;
}
X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_Bsfr32;
   i->Xin.Bsfr32.isFwds = isFwds;
   i->Xin.Bsfr32.src    = src;
   i->Xin.Bsfr32.dst    = dst;
   return i;
}
X86Instr* X86Instr_MFence ( UInt hwcaps ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_MFence;
   i->Xin.MFence.hwcaps = hwcaps;
   vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_SSE1
                            |VEX_HWCAPS_X86_SSE2
                            |VEX_HWCAPS_X86_SSE3
                            |VEX_HWCAPS_X86_LZCNT)));
   return i;
}
X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_ACAS;
   i->Xin.ACAS.addr = addr;
   i->Xin.ACAS.sz   = sz;
   vassert(sz == 4 || sz == 2 || sz == 1);
   return i;
}
X86Instr* X86Instr_DACAS ( X86AMode* addr ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_DACAS;
   i->Xin.DACAS.addr = addr;
   return i;
}
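/* As the printer below makes explicit: ACAS is "lock cmpxchg" with
   the expected old value in %eax and the new value in %ebx, while
   DACAS is "lock cmpxchg8b" with the expected value in %edx:%eax and
   the new value in %ecx:%ebx. */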

X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_FpUnary;
   i->Xin.FpUnary.op  = op;
   i->Xin.FpUnary.src = src;
   i->Xin.FpUnary.dst = dst;
   return i;
}
X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_FpBinary;
   i->Xin.FpBinary.op   = op;
   i->Xin.FpBinary.srcL = srcL;
   i->Xin.FpBinary.srcR = srcR;
   i->Xin.FpBinary.dst  = dst;
   return i;
}
X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_FpLdSt;
   i->Xin.FpLdSt.isLoad = isLoad;
   i->Xin.FpLdSt.sz     = sz;
   i->Xin.FpLdSt.reg    = reg;
   i->Xin.FpLdSt.addr   = addr;
   vassert(sz == 4 || sz == 8 || sz == 10);
   return i;
}
X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
                             HReg reg, X86AMode* addr ) {
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_FpLdStI;
   i->Xin.FpLdStI.isLoad = isLoad;
   i->Xin.FpLdStI.sz     = sz;
   i->Xin.FpLdStI.reg    = reg;
   i->Xin.FpLdStI.addr   = addr;
   vassert(sz == 2 || sz == 4 || sz == 8);
   return i;
}
X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Fp64to32;
   i->Xin.Fp64to32.src = src;
   i->Xin.Fp64to32.dst = dst;
   return i;
}
X86Instr* X86Instr_FpCMov ( X86CondCode cond, HReg src, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_FpCMov;
   i->Xin.FpCMov.cond = cond;
   i->Xin.FpCMov.src  = src;
   i->Xin.FpCMov.dst  = dst;
   vassert(cond != Xcc_ALWAYS);
   return i;
}
X86Instr* X86Instr_FpLdCW ( X86AMode* addr ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_FpLdCW;
   i->Xin.FpLdCW.addr   = addr;
   return i;
}
X86Instr* X86Instr_FpStSW_AX ( void ) {
   X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
   i->tag      = Xin_FpStSW_AX;
   return i;
}
X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_FpCmp;
   i->Xin.FpCmp.srcL = srcL;
   i->Xin.FpCmp.srcR = srcR;
   i->Xin.FpCmp.dst  = dst;
   return i;
}

X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
   X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                 = Xin_SseConst;
   i->Xin.SseConst.con    = con;
   i->Xin.SseConst.dst    = dst;
   vassert(hregClass(dst) == HRcVec128);
   return i;
}
X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg reg, X86AMode* addr ) {
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_SseLdSt;
   i->Xin.SseLdSt.isLoad = isLoad;
   i->Xin.SseLdSt.reg    = reg;
   i->Xin.SseLdSt.addr   = addr;
   return i;
}
X86Instr* X86Instr_SseLdzLO  ( Int sz, HReg reg, X86AMode* addr )
{
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_SseLdzLO;
   i->Xin.SseLdzLO.sz    = toUChar(sz);
   i->Xin.SseLdzLO.reg   = reg;
   i->Xin.SseLdzLO.addr  = addr;
   vassert(sz == 4 || sz == 8);
   return i;
}
X86Instr* X86Instr_Sse32Fx4 ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse32Fx4;
   i->Xin.Sse32Fx4.op  = op;
   i->Xin.Sse32Fx4.src = src;
   i->Xin.Sse32Fx4.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
X86Instr* X86Instr_Sse32FLo ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse32FLo;
   i->Xin.Sse32FLo.op  = op;
   i->Xin.Sse32FLo.src = src;
   i->Xin.Sse32FLo.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
X86Instr* X86Instr_Sse64Fx2 ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse64Fx2;
   i->Xin.Sse64Fx2.op  = op;
   i->Xin.Sse64Fx2.src = src;
   i->Xin.Sse64Fx2.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
X86Instr* X86Instr_Sse64FLo ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse64FLo;
   i->Xin.Sse64FLo.op  = op;
   i->Xin.Sse64FLo.src = src;
   i->Xin.Sse64FLo.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
X86Instr* X86Instr_SseReRg ( X86SseOp op, HReg re, HReg rg ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_SseReRg;
   i->Xin.SseReRg.op  = op;
   i->Xin.SseReRg.src = re;
   i->Xin.SseReRg.dst = rg;
   return i;
}
X86Instr* X86Instr_SseCMov ( X86CondCode cond, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_SseCMov;
   i->Xin.SseCMov.cond = cond;
   i->Xin.SseCMov.src  = src;
   i->Xin.SseCMov.dst  = dst;
   vassert(cond != Xcc_ALWAYS);
   return i;
}
X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_SseShuf;
   i->Xin.SseShuf.order = order;
   i->Xin.SseShuf.src   = src;
   i->Xin.SseShuf.dst   = dst;
   vassert(order >= 0 && order <= 0xFF);
   return i;
}
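/* Illustrative sketch (not part of the build): the constructors
   above compose in the obvious way.  For example, "addl $42,%eax"
   could be built as

      X86Instr* i1 = X86Instr_Alu32R( Xalu_ADD,
                                      X86RMI_Imm(42),
                                      hregX86_EAX() );

   and the load "movl 4(%esi),%edi" as

      X86Instr* i2 = X86Instr_Alu32R(
                        Xalu_MOV,
                        X86RMI_Mem(X86AMode_IR(4, hregX86_ESI())),
                        hregX86_EDI() );

   ppX86Instr below would render these as "addl $0x2a,%eax" and
   "movl 0x4(%esi),%edi" respectively. */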

void ppX86Instr ( X86Instr* i, Bool mode64 ) {
   vassert(mode64 == False);
   switch (i->tag) {
      case Xin_Alu32R:
         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32R.op));
         ppX86RMI(i->Xin.Alu32R.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Alu32R.dst);
         return;
      case Xin_Alu32M:
         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32M.op));
         ppX86RI(i->Xin.Alu32M.src);
         vex_printf(",");
         ppX86AMode(i->Xin.Alu32M.dst);
         return;
      case Xin_Sh32:
         vex_printf("%sl ", showX86ShiftOp(i->Xin.Sh32.op));
         if (i->Xin.Sh32.src == 0)
            vex_printf("%%cl,");
         else
            vex_printf("$%d,", (Int)i->Xin.Sh32.src);
         ppHRegX86(i->Xin.Sh32.dst);
         return;
      case Xin_Test32:
         vex_printf("testl $%d,", (Int)i->Xin.Test32.imm32);
         ppX86RM(i->Xin.Test32.dst);
         return;
      case Xin_Unary32:
         vex_printf("%sl ", showX86UnaryOp(i->Xin.Unary32.op));
         ppHRegX86(i->Xin.Unary32.dst);
         return;
      case Xin_Lea32:
         vex_printf("leal ");
         ppX86AMode(i->Xin.Lea32.am);
         vex_printf(",");
         ppHRegX86(i->Xin.Lea32.dst);
         return;
      case Xin_MulL:
         vex_printf("%cmull ", i->Xin.MulL.syned ? 's' : 'u');
         ppX86RM(i->Xin.MulL.src);
         return;
      case Xin_Div:
         vex_printf("%cdivl ", i->Xin.Div.syned ? 's' : 'u');
         ppX86RM(i->Xin.Div.src);
         return;
      case Xin_Sh3232:
         vex_printf("%sdl ", showX86ShiftOp(i->Xin.Sh3232.op));
         if (i->Xin.Sh3232.amt == 0)
            vex_printf(" %%cl,");
         else
            vex_printf(" $%d,", (Int)i->Xin.Sh3232.amt);
         ppHRegX86(i->Xin.Sh3232.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Sh3232.dst);
         return;
      case Xin_Push:
         vex_printf("pushl ");
         ppX86RMI(i->Xin.Push.src);
         return;
      case Xin_Call:
         vex_printf("call%s[%d] ",
                    i->Xin.Call.cond==Xcc_ALWAYS
                       ? "" : showX86CondCode(i->Xin.Call.cond),
                    i->Xin.Call.regparms);
         vex_printf("0x%x", i->Xin.Call.target);
         break;
      case Xin_Goto:
         if (i->Xin.Goto.cond != Xcc_ALWAYS) {
            vex_printf("if (%%eflags.%s) { ",
                       showX86CondCode(i->Xin.Goto.cond));
         }
         if (i->Xin.Goto.jk != Ijk_Boring
             && i->Xin.Goto.jk != Ijk_Call
             && i->Xin.Goto.jk != Ijk_Ret) {
            vex_printf("movl $");
            ppIRJumpKind(i->Xin.Goto.jk);
            vex_printf(",%%ebp ; ");
         }
         vex_printf("movl ");
         ppX86RI(i->Xin.Goto.dst);
         vex_printf(",%%eax ; movl $dispatcher_addr,%%edx ; jmp *%%edx");
         if (i->Xin.Goto.cond != Xcc_ALWAYS) {
            vex_printf(" }");
         }
         return;
      case Xin_CMov32:
         vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
         ppX86RM(i->Xin.CMov32.src);
         vex_printf(",");
         ppHRegX86(i->Xin.CMov32.dst);
         return;
      case Xin_LoadEX:
         vex_printf("mov%c%cl ",
                    i->Xin.LoadEX.syned ? 's' : 'z',
                    i->Xin.LoadEX.szSmall==1 ? 'b' : 'w');
         ppX86AMode(i->Xin.LoadEX.src);
         vex_printf(",");
         ppHRegX86(i->Xin.LoadEX.dst);
         return;
      case Xin_Store:
         vex_printf("mov%c ", i->Xin.Store.sz==1 ? 'b' : 'w');
         ppHRegX86(i->Xin.Store.src);
         vex_printf(",");
         ppX86AMode(i->Xin.Store.dst);
         return;
      case Xin_Set32:
         vex_printf("setl%s ", showX86CondCode(i->Xin.Set32.cond));
         ppHRegX86(i->Xin.Set32.dst);
         return;
      case Xin_Bsfr32:
         vex_printf("bs%cl ", i->Xin.Bsfr32.isFwds ? 'f' : 'r');
         ppHRegX86(i->Xin.Bsfr32.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Bsfr32.dst);
         return;
      case Xin_MFence:
         vex_printf("mfence(%s)",
                    LibVEX_ppVexHwCaps(VexArchX86,i->Xin.MFence.hwcaps));
         return;
      case Xin_ACAS:
         vex_printf("lock cmpxchg%c ",
                     i->Xin.ACAS.sz==1 ? 'b'
                                       : i->Xin.ACAS.sz==2 ? 'w' : 'l');
         vex_printf("{%%eax->%%ebx},");
         ppX86AMode(i->Xin.ACAS.addr);
         return;
      case Xin_DACAS:
         vex_printf("lock cmpxchg8b {%%edx:%%eax->%%ecx:%%ebx},");
         ppX86AMode(i->Xin.DACAS.addr);
         return;
      case Xin_FpUnary:
         vex_printf("g%sD ", showX86FpOp(i->Xin.FpUnary.op));
         ppHRegX86(i->Xin.FpUnary.src);
         vex_printf(",");
         ppHRegX86(i->Xin.FpUnary.dst);
         break;
      case Xin_FpBinary:
         vex_printf("g%sD ", showX86FpOp(i->Xin.FpBinary.op));
         ppHRegX86(i->Xin.FpBinary.srcL);
         vex_printf(",");
         ppHRegX86(i->Xin.FpBinary.srcR);
         vex_printf(",");
         ppHRegX86(i->Xin.FpBinary.dst);
         break;
      case Xin_FpLdSt:
         if (i->Xin.FpLdSt.isLoad) {
            vex_printf("gld%c ", i->Xin.FpLdSt.sz==10 ? 'T'
                                 : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
            ppX86AMode(i->Xin.FpLdSt.addr);
            vex_printf(", ");
            ppHRegX86(i->Xin.FpLdSt.reg);
         } else {
            vex_printf("gst%c ", i->Xin.FpLdSt.sz==10 ? 'T'
                                 : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
            ppHRegX86(i->Xin.FpLdSt.reg);
            vex_printf(", ");
            ppX86AMode(i->Xin.FpLdSt.addr);
         }
         return;
      case Xin_FpLdStI:
         if (i->Xin.FpLdStI.isLoad) {
            vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
            ppX86AMode(i->Xin.FpLdStI.addr);
            vex_printf(", ");
            ppHRegX86(i->Xin.FpLdStI.reg);
         } else {
            vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
            ppHRegX86(i->Xin.FpLdStI.reg);
            vex_printf(", ");
            ppX86AMode(i->Xin.FpLdStI.addr);
         }
         return;
      case Xin_Fp64to32:
         vex_printf("gdtof ");
         ppHRegX86(i->Xin.Fp64to32.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Fp64to32.dst);
         return;
      case Xin_FpCMov:
         vex_printf("gcmov%s ", showX86CondCode(i->Xin.FpCMov.cond));
         ppHRegX86(i->Xin.FpCMov.src);
         vex_printf(",");
         ppHRegX86(i->Xin.FpCMov.dst);
         return;
      case Xin_FpLdCW:
         vex_printf("fldcw ");
         ppX86AMode(i->Xin.FpLdCW.addr);
         return;
      case Xin_FpStSW_AX:
         vex_printf("fstsw %%ax");
         return;
      case Xin_FpCmp:
         vex_printf("gcmp ");
         ppHRegX86(i->Xin.FpCmp.srcL);
         vex_printf(",");
         ppHRegX86(i->Xin.FpCmp.srcR);
         vex_printf(",");
         ppHRegX86(i->Xin.FpCmp.dst);
         break;
      case Xin_SseConst:
         vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
         ppHRegX86(i->Xin.SseConst.dst);
         break;
      case Xin_SseLdSt:
         vex_printf("movups ");
         if (i->Xin.SseLdSt.isLoad) {
            ppX86AMode(i->Xin.SseLdSt.addr);
            vex_printf(",");
            ppHRegX86(i->Xin.SseLdSt.reg);
         } else {
            ppHRegX86(i->Xin.SseLdSt.reg);
            vex_printf(",");
            ppX86AMode(i->Xin.SseLdSt.addr);
         }
         return;
      case Xin_SseLdzLO:
         vex_printf("movs%s ", i->Xin.SseLdzLO.sz==4 ? "s" : "d");
         ppX86AMode(i->Xin.SseLdzLO.addr);
         vex_printf(",");
         ppHRegX86(i->Xin.SseLdzLO.reg);
         return;
      case Xin_Sse32Fx4:
         vex_printf("%sps ", showX86SseOp(i->Xin.Sse32Fx4.op));
         ppHRegX86(i->Xin.Sse32Fx4.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Sse32Fx4.dst);
         return;
      case Xin_Sse32FLo:
         vex_printf("%sss ", showX86SseOp(i->Xin.Sse32FLo.op));
         ppHRegX86(i->Xin.Sse32FLo.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Sse32FLo.dst);
         return;
      case Xin_Sse64Fx2:
         vex_printf("%spd ", showX86SseOp(i->Xin.Sse64Fx2.op));
         ppHRegX86(i->Xin.Sse64Fx2.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Sse64Fx2.dst);
         return;
      case Xin_Sse64FLo:
         vex_printf("%ssd ", showX86SseOp(i->Xin.Sse64FLo.op));
         ppHRegX86(i->Xin.Sse64FLo.src);
         vex_printf(",");
         ppHRegX86(i->Xin.Sse64FLo.dst);
         return;
      case Xin_SseReRg:
         vex_printf("%s ", showX86SseOp(i->Xin.SseReRg.op));
         ppHRegX86(i->Xin.SseReRg.src);
         vex_printf(",");
         ppHRegX86(i->Xin.SseReRg.dst);
         return;
      case Xin_SseCMov:
         vex_printf("cmov%s ", showX86CondCode(i->Xin.SseCMov.cond));
         ppHRegX86(i->Xin.SseCMov.src);
         vex_printf(",");
         ppHRegX86(i->Xin.SseCMov.dst);
         return;
      case Xin_SseShuf:
         vex_printf("pshufd $0x%x,", i->Xin.SseShuf.order);
         ppHRegX86(i->Xin.SseShuf.src);
         vex_printf(",");
         ppHRegX86(i->Xin.SseShuf.dst);
         return;

      default:
         vpanic("ppX86Instr");
   }
}

/* --------- Helpers for register allocation. --------- */

void getRegUsage_X86Instr (HRegUsage* u, X86Instr* i, Bool mode64)
{
   Bool unary;
   vassert(mode64 == False);
   initHRegUsage(u);
   switch (i->tag) {
      case Xin_Alu32R:
         addRegUsage_X86RMI(u, i->Xin.Alu32R.src);
         if (i->Xin.Alu32R.op == Xalu_MOV) {
            addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst);
            return;
         }
         if (i->Xin.Alu32R.op == Xalu_CMP) {
            addHRegUse(u, HRmRead, i->Xin.Alu32R.dst);
            return;
         }
         addHRegUse(u, HRmModify, i->Xin.Alu32R.dst);
         return;
      case Xin_Alu32M:
         addRegUsage_X86RI(u, i->Xin.Alu32M.src);
         addRegUsage_X86AMode(u, i->Xin.Alu32M.dst);
         return;
      case Xin_Sh32:
         addHRegUse(u, HRmModify, i->Xin.Sh32.dst);
         if (i->Xin.Sh32.src == 0)
            addHRegUse(u, HRmRead, hregX86_ECX());
         return;
      case Xin_Test32:
         addRegUsage_X86RM(u, i->Xin.Test32.dst, HRmRead);
         return;
      case Xin_Unary32:
         addHRegUse(u, HRmModify, i->Xin.Unary32.dst);
         return;
      case Xin_Lea32:
         addRegUsage_X86AMode(u, i->Xin.Lea32.am);
         addHRegUse(u, HRmWrite, i->Xin.Lea32.dst);
         return;
      case Xin_MulL:
         addRegUsage_X86RM(u, i->Xin.MulL.src, HRmRead);
         addHRegUse(u, HRmModify, hregX86_EAX());
         addHRegUse(u, HRmWrite, hregX86_EDX());
         return;
      case Xin_Div:
         addRegUsage_X86RM(u, i->Xin.Div.src, HRmRead);
         addHRegUse(u, HRmModify, hregX86_EAX());
         addHRegUse(u, HRmModify, hregX86_EDX());
         return;
      case Xin_Sh3232:
         addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
         addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
         if (i->Xin.Sh3232.amt == 0)
            addHRegUse(u, HRmRead, hregX86_ECX());
         return;
      case Xin_Push:
         addRegUsage_X86RMI(u, i->Xin.Push.src);
         addHRegUse(u, HRmModify, hregX86_ESP());
         return;
      case Xin_Call:
         /* This is a bit subtle. */
         /* First off, claim it trashes all the caller-saved regs
            which fall within the register allocator's jurisdiction.
            These I believe to be %eax %ecx %edx and all the xmm
            registers. */
         addHRegUse(u, HRmWrite, hregX86_EAX());
         addHRegUse(u, HRmWrite, hregX86_ECX());
         addHRegUse(u, HRmWrite, hregX86_EDX());
         addHRegUse(u, HRmWrite, hregX86_XMM0());
         addHRegUse(u, HRmWrite, hregX86_XMM1());
         addHRegUse(u, HRmWrite, hregX86_XMM2());
         addHRegUse(u, HRmWrite, hregX86_XMM3());
         addHRegUse(u, HRmWrite, hregX86_XMM4());
         addHRegUse(u, HRmWrite, hregX86_XMM5());
         addHRegUse(u, HRmWrite, hregX86_XMM6());
         addHRegUse(u, HRmWrite, hregX86_XMM7());
         /* Now we have to state any parameter-carrying registers
            which might be read.  This depends on the regparmness. */
         switch (i->Xin.Call.regparms) {
            case 3: addHRegUse(u, HRmRead, hregX86_ECX()); /*fallthru*/
            case 2: addHRegUse(u, HRmRead, hregX86_EDX()); /*fallthru*/
            case 1: addHRegUse(u, HRmRead, hregX86_EAX()); break;
            case 0: break;
            default: vpanic("getRegUsage_X86Instr:Call:regparms");
         }
         /* Finally, there is the issue that the insn trashes a
            register because the literal target address has to be
            loaded into a register.  Fortunately, for the 0/1/2
            regparm case, we can use EAX, EDX and ECX respectively, so
            this does not cause any further damage.  For the 3-regparm
            case, we'll have to choose another register arbitrarily --
            since A, D and C are used for parameters -- and so we might
            as well choose EDI. */
         if (i->Xin.Call.regparms == 3)
            addHRegUse(u, HRmWrite, hregX86_EDI());
         /* Upshot of this is that the assembler really must observe
            the here-stated convention of which register to use as an
            address temporary, depending on the regparmness: 0==EAX,
            1==EDX, 2==ECX, 3==EDI. */
         return;
      case Xin_Goto:
         addRegUsage_X86RI(u, i->Xin.Goto.dst);
         addHRegUse(u, HRmWrite, hregX86_EAX()); /* used for next guest addr */
         addHRegUse(u, HRmWrite, hregX86_EDX()); /* used for dispatcher addr */
         if (i->Xin.Goto.jk != Ijk_Boring
             && i->Xin.Goto.jk != Ijk_Call
             && i->Xin.Goto.jk != Ijk_Ret)
            /* note, this is irrelevant since ebp is not actually
               available to the allocator.  But still .. */
            addHRegUse(u, HRmWrite, hregX86_EBP());
         return;
      case Xin_CMov32:
         addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
         addHRegUse(u, HRmModify, i->Xin.CMov32.dst);
         return;
      case Xin_LoadEX:
         addRegUsage_X86AMode(u, i->Xin.LoadEX.src);
         addHRegUse(u, HRmWrite, i->Xin.LoadEX.dst);
         return;
      case Xin_Store:
         addHRegUse(u, HRmRead, i->Xin.Store.src);
         addRegUsage_X86AMode(u, i->Xin.Store.dst);
         return;
      case Xin_Set32:
         addHRegUse(u, HRmWrite, i->Xin.Set32.dst);
         return;
      case Xin_Bsfr32:
         addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
         addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
         return;
      case Xin_MFence:
         return;
      case Xin_ACAS:
         addRegUsage_X86AMode(u, i->Xin.ACAS.addr);
         addHRegUse(u, HRmRead, hregX86_EBX());
         addHRegUse(u, HRmModify, hregX86_EAX());
         return;
      case Xin_DACAS:
         addRegUsage_X86AMode(u, i->Xin.DACAS.addr);
         addHRegUse(u, HRmRead, hregX86_ECX());
         addHRegUse(u, HRmRead, hregX86_EBX());
         addHRegUse(u, HRmModify, hregX86_EDX());
         addHRegUse(u, HRmModify, hregX86_EAX());
         return;
      case Xin_FpUnary:
         addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
         addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
         return;
      case Xin_FpBinary:
         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
         addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
         return;
      case Xin_FpLdSt:
         addRegUsage_X86AMode(u, i->Xin.FpLdSt.addr);
         addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
                       i->Xin.FpLdSt.reg);
         return;
      case Xin_FpLdStI:
         addRegUsage_X86AMode(u, i->Xin.FpLdStI.addr);
         addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
                       i->Xin.FpLdStI.reg);
         return;
      case Xin_Fp64to32:
         addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
         addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
         return;
      case Xin_FpCMov:
         addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
         addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
         return;
      case Xin_FpLdCW:
         addRegUsage_X86AMode(u, i->Xin.FpLdCW.addr);
         return;
      case Xin_FpStSW_AX:
         addHRegUse(u, HRmWrite, hregX86_EAX());
         return;
      case Xin_FpCmp:
         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcL);
         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcR);
         addHRegUse(u, HRmWrite, i->Xin.FpCmp.dst);
         addHRegUse(u, HRmWrite, hregX86_EAX());
         return;
      case Xin_SseLdSt:
         addRegUsage_X86AMode(u, i->Xin.SseLdSt.addr);
         addHRegUse(u, i->Xin.SseLdSt.isLoad ? HRmWrite : HRmRead,
                       i->Xin.SseLdSt.reg);
         return;
      case Xin_SseLdzLO:
         addRegUsage_X86AMode(u, i->Xin.SseLdzLO.addr);
         addHRegUse(u, HRmWrite, i->Xin.SseLdzLO.reg);
         return;
      case Xin_SseConst:
         addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
         return;
      case Xin_Sse32Fx4:
         vassert(i->Xin.Sse32Fx4.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse32Fx4.op == Xsse_RCPF
                         || i->Xin.Sse32Fx4.op == Xsse_RSQRTF
                         || i->Xin.Sse32Fx4.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse32Fx4.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse32Fx4.dst);
         return;
      case Xin_Sse32FLo:
         vassert(i->Xin.Sse32FLo.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse32FLo.op == Xsse_RCPF
                         || i->Xin.Sse32FLo.op == Xsse_RSQRTF
                         || i->Xin.Sse32FLo.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse32FLo.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse32FLo.dst);
         return;
      case Xin_Sse64Fx2:
         vassert(i->Xin.Sse64Fx2.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse64Fx2.op == Xsse_RCPF
                         || i->Xin.Sse64Fx2.op == Xsse_RSQRTF
                         || i->Xin.Sse64Fx2.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse64Fx2.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse64Fx2.dst);
         return;
      case Xin_Sse64FLo:
         vassert(i->Xin.Sse64FLo.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse64FLo.op == Xsse_RCPF
                         || i->Xin.Sse64FLo.op == Xsse_RSQRTF
                         || i->Xin.Sse64FLo.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse64FLo.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse64FLo.dst);
         return;
      case Xin_SseReRg:
         if (i->Xin.SseReRg.op == Xsse_XOR
             && i->Xin.SseReRg.src == i->Xin.SseReRg.dst) {
            /* reg-alloc needs to understand 'xor r,r' as a write of r */
            /* (as opposed to a rite of passage :-) */
            addHRegUse(u, HRmWrite, i->Xin.SseReRg.dst);
         } else {
            addHRegUse(u, HRmRead, i->Xin.SseReRg.src);
            addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV
                             ? HRmWrite : HRmModify,
                          i->Xin.SseReRg.dst);
         }
         return;
      case Xin_SseCMov:
         addHRegUse(u, HRmRead,   i->Xin.SseCMov.src);
         addHRegUse(u, HRmModify, i->Xin.SseCMov.dst);
         return;
      case Xin_SseShuf:
         addHRegUse(u, HRmRead,  i->Xin.SseShuf.src);
         addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
         return;
      default:
         ppX86Instr(i, False);
         vpanic("getRegUsage_X86Instr");
   }
}

/* local helper */
static void mapReg( HRegRemap* m, HReg* r )
{
   *r = lookupHRegRemap(m, *r);
}

void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 )
{
   vassert(mode64 == False);
   switch (i->tag) {
      case Xin_Alu32R:
         mapRegs_X86RMI(m, i->Xin.Alu32R.src);
         mapReg(m, &i->Xin.Alu32R.dst);
         return;
      case Xin_Alu32M:
         mapRegs_X86RI(m, i->Xin.Alu32M.src);
         mapRegs_X86AMode(m, i->Xin.Alu32M.dst);
         return;
      case Xin_Sh32:
         mapReg(m, &i->Xin.Sh32.dst);
         return;
      case Xin_Test32:
         mapRegs_X86RM(m, i->Xin.Test32.dst);
         return;
      case Xin_Unary32:
         mapReg(m, &i->Xin.Unary32.dst);
         return;
      case Xin_Lea32:
         mapRegs_X86AMode(m, i->Xin.Lea32.am);
         mapReg(m, &i->Xin.Lea32.dst);
         return;
      case Xin_MulL:
         mapRegs_X86RM(m, i->Xin.MulL.src);
         return;
      case Xin_Div:
         mapRegs_X86RM(m, i->Xin.Div.src);
         return;
      case Xin_Sh3232:
         mapReg(m, &i->Xin.Sh3232.src);
         mapReg(m, &i->Xin.Sh3232.dst);
         return;
      case Xin_Push:
         mapRegs_X86RMI(m, i->Xin.Push.src);
         return;
      case Xin_Call:
         return;
      case Xin_Goto:
         mapRegs_X86RI(m, i->Xin.Goto.dst);
         return;
      case Xin_CMov32:
         mapRegs_X86RM(m, i->Xin.CMov32.src);
         mapReg(m, &i->Xin.CMov32.dst);
         return;
      case Xin_LoadEX:
         mapRegs_X86AMode(m, i->Xin.LoadEX.src);
         mapReg(m, &i->Xin.LoadEX.dst);
         return;
      case Xin_Store:
         mapReg(m, &i->Xin.Store.src);
         mapRegs_X86AMode(m, i->Xin.Store.dst);
         return;
      case Xin_Set32:
         mapReg(m, &i->Xin.Set32.dst);
         return;
      case Xin_Bsfr32:
         mapReg(m, &i->Xin.Bsfr32.src);
         mapReg(m, &i->Xin.Bsfr32.dst);
         return;
      case Xin_MFence:
         return;
      case Xin_ACAS:
         mapRegs_X86AMode(m, i->Xin.ACAS.addr);
         return;
      case Xin_DACAS:
         mapRegs_X86AMode(m, i->Xin.DACAS.addr);
         return;
      case Xin_FpUnary:
         mapReg(m, &i->Xin.FpUnary.src);
         mapReg(m, &i->Xin.FpUnary.dst);
         return;
      case Xin_FpBinary:
         mapReg(m, &i->Xin.FpBinary.srcL);
         mapReg(m, &i->Xin.FpBinary.srcR);
         mapReg(m, &i->Xin.FpBinary.dst);
         return;
      case Xin_FpLdSt:
         mapRegs_X86AMode(m, i->Xin.FpLdSt.addr);
         mapReg(m, &i->Xin.FpLdSt.reg);
         return;
      case Xin_FpLdStI:
         mapRegs_X86AMode(m, i->Xin.FpLdStI.addr);
         mapReg(m, &i->Xin.FpLdStI.reg);
         return;
      case Xin_Fp64to32:
         mapReg(m, &i->Xin.Fp64to32.src);
         mapReg(m, &i->Xin.Fp64to32.dst);
         return;
      case Xin_FpCMov:
         mapReg(m, &i->Xin.FpCMov.src);
         mapReg(m, &i->Xin.FpCMov.dst);
         return;
      case Xin_FpLdCW:
         mapRegs_X86AMode(m, i->Xin.FpLdCW.addr);
         return;
      case Xin_FpStSW_AX:
         return;
      case Xin_FpCmp:
         mapReg(m, &i->Xin.FpCmp.srcL);
         mapReg(m, &i->Xin.FpCmp.srcR);
         mapReg(m, &i->Xin.FpCmp.dst);
         return;
      case Xin_SseConst:
         mapReg(m, &i->Xin.SseConst.dst);
         return;
      case Xin_SseLdSt:
         mapReg(m, &i->Xin.SseLdSt.reg);
         mapRegs_X86AMode(m, i->Xin.SseLdSt.addr);
         break;
      case Xin_SseLdzLO:
         mapReg(m, &i->Xin.SseLdzLO.reg);
         mapRegs_X86AMode(m, i->Xin.SseLdzLO.addr);
         break;
      case Xin_Sse32Fx4:
         mapReg(m, &i->Xin.Sse32Fx4.src);
         mapReg(m, &i->Xin.Sse32Fx4.dst);
         return;
      case Xin_Sse32FLo:
         mapReg(m, &i->Xin.Sse32FLo.src);
         mapReg(m, &i->Xin.Sse32FLo.dst);
         return;
      case Xin_Sse64Fx2:
         mapReg(m, &i->Xin.Sse64Fx2.src);
         mapReg(m, &i->Xin.Sse64Fx2.dst);
         return;
      case Xin_Sse64FLo:
         mapReg(m, &i->Xin.Sse64FLo.src);
         mapReg(m, &i->Xin.Sse64FLo.dst);
         return;
      case Xin_SseReRg:
         mapReg(m, &i->Xin.SseReRg.src);
         mapReg(m, &i->Xin.SseReRg.dst);
         return;
      case Xin_SseCMov:
         mapReg(m, &i->Xin.SseCMov.src);
         mapReg(m, &i->Xin.SseCMov.dst);
         return;
      case Xin_SseShuf:
         mapReg(m, &i->Xin.SseShuf.src);
         mapReg(m, &i->Xin.SseShuf.dst);
         return;
      default:
         ppX86Instr(i, mode64);
         vpanic("mapRegs_X86Instr");
   }
}
1574
1575/* Figure out if i represents a reg-reg move, and if so assign the
1576   source and destination to *src and *dst.  If in doubt say No.  Used
1577   by the register allocator to do move coalescing.
1578*/
1579Bool isMove_X86Instr ( X86Instr* i, HReg* src, HReg* dst )
1580{
1581   /* Moves between integer regs */
1582   if (i->tag == Xin_Alu32R) {
1583      if (i->Xin.Alu32R.op != Xalu_MOV)
1584         return False;
1585      if (i->Xin.Alu32R.src->tag != Xrmi_Reg)
1586         return False;
1587      *src = i->Xin.Alu32R.src->Xrmi.Reg.reg;
1588      *dst = i->Xin.Alu32R.dst;
1589      return True;
1590   }
1591   /* Moves between FP regs */
1592   if (i->tag == Xin_FpUnary) {
1593      if (i->Xin.FpUnary.op != Xfp_MOV)
1594         return False;
1595      *src = i->Xin.FpUnary.src;
1596      *dst = i->Xin.FpUnary.dst;
1597      return True;
1598   }
1599   if (i->tag == Xin_SseReRg) {
1600      if (i->Xin.SseReRg.op != Xsse_MOV)
1601         return False;
1602      *src = i->Xin.SseReRg.src;
1603      *dst = i->Xin.SseReRg.dst;
1604      return True;
1605   }
1606   return False;
1607}
1608
1609
1610/* Generate x86 spill/reload instructions under the direction of the
1611   register allocator.  Note it's critical these don't write the
1612   condition codes. */
1613
1614void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1615                    HReg rreg, Int offsetB, Bool mode64 )
1616{
1617   X86AMode* am;
1618   vassert(offsetB >= 0);
1619   vassert(!hregIsVirtual(rreg));
1620   vassert(mode64 == False);
1621   *i1 = *i2 = NULL;
1622   am = X86AMode_IR(offsetB, hregX86_EBP());
1623   switch (hregClass(rreg)) {
1624      case HRcInt32:
1625         *i1 = X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
1626         return;
1627      case HRcFlt64:
1628         *i1 = X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
1629         return;
1630      case HRcVec128:
1631         *i1 = X86Instr_SseLdSt ( False/*store*/, rreg, am );
1632         return;
1633      default:
1634         ppHRegClass(hregClass(rreg));
1635         vpanic("genSpill_X86: unimplemented regclass");
1636   }
1637}
1638
1639void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1640                     HReg rreg, Int offsetB, Bool mode64 )
1641{
1642   X86AMode* am;
1643   vassert(offsetB >= 0);
1644   vassert(!hregIsVirtual(rreg));
1645   vassert(mode64 == False);
1646   *i1 = *i2 = NULL;
1647   am = X86AMode_IR(offsetB, hregX86_EBP());
1648   switch (hregClass(rreg)) {
1649      case HRcInt32:
1650         *i1 = X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
1651         return;
1652      case HRcFlt64:
1653         *i1 = X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
1654         return;
1655      case HRcVec128:
1656         *i1 = X86Instr_SseLdSt ( True/*load*/, rreg, am );
1657         return;
1658      default:
1659         ppHRegClass(hregClass(rreg));
1660         vpanic("genReload_X86: unimplemented regclass");
1661   }
1662}
1663
/* The given instruction reads the specified vreg exactly once, and
   that vreg is currently located at the given spill offset.  If
   possible, return a variant of the instruction which instead
   references the spill slot directly. */
1668
1669X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
1670{
1671   vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
1672
1673   /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
1674      Convert to: src=RMI_Mem, dst=Reg
1675   */
1676   if (i->tag == Xin_Alu32R
1677       && (i->Xin.Alu32R.op == Xalu_MOV || i->Xin.Alu32R.op == Xalu_OR
1678           || i->Xin.Alu32R.op == Xalu_XOR)
1679       && i->Xin.Alu32R.src->tag == Xrmi_Reg
1680       && i->Xin.Alu32R.src->Xrmi.Reg.reg == vreg) {
1681      vassert(i->Xin.Alu32R.dst != vreg);
1682      return X86Instr_Alu32R(
1683                i->Xin.Alu32R.op,
1684                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP())),
1685                i->Xin.Alu32R.dst
1686             );
1687   }
1688
1689   /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
1690      Convert to: src=RI_Imm, dst=Mem
1691   */
1692   if (i->tag == Xin_Alu32R
1693       && (i->Xin.Alu32R.op == Xalu_CMP)
1694       && i->Xin.Alu32R.src->tag == Xrmi_Imm
1695       && i->Xin.Alu32R.dst == vreg) {
1696      return X86Instr_Alu32M(
1697                i->Xin.Alu32R.op,
                X86RI_Imm( i->Xin.Alu32R.src->Xrmi.Imm.imm32 ),
1699                X86AMode_IR( spill_off, hregX86_EBP())
1700             );
1701   }
1702
1703   /* Deal with form: Push(RMI_Reg)
1704      Convert to: Push(RMI_Mem)
1705   */
1706   if (i->tag == Xin_Push
1707       && i->Xin.Push.src->tag == Xrmi_Reg
1708       && i->Xin.Push.src->Xrmi.Reg.reg == vreg) {
1709      return X86Instr_Push(
1710                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP()))
1711             );
1712   }
1713
1714   /* Deal with form: CMov32(src=RM_Reg, dst) where vreg == src
1715      Convert to CMov32(RM_Mem, dst) */
1716   if (i->tag == Xin_CMov32
1717       && i->Xin.CMov32.src->tag == Xrm_Reg
1718       && i->Xin.CMov32.src->Xrm.Reg.reg == vreg) {
1719      vassert(i->Xin.CMov32.dst != vreg);
1720      return X86Instr_CMov32(
1721                i->Xin.CMov32.cond,
1722                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() )),
1723                i->Xin.CMov32.dst
1724             );
1725   }
1726
1727   /* Deal with form: Test32(imm,RM_Reg vreg) -> Test32(imm,amode) */
1728   if (i->tag == Xin_Test32
1729       && i->Xin.Test32.dst->tag == Xrm_Reg
1730       && i->Xin.Test32.dst->Xrm.Reg.reg == vreg) {
1731      return X86Instr_Test32(
1732                i->Xin.Test32.imm32,
1733                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() ) )
1734             );
1735   }
1736
1737   return NULL;
1738}
1739
1740
1741/* --------- The x86 assembler (bleh.) --------- */
1742
1743static UChar iregNo ( HReg r )
1744{
1745   UInt n;
1746   vassert(hregClass(r) == HRcInt32);
1747   vassert(!hregIsVirtual(r));
1748   n = hregNumber(r);
1749   vassert(n <= 7);
1750   return toUChar(n);
1751}
1752
1753static UInt fregNo ( HReg r )
1754{
1755   UInt n;
1756   vassert(hregClass(r) == HRcFlt64);
1757   vassert(!hregIsVirtual(r));
1758   n = hregNumber(r);
1759   vassert(n <= 5);
1760   return n;
1761}
1762
1763static UInt vregNo ( HReg r )
1764{
1765   UInt n;
1766   vassert(hregClass(r) == HRcVec128);
1767   vassert(!hregIsVirtual(r));
1768   n = hregNumber(r);
1769   vassert(n <= 7);
1770   return n;
1771}
1772
1773static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
1774{
1775   return toUChar( ((mod & 3) << 6)
1776                   | ((reg & 7) << 3)
1777                   | (regmem & 7) );
1778}
1779
1780static UChar mkSIB ( Int shift, Int regindex, Int regbase )
1781{
1782   return toUChar( ((shift & 3) << 6)
1783                   | ((regindex & 7) << 3)
1784                   | (regbase & 7) );
1785}
1786
1787static UChar* emit32 ( UChar* p, UInt w32 )
1788{
1789   *p++ = toUChar( w32        & 0x000000FF);
1790   *p++ = toUChar((w32 >>  8) & 0x000000FF);
1791   *p++ = toUChar((w32 >> 16) & 0x000000FF);
1792   *p++ = toUChar((w32 >> 24) & 0x000000FF);
1793   return p;
1794}
1795
1796/* Does a sign-extend of the lowest 8 bits give
1797   the original number? */
1798static Bool fits8bits ( UInt w32 )
1799{
1800   Int i32 = (Int)w32;
1801   return toBool(i32 == ((i32 << 24) >> 24));
1802}
1803
1804
1805/* Forming mod-reg-rm bytes and scale-index-base bytes.
1806
1807     greg,  0(ereg)    |  ereg != ESP && ereg != EBP
1808                       =  00 greg ereg
1809
1810     greg,  d8(ereg)   |  ereg != ESP
1811                       =  01 greg ereg, d8
1812
1813     greg,  d32(ereg)  |  ereg != ESP
1814                       =  10 greg ereg, d32
1815
1816     greg,  d8(%esp)   =  01 greg 100, 0x24, d8
1817
1818     -----------------------------------------------
1819
1820     greg,  d8(base,index,scale)
1821               |  index != ESP
1822               =  01 greg 100, scale index base, d8
1823
1824     greg,  d32(base,index,scale)
1825               |  index != ESP
1826               =  10 greg 100, scale index base, d32
1827*/
1828static UChar* doAMode_M ( UChar* p, HReg greg, X86AMode* am )
1829{
1830   if (am->tag == Xam_IR) {
1831      if (am->Xam.IR.imm == 0
1832          && am->Xam.IR.reg != hregX86_ESP()
1833          && am->Xam.IR.reg != hregX86_EBP() ) {
1834         *p++ = mkModRegRM(0, iregNo(greg), iregNo(am->Xam.IR.reg));
1835         return p;
1836      }
1837      if (fits8bits(am->Xam.IR.imm)
1838          && am->Xam.IR.reg != hregX86_ESP()) {
1839         *p++ = mkModRegRM(1, iregNo(greg), iregNo(am->Xam.IR.reg));
1840         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1841         return p;
1842      }
1843      if (am->Xam.IR.reg != hregX86_ESP()) {
1844         *p++ = mkModRegRM(2, iregNo(greg), iregNo(am->Xam.IR.reg));
1845         p = emit32(p, am->Xam.IR.imm);
1846         return p;
1847      }
1848      if (am->Xam.IR.reg == hregX86_ESP()
1849          && fits8bits(am->Xam.IR.imm)) {
         *p++ = mkModRegRM(1, iregNo(greg), 4);
1851         *p++ = 0x24;
1852         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1853         return p;
1854      }
1855      ppX86AMode(am);
1856      vpanic("doAMode_M: can't emit amode IR");
1857      /*NOTREACHED*/
1858   }
1859   if (am->tag == Xam_IRRS) {
1860      if (fits8bits(am->Xam.IRRS.imm)
1861          && am->Xam.IRRS.index != hregX86_ESP()) {
1862         *p++ = mkModRegRM(1, iregNo(greg), 4);
1863         *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
1864                                          am->Xam.IRRS.base);
1865         *p++ = toUChar(am->Xam.IRRS.imm & 0xFF);
1866         return p;
1867      }
1868      if (am->Xam.IRRS.index != hregX86_ESP()) {
1869         *p++ = mkModRegRM(2, iregNo(greg), 4);
1870         *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
1871                                          am->Xam.IRRS.base);
1872         p = emit32(p, am->Xam.IRRS.imm);
1873         return p;
1874      }
1875      ppX86AMode(am);
1876      vpanic("doAMode_M: can't emit amode IRRS");
1877      /*NOTREACHED*/
1878   }
1879   vpanic("doAMode_M: unknown amode");
1880   /*NOTREACHED*/
1881}
1882
1883
1884/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
1885static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
1886{
1887   *p++ = mkModRegRM(3, iregNo(greg), iregNo(ereg));
1888   return p;
1889}
1890
1891
1892/* Emit ffree %st(7) */
1893static UChar* do_ffree_st7 ( UChar* p )
1894{
1895   *p++ = 0xDD;
1896   *p++ = 0xC7;
1897   return p;
1898}
1899
1900/* Emit fstp %st(i), 1 <= i <= 7 */
1901static UChar* do_fstp_st ( UChar* p, Int i )
1902{
1903   vassert(1 <= i && i <= 7);
1904   *p++ = 0xDD;
1905   *p++ = toUChar(0xD8+i);
1906   return p;
1907}
1908
1909/* Emit fld %st(i), 0 <= i <= 6 */
1910static UChar* do_fld_st ( UChar* p, Int i )
1911{
1912   vassert(0 <= i && i <= 6);
1913   *p++ = 0xD9;
1914   *p++ = toUChar(0xC0+i);
1915   return p;
1916}
1917
1918/* Emit f<op> %st(0) */
1919static UChar* do_fop1_st ( UChar* p, X86FpOp op )
1920{
1921   switch (op) {
1922      case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
1923      case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
1924      case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
1925      case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
1926      case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
1927      case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
1928      case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
1929      case Xfp_MOV:    break;
1930      case Xfp_TAN:    p = do_ffree_st7(p); /* since fptan pushes 1.0 */
1931                       *p++ = 0xD9; *p++ = 0xF2; /* fptan */
1932                       *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
1933                       break;
1934      default: vpanic("do_fop1_st: unknown op");
1935   }
1936   return p;
1937}
1938
1939/* Emit f<op> %st(i), 1 <= i <= 5 */
1940static UChar* do_fop2_st ( UChar* p, X86FpOp op, Int i )
1941{
1942#  define fake(_n) mkHReg((_n), HRcInt32, False)
1943   Int subopc;
1944   switch (op) {
1945      case Xfp_ADD: subopc = 0; break;
1946      case Xfp_SUB: subopc = 4; break;
1947      case Xfp_MUL: subopc = 1; break;
1948      case Xfp_DIV: subopc = 6; break;
1949      default: vpanic("do_fop2_st: unknown op");
1950   }
1951   *p++ = 0xD8;
1952   p    = doAMode_R(p, fake(subopc), fake(i));
1953   return p;
1954#  undef fake
1955}
1956
/* Push a 32-bit word on the stack.  The word depends on tags[3:0]:
   each of its four bytes is 0x00 or 0xFF according to the
   corresponding bit of tags. */
1960static UChar* push_word_from_tags ( UChar* p, UShort tags )
1961{
1962   UInt w;
1963   vassert(0 == (tags & ~0xF));
1964   if (tags == 0) {
1965      /* pushl $0x00000000 */
1966      *p++ = 0x6A;
1967      *p++ = 0x00;
1968   }
1969   else
1970   /* pushl $0xFFFFFFFF */
1971   if (tags == 0xF) {
1972      *p++ = 0x6A;
1973      *p++ = 0xFF;
1974   } else {
1975      vassert(0); /* awaiting test case */
1976      w = 0;
1977      if (tags & 1) w |= 0x000000FF;
1978      if (tags & 2) w |= 0x0000FF00;
1979      if (tags & 4) w |= 0x00FF0000;
1980      if (tags & 8) w |= 0xFF000000;
1981      *p++ = 0x68;
1982      p = emit32(p, w);
1983   }
1984   return p;
1985}
1986
1987/* Emit an instruction into buf and return the number of bytes used.
1988   Note that buf is not the insn's final place, and therefore it is
1989   imperative to emit position-independent code. */
1990
1991Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i,
1992                    Bool mode64,
1993                    void* dispatch_unassisted,
1994                    void* dispatch_assisted )
1995{
   UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   UInt xtra;
1999   UChar* p = &buf[0];
2000   UChar* ptmp;
2001   vassert(nbuf >= 32);
2002   vassert(mode64 == False);
2003
   /* Wrap an integer as an int register, for use when assembling
      GrpN insns, in which the greg field is used as a sub-opcode
      and does not really contain a register. */
2007#  define fake(_n) mkHReg((_n), HRcInt32, False)
2008
2009   /* vex_printf("asm  ");ppX86Instr(i, mode64); vex_printf("\n"); */
2010
2011   switch (i->tag) {
2012
2013   case Xin_Alu32R:
2014      /* Deal specially with MOV */
2015      if (i->Xin.Alu32R.op == Xalu_MOV) {
2016         switch (i->Xin.Alu32R.src->tag) {
2017            case Xrmi_Imm:
2018               *p++ = toUChar(0xB8 + iregNo(i->Xin.Alu32R.dst));
2019               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2020               goto done;
2021            case Xrmi_Reg:
2022               *p++ = 0x89;
2023               p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2024                                i->Xin.Alu32R.dst);
2025               goto done;
2026            case Xrmi_Mem:
2027               *p++ = 0x8B;
2028               p = doAMode_M(p, i->Xin.Alu32R.dst,
2029                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2030               goto done;
2031            default:
2032               goto bad;
2033         }
2034      }
2035      /* MUL */
2036      if (i->Xin.Alu32R.op == Xalu_MUL) {
2037         switch (i->Xin.Alu32R.src->tag) {
2038            case Xrmi_Reg:
2039               *p++ = 0x0F;
2040               *p++ = 0xAF;
2041               p = doAMode_R(p, i->Xin.Alu32R.dst,
2042                                i->Xin.Alu32R.src->Xrmi.Reg.reg);
2043               goto done;
2044            case Xrmi_Mem:
2045               *p++ = 0x0F;
2046               *p++ = 0xAF;
2047               p = doAMode_M(p, i->Xin.Alu32R.dst,
2048                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2049               goto done;
2050            case Xrmi_Imm:
2051               if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2052                  *p++ = 0x6B;
2053                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2054                  *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2055               } else {
2056                  *p++ = 0x69;
2057                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2058                  p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2059               }
2060               goto done;
2061            default:
2062               goto bad;
2063         }
2064      }
2065      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2066      opc = opc_rr = subopc_imm = opc_imma = 0;
2067      switch (i->Xin.Alu32R.op) {
2068         case Xalu_ADC: opc = 0x13; opc_rr = 0x11;
2069                        subopc_imm = 2; opc_imma = 0x15; break;
2070         case Xalu_ADD: opc = 0x03; opc_rr = 0x01;
2071                        subopc_imm = 0; opc_imma = 0x05; break;
2072         case Xalu_SUB: opc = 0x2B; opc_rr = 0x29;
2073                        subopc_imm = 5; opc_imma = 0x2D; break;
2074         case Xalu_SBB: opc = 0x1B; opc_rr = 0x19;
2075                        subopc_imm = 3; opc_imma = 0x1D; break;
2076         case Xalu_AND: opc = 0x23; opc_rr = 0x21;
2077                        subopc_imm = 4; opc_imma = 0x25; break;
2078         case Xalu_XOR: opc = 0x33; opc_rr = 0x31;
2079                        subopc_imm = 6; opc_imma = 0x35; break;
2080         case Xalu_OR:  opc = 0x0B; opc_rr = 0x09;
2081                        subopc_imm = 1; opc_imma = 0x0D; break;
2082         case Xalu_CMP: opc = 0x3B; opc_rr = 0x39;
2083                        subopc_imm = 7; opc_imma = 0x3D; break;
2084         default: goto bad;
2085      }
2086      switch (i->Xin.Alu32R.src->tag) {
2087         case Xrmi_Imm:
2088            if (i->Xin.Alu32R.dst == hregX86_EAX()
2089                && !fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2090               *p++ = toUChar(opc_imma);
2091               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2092            } else
2093            if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2094               *p++ = 0x83;
2095               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2096               *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2097            } else {
2098               *p++ = 0x81;
2099               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2100               p    = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2101            }
2102            goto done;
2103         case Xrmi_Reg:
2104            *p++ = toUChar(opc_rr);
2105            p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2106                             i->Xin.Alu32R.dst);
2107            goto done;
2108         case Xrmi_Mem:
2109            *p++ = toUChar(opc);
2110            p = doAMode_M(p, i->Xin.Alu32R.dst,
2111                             i->Xin.Alu32R.src->Xrmi.Mem.am);
2112            goto done;
2113         default:
2114            goto bad;
2115      }
2116      break;
2117
2118   case Xin_Alu32M:
2119      /* Deal specially with MOV */
2120      if (i->Xin.Alu32M.op == Xalu_MOV) {
2121         switch (i->Xin.Alu32M.src->tag) {
2122            case Xri_Reg:
2123               *p++ = 0x89;
2124               p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2125                                i->Xin.Alu32M.dst);
2126               goto done;
2127            case Xri_Imm:
2128               *p++ = 0xC7;
2129               p = doAMode_M(p, fake(0), i->Xin.Alu32M.dst);
2130               p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2131               goto done;
2132            default:
2133               goto bad;
2134         }
2135      }
      /* ADD/SUB/CMP only; the other ALU ops (and MUL) are not
         allowed here. */
2138      opc = subopc_imm = opc_imma = 0;
2139      switch (i->Xin.Alu32M.op) {
2140         case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
2141         case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
2142         case Xalu_CMP: opc = 0x39; subopc_imm = 7; break;
2143         default: goto bad;
2144      }
2145      switch (i->Xin.Alu32M.src->tag) {
2146         case Xri_Reg:
2147            *p++ = toUChar(opc);
2148            p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2149                             i->Xin.Alu32M.dst);
2150            goto done;
2151         case Xri_Imm:
2152            if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
2153               *p++ = 0x83;
2154               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2155               *p++ = toUChar(0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32);
2156               goto done;
2157            } else {
2158               *p++ = 0x81;
2159               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2160               p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2161               goto done;
2162            }
2163         default:
2164            goto bad;
2165      }
2166      break;
2167
2168   case Xin_Sh32:
2169      opc_cl = opc_imm = subopc = 0;
2170      switch (i->Xin.Sh32.op) {
2171         case Xsh_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2172         case Xsh_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2173         case Xsh_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2174         default: goto bad;
2175      }
2176      if (i->Xin.Sh32.src == 0) {
2177         *p++ = toUChar(opc_cl);
2178         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2179      } else {
2180         *p++ = toUChar(opc_imm);
2181         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2182         *p++ = (UChar)(i->Xin.Sh32.src);
2183      }
2184      goto done;
2185
2186   case Xin_Test32:
2187      if (i->Xin.Test32.dst->tag == Xrm_Reg) {
2188         /* testl $imm32, %reg */
2189         *p++ = 0xF7;
2190         p = doAMode_R(p, fake(0), i->Xin.Test32.dst->Xrm.Reg.reg);
2191         p = emit32(p, i->Xin.Test32.imm32);
2192         goto done;
2193      } else {
2194         /* testl $imm32, amode */
2195         *p++ = 0xF7;
2196         p = doAMode_M(p, fake(0), i->Xin.Test32.dst->Xrm.Mem.am);
2197         p = emit32(p, i->Xin.Test32.imm32);
2198         goto done;
2199      }
2200
2201   case Xin_Unary32:
2202      if (i->Xin.Unary32.op == Xun_NOT) {
2203         *p++ = 0xF7;
2204         p = doAMode_R(p, fake(2), i->Xin.Unary32.dst);
2205         goto done;
2206      }
2207      if (i->Xin.Unary32.op == Xun_NEG) {
2208         *p++ = 0xF7;
2209         p = doAMode_R(p, fake(3), i->Xin.Unary32.dst);
2210         goto done;
2211      }
2212      break;
2213
2214   case Xin_Lea32:
2215      *p++ = 0x8D;
2216      p = doAMode_M(p, i->Xin.Lea32.dst, i->Xin.Lea32.am);
2217      goto done;
2218
2219   case Xin_MulL:
2220      subopc = i->Xin.MulL.syned ? 5 : 4;
2221      *p++ = 0xF7;
2222      switch (i->Xin.MulL.src->tag)  {
2223         case Xrm_Mem:
2224            p = doAMode_M(p, fake(subopc),
2225                             i->Xin.MulL.src->Xrm.Mem.am);
2226            goto done;
2227         case Xrm_Reg:
2228            p = doAMode_R(p, fake(subopc),
2229                             i->Xin.MulL.src->Xrm.Reg.reg);
2230            goto done;
2231         default:
2232            goto bad;
2233      }
2234      break;
2235
2236   case Xin_Div:
2237      subopc = i->Xin.Div.syned ? 7 : 6;
2238      *p++ = 0xF7;
2239      switch (i->Xin.Div.src->tag)  {
2240         case Xrm_Mem:
2241            p = doAMode_M(p, fake(subopc),
2242                             i->Xin.Div.src->Xrm.Mem.am);
2243            goto done;
2244         case Xrm_Reg:
2245            p = doAMode_R(p, fake(subopc),
2246                             i->Xin.Div.src->Xrm.Reg.reg);
2247            goto done;
2248         default:
2249            goto bad;
2250      }
2251      break;
2252
2253   case Xin_Sh3232:
2254      vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
2255      if (i->Xin.Sh3232.amt == 0) {
2256         /* shldl/shrdl by %cl */
2257         *p++ = 0x0F;
2258         if (i->Xin.Sh3232.op == Xsh_SHL) {
2259            *p++ = 0xA5;
2260         } else {
2261            *p++ = 0xAD;
2262         }
2263         p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
2264         goto done;
2265      }
2266      break;
2267
2268   case Xin_Push:
2269      switch (i->Xin.Push.src->tag) {
2270         case Xrmi_Mem:
2271            *p++ = 0xFF;
2272            p = doAMode_M(p, fake(6), i->Xin.Push.src->Xrmi.Mem.am);
2273            goto done;
2274         case Xrmi_Imm:
2275            *p++ = 0x68;
2276            p = emit32(p, i->Xin.Push.src->Xrmi.Imm.imm32);
2277            goto done;
2278         case Xrmi_Reg:
2279            *p++ = toUChar(0x50 + iregNo(i->Xin.Push.src->Xrmi.Reg.reg));
2280            goto done;
         default:
2282            goto bad;
2283      }
2284
2285   case Xin_Call:
2286      /* See detailed comment for Xin_Call in getRegUsage_X86Instr above
2287         for explanation of this. */
2288      switch (i->Xin.Call.regparms) {
2289         case 0: irno = iregNo(hregX86_EAX()); break;
2290         case 1: irno = iregNo(hregX86_EDX()); break;
2291         case 2: irno = iregNo(hregX86_ECX()); break;
2292         case 3: irno = iregNo(hregX86_EDI()); break;
         default: vpanic("emit_X86Instr:call:regparms");
2294      }
2295      /* jump over the following two insns if the condition does not
2296         hold */
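      /* (Xcc_ values mirror the x86 condition-code encoding, so
         cond ^ 1 is the negated condition: e.g. for cond == Xcc_Z the
         bytes emitted are 75 07, i.e. "jnz .+7".) */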
2297      if (i->Xin.Call.cond != Xcc_ALWAYS) {
2298         *p++ = toUChar(0x70 + (0xF & (i->Xin.Call.cond ^ 1)));
2299         *p++ = 0x07; /* 7 bytes in the next two insns */
2300      }
2301      /* movl $target, %tmp */
2302      *p++ = toUChar(0xB8 + irno);
2303      p = emit32(p, i->Xin.Call.target);
2304      /* call *%tmp */
2305      *p++ = 0xFF;
2306      *p++ = toUChar(0xD0 + irno);
2307      goto done;
2308
2309   case Xin_Goto: {
2310      void* dispatch_to_use = NULL;
2311      vassert(dispatch_unassisted != NULL);
2312      vassert(dispatch_assisted != NULL);
2313
2314      /* Use ptmp for backpatching conditional jumps. */
2315      ptmp = NULL;
2316
      /* First off, if this is conditional, create a conditional
         jump over the rest of it. */
2319      if (i->Xin.Goto.cond != Xcc_ALWAYS) {
2320         /* jmp fwds if !condition */
2321         *p++ = toUChar(0x70 + (0xF & (i->Xin.Goto.cond ^ 1)));
2322         ptmp = p; /* fill in this bit later */
2323         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2324      }
2325
      /* If this is a non-boring jump kind, set %ebp (the guest state
         pointer) appropriately.  Also, decide which dispatcher we
         need to use. */
2329      dispatch_to_use = dispatch_assisted;
2330
2331      /* movl $magic_number, %ebp */
2332      switch (i->Xin.Goto.jk) {
2333         case Ijk_ClientReq:
2334            *p++ = 0xBD;
2335            p = emit32(p, VEX_TRC_JMP_CLIENTREQ); break;
2336         case Ijk_Sys_int128:
2337            *p++ = 0xBD;
2338            p = emit32(p, VEX_TRC_JMP_SYS_INT128); break;
2339         case Ijk_Sys_int129:
2340            *p++ = 0xBD;
2341            p = emit32(p, VEX_TRC_JMP_SYS_INT129); break;
2342         case Ijk_Sys_int130:
2343            *p++ = 0xBD;
2344            p = emit32(p, VEX_TRC_JMP_SYS_INT130); break;
2345         case Ijk_Yield:
2346            *p++ = 0xBD;
2347            p = emit32(p, VEX_TRC_JMP_YIELD); break;
2348         case Ijk_YieldNoRedir:
2349            *p++ = 0xBD;
2350            p = emit32(p, VEX_TRC_JMP_YIELD_NOREDIR); break;
2351         case Ijk_EmWarn:
2352            *p++ = 0xBD;
2353            p = emit32(p, VEX_TRC_JMP_EMWARN); break;
2354         case Ijk_MapFail:
2355            *p++ = 0xBD;
2356            p = emit32(p, VEX_TRC_JMP_MAPFAIL); break;
2357         case Ijk_NoDecode:
2358            *p++ = 0xBD;
2359            p = emit32(p, VEX_TRC_JMP_NODECODE); break;
2360         case Ijk_TInval:
2361            *p++ = 0xBD;
2362            p = emit32(p, VEX_TRC_JMP_TINVAL); break;
2363         case Ijk_NoRedir:
2364            *p++ = 0xBD;
2365            p = emit32(p, VEX_TRC_JMP_NOREDIR); break;
2366         case Ijk_Sys_sysenter:
2367            *p++ = 0xBD;
2368            p = emit32(p, VEX_TRC_JMP_SYS_SYSENTER); break;
2369         case Ijk_SigTRAP:
2370            *p++ = 0xBD;
2371            p = emit32(p, VEX_TRC_JMP_SIGTRAP); break;
2372         case Ijk_SigSEGV:
2373            *p++ = 0xBD;
2374            p = emit32(p, VEX_TRC_JMP_SIGSEGV); break;
         case Ijk_Ret:
         case Ijk_Call:
2377         case Ijk_Boring:
2378            dispatch_to_use = dispatch_unassisted;
2379            break;
2380         default:
2381            ppIRJumpKind(i->Xin.Goto.jk);
2382            vpanic("emit_X86Instr.Xin_Goto: unknown jump kind");
2383      }
2384
2385      /* Get the destination address into %eax */
2386      if (i->Xin.Goto.dst->tag == Xri_Imm) {
2387         /* movl $immediate, %eax */
2388         *p++ = 0xB8;
2389         p = emit32(p, i->Xin.Goto.dst->Xri.Imm.imm32);
2390      } else {
2391         vassert(i->Xin.Goto.dst->tag == Xri_Reg);
2392         /* movl %reg, %eax */
2393         if (i->Xin.Goto.dst->Xri.Reg.reg != hregX86_EAX()) {
2394            *p++ = 0x89;
2395            p = doAMode_R(p, i->Xin.Goto.dst->Xri.Reg.reg, hregX86_EAX());
2396         }
2397      }
2398
2399      /* Get the dispatcher address into %edx.  This has to happen
2400         after the load of %eax since %edx might be carrying the value
2401         destined for %eax immediately prior to this Xin_Goto. */
2402      vassert(sizeof(UInt) == sizeof(void*));
2403      vassert(dispatch_to_use != NULL);
2404      /* movl $imm32, %edx */
2405      *p++ = 0xBA;
2406      p = emit32(p, (UInt)Ptr_to_ULong(dispatch_to_use));
2407
2408      /* jmp *%edx */
2409      *p++ = 0xFF;
2410      *p++ = 0xE2;
2411
2412      /* Fix up the conditional jump, if there was one. */
2413      if (i->Xin.Goto.cond != Xcc_ALWAYS) {
         Int delta = p - ptmp;
         vassert(delta > 0 && delta < 20);
         *ptmp = toUChar(delta-1);
2417      }
2418      goto done;
2419   }
2420
2421   case Xin_CMov32:
2422      vassert(i->Xin.CMov32.cond != Xcc_ALWAYS);
2423
2424      /* This generates cmov, which is illegal on P54/P55. */
2425      /*
2426      *p++ = 0x0F;
2427      *p++ = toUChar(0x40 + (0xF & i->Xin.CMov32.cond));
2428      if (i->Xin.CMov32.src->tag == Xrm_Reg) {
2429         p = doAMode_R(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Reg.reg);
2430         goto done;
2431      }
2432      if (i->Xin.CMov32.src->tag == Xrm_Mem) {
2433         p = doAMode_M(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Mem.am);
2434         goto done;
2435      }
2436      */
2437
2438      /* Alternative version which works on any x86 variant. */
2439      /* jmp fwds if !condition */
2440      *p++ = toUChar(0x70 + (i->Xin.CMov32.cond ^ 1));
2441      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2442      ptmp = p;
2443
2444      switch (i->Xin.CMov32.src->tag) {
2445         case Xrm_Reg:
2446            /* Big sigh.  This is movl E -> G ... */
2447            *p++ = 0x89;
2448            p = doAMode_R(p, i->Xin.CMov32.src->Xrm.Reg.reg,
2449                             i->Xin.CMov32.dst);
2450
2451            break;
2452         case Xrm_Mem:
2453            /* ... whereas this is movl G -> E.  That's why the args
2454               to doAMode_R appear to be the wrong way round in the
2455               Xrm_Reg case. */
2456            *p++ = 0x8B;
2457            p = doAMode_M(p, i->Xin.CMov32.dst,
2458                             i->Xin.CMov32.src->Xrm.Mem.am);
2459            break;
2460         default:
2461            goto bad;
2462      }
2463      /* Fill in the jump offset. */
2464      *(ptmp-1) = toUChar(p - ptmp);
      goto done;
2468
2469   case Xin_LoadEX:
2470      if (i->Xin.LoadEX.szSmall == 1 && !i->Xin.LoadEX.syned) {
2471         /* movzbl */
2472         *p++ = 0x0F;
2473         *p++ = 0xB6;
2474         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2475         goto done;
2476      }
2477      if (i->Xin.LoadEX.szSmall == 2 && !i->Xin.LoadEX.syned) {
2478         /* movzwl */
2479         *p++ = 0x0F;
2480         *p++ = 0xB7;
2481         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2482         goto done;
2483      }
2484      if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
2485         /* movsbl */
2486         *p++ = 0x0F;
2487         *p++ = 0xBE;
2488         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2489         goto done;
2490      }
2491      break;
2492
2493   case Xin_Set32:
2494      /* Make the destination register be 1 or 0, depending on whether
2495         the relevant condition holds.  We have to dodge and weave
2496         when the destination is %esi or %edi as we cannot directly
2497         emit the native 'setb %reg' for those.  Further complication:
2498         the top 24 bits of the destination should be forced to zero,
2499         but doing 'xor %r,%r' kills the flag(s) we are about to read.
         Sigh.  So start off by moving $0 into the dest. */
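      /* For example, with dst = %edi and cond = Xcc_Z, the swap path
         below emits 97 B8 00 00 00 00 0F 94 C0 97, that is,
         xchgl %eax,%edi ; movl $0,%eax ; setz %al ; xchgl %eax,%edi. */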
2501
2502      /* Do we need to swap in %eax? */
2503      if (iregNo(i->Xin.Set32.dst) >= 4) {
2504         /* xchg %eax, %dst */
2505         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2506         /* movl $0, %eax */
         *p++ = toUChar(0xB8 + iregNo(hregX86_EAX()));
2508         p = emit32(p, 0);
2509         /* setb lo8(%eax) */
2510         *p++ = 0x0F;
2511         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2512         p = doAMode_R(p, fake(0), hregX86_EAX());
2513         /* xchg %eax, %dst */
2514         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2515      } else {
2516         /* movl $0, %dst */
2517         *p++ = toUChar(0xB8 + iregNo(i->Xin.Set32.dst));
2518         p = emit32(p, 0);
2519         /* setb lo8(%dst) */
2520         *p++ = 0x0F;
2521         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2522         p = doAMode_R(p, fake(0), i->Xin.Set32.dst);
2523      }
2524      goto done;
2525
2526   case Xin_Bsfr32:
2527      *p++ = 0x0F;
2528      if (i->Xin.Bsfr32.isFwds) {
2529         *p++ = 0xBC;
2530      } else {
2531         *p++ = 0xBD;
2532      }
2533      p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
2534      goto done;
2535
2536   case Xin_MFence:
2537      /* see comment in hdefs.h re this insn */
2538      if (0) vex_printf("EMIT FENCE\n");
2539      if (i->Xin.MFence.hwcaps & (VEX_HWCAPS_X86_SSE3
2540                                  |VEX_HWCAPS_X86_SSE2)) {
2541         /* mfence */
2542         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
2543         goto done;
2544      }
2545      if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_SSE1) {
2546         /* sfence */
2547         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
2548         /* lock addl $0,0(%esp) */
2549         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2550         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2551         goto done;
2552      }
2553      if (i->Xin.MFence.hwcaps == 0/*baseline, no SSE*/) {
2554         /* lock addl $0,0(%esp) */
2555         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2556         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2557         goto done;
2558      }
2559      vpanic("emit_X86Instr:mfence:hwcaps");
2560      /*NOTREACHED*/
2561      break;
2562
2563   case Xin_ACAS:
2564      /* lock */
2565      *p++ = 0xF0;
2566      /* cmpxchg{b,w,l} %ebx,mem.  Expected-value in %eax, new value
2567         in %ebx.  The new-value register is hardwired to be %ebx
2568         since letting it be any integer register gives the problem
         that %sil and %dil are unaddressable on x86 and hence we
2570         would have to resort to the same kind of trickery as with
2571         byte-sized Xin.Store, just below.  Given that this isn't
2572         performance critical, it is simpler just to force the
2573         register operand to %ebx (could equally be %ecx or %edx).
2574         (Although %ebx is more consistent with cmpxchg8b.) */
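      /* For example, with sz == 4 and addr == (%esi), the bytes
         emitted are F0 0F B1 1E, i.e. "lock cmpxchgl %ebx, (%esi)". */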
2575      if (i->Xin.ACAS.sz == 2) *p++ = 0x66;
2576      *p++ = 0x0F;
2577      if (i->Xin.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
2578      p = doAMode_M(p, hregX86_EBX(), i->Xin.ACAS.addr);
2579      goto done;
2580
2581   case Xin_DACAS:
2582      /* lock */
2583      *p++ = 0xF0;
2584      /* cmpxchg8b m64.  Expected-value in %edx:%eax, new value
2585         in %ecx:%ebx.  All 4 regs are hardwired in the ISA, so
2586         aren't encoded in the insn. */
2587      *p++ = 0x0F;
2588      *p++ = 0xC7;
2589      p = doAMode_M(p, fake(1), i->Xin.DACAS.addr);
2590      goto done;
2591
2592   case Xin_Store:
2593      if (i->Xin.Store.sz == 2) {
2594         /* This case, at least, is simple, given that we can
2595            reference the low 16 bits of any integer register. */
2596         *p++ = 0x66;
2597         *p++ = 0x89;
2598         p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2599         goto done;
2600      }
2601
2602      if (i->Xin.Store.sz == 1) {
2603         /* We have to do complex dodging and weaving if src is not
2604            the low 8 bits of %eax/%ebx/%ecx/%edx. */
2605         if (iregNo(i->Xin.Store.src) < 4) {
2606            /* we're OK, can do it directly */
2607            *p++ = 0x88;
2608            p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
            goto done;
2610         } else {
2611            /* Bleh.  This means the source is %edi or %esi.  Since
2612               the address mode can only mention three registers, at
2613               least one of %eax/%ebx/%ecx/%edx must be available to
2614               temporarily swap the source into, so the store can
2615               happen.  So we have to look at the regs mentioned
2616               in the amode. */
2617            HReg swap = INVALID_HREG;
2618            HReg  eax = hregX86_EAX(), ebx = hregX86_EBX(),
2619                  ecx = hregX86_ECX(), edx = hregX86_EDX();
2620            Bool a_ok = True, b_ok = True, c_ok = True, d_ok = True;
2621            HRegUsage u;
2622            Int j;
2623            initHRegUsage(&u);
2624            addRegUsage_X86AMode(&u,  i->Xin.Store.dst);
2625            for (j = 0; j < u.n_used; j++) {
2626               HReg r = u.hreg[j];
2627               if (r == eax) a_ok = False;
2628               if (r == ebx) b_ok = False;
2629               if (r == ecx) c_ok = False;
2630               if (r == edx) d_ok = False;
2631            }
2632            if (a_ok) swap = eax;
2633            if (b_ok) swap = ebx;
2634            if (c_ok) swap = ecx;
2635            if (d_ok) swap = edx;
2636            vassert(swap != INVALID_HREG);
2637            /* xchgl %source, %swap. Could do better if swap is %eax. */
2638            *p++ = 0x87;
2639            p = doAMode_R(p, i->Xin.Store.src, swap);
2640            /* movb lo8{%swap}, (dst) */
2641            *p++ = 0x88;
2642            p = doAMode_M(p, swap, i->Xin.Store.dst);
2643            /* xchgl %source, %swap. Could do better if swap is %eax. */
2644            *p++ = 0x87;
2645            p = doAMode_R(p, i->Xin.Store.src, swap);
2646            goto done;
2647         }
2648      } /* if (i->Xin.Store.sz == 1) */
2649      break;
2650
2651   case Xin_FpUnary:
2652      /* gop %src, %dst
2653         --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
2654      */
2655      p = do_ffree_st7(p);
2656      p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
2657      p = do_fop1_st(p, i->Xin.FpUnary.op);
2658      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
2659      goto done;
2660
2661   case Xin_FpBinary:
2662      if (i->Xin.FpBinary.op == Xfp_YL2X
2663          || i->Xin.FpBinary.op == Xfp_YL2XP1) {
2664         /* Have to do this specially. */
2665         /* ffree %st7 ; fld %st(srcL) ;
2666            ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
2667         p = do_ffree_st7(p);
2668         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2669         p = do_ffree_st7(p);
2670         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2671         *p++ = 0xD9;
2672         *p++ = toUChar(i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9);
2673         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2674         goto done;
2675      }
2676      if (i->Xin.FpBinary.op == Xfp_ATAN) {
2677         /* Have to do this specially. */
2678         /* ffree %st7 ; fld %st(srcL) ;
2679            ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
2680         p = do_ffree_st7(p);
2681         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2682         p = do_ffree_st7(p);
2683         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2684         *p++ = 0xD9; *p++ = 0xF3;
2685         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2686         goto done;
2687      }
2688      if (i->Xin.FpBinary.op == Xfp_PREM
2689          || i->Xin.FpBinary.op == Xfp_PREM1
2690          || i->Xin.FpBinary.op == Xfp_SCALE) {
2691         /* Have to do this specially. */
2692         /* ffree %st7 ; fld %st(srcR) ;
2693            ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
2694            fincstp ; ffree %st7 */
2695         p = do_ffree_st7(p);
2696         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
2697         p = do_ffree_st7(p);
2698         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
2699         *p++ = 0xD9;
2700         switch (i->Xin.FpBinary.op) {
2701            case Xfp_PREM: *p++ = 0xF8; break;
2702            case Xfp_PREM1: *p++ = 0xF5; break;
            case Xfp_SCALE: *p++ = 0xFD; break;
2704            default: vpanic("emitX86Instr(FpBinary,PREM/PREM1/SCALE)");
2705         }
2706         p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
2707         *p++ = 0xD9; *p++ = 0xF7;
2708         p = do_ffree_st7(p);
2709         goto done;
2710      }
2711      /* General case */
2712      /* gop %srcL, %srcR, %dst
2713         --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
2714      */
2715      p = do_ffree_st7(p);
2716      p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2717      p = do_fop2_st(p, i->Xin.FpBinary.op,
2718                        1+hregNumber(i->Xin.FpBinary.srcR));
2719      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2720      goto done;
2721
2722   case Xin_FpLdSt:
2723      if (i->Xin.FpLdSt.isLoad) {
2724         /* Load from memory into %fakeN.
2725            --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
2726         */
2727         p = do_ffree_st7(p);
2728         switch (i->Xin.FpLdSt.sz) {
2729            case 4:
2730               *p++ = 0xD9;
2731               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2732               break;
2733            case 8:
2734               *p++ = 0xDD;
2735               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2736               break;
2737            case 10:
2738               *p++ = 0xDB;
2739               p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
2740               break;
2741            default:
2742               vpanic("emitX86Instr(FpLdSt,load)");
2743         }
2744         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
2745         goto done;
2746      } else {
2747         /* Store from %fakeN into memory.
2748            --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
2749	 */
2750         p = do_ffree_st7(p);
2751         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
2752         switch (i->Xin.FpLdSt.sz) {
2753            case 4:
2754               *p++ = 0xD9;
2755               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2756               break;
2757            case 8:
2758               *p++ = 0xDD;
2759               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2760               break;
2761            case 10:
2762               *p++ = 0xDB;
2763               p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
2764               break;
2765            default:
2766               vpanic("emitX86Instr(FpLdSt,store)");
2767         }
2768         goto done;
2769      }
2770      break;
2771
2772   case Xin_FpLdStI:
2773      if (i->Xin.FpLdStI.isLoad) {
2774         /* Load from memory into %fakeN, converting from an int.
2775            --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
2776         */
2777         switch (i->Xin.FpLdStI.sz) {
2778            case 8:  opc = 0xDF; subopc_imm = 5; break;
2779            case 4:  opc = 0xDB; subopc_imm = 0; break;
2780            case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
2781            default: vpanic("emitX86Instr(Xin_FpLdStI-load)");
2782         }
2783         p = do_ffree_st7(p);
2784         *p++ = toUChar(opc);
2785         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2786         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
2787         goto done;
2788      } else {
2789         /* Store from %fakeN into memory, converting to an int.
2790            --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
2791	 */
2792         switch (i->Xin.FpLdStI.sz) {
2793            case 8:  opc = 0xDF; subopc_imm = 7; break;
2794            case 4:  opc = 0xDB; subopc_imm = 3; break;
2795            case 2:  opc = 0xDF; subopc_imm = 3; break;
2796            default: vpanic("emitX86Instr(Xin_FpLdStI-store)");
2797         }
2798         p = do_ffree_st7(p);
2799         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
2800         *p++ = toUChar(opc);
2801         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2802         goto done;
2803      }
2804      break;
2805
2806   case Xin_Fp64to32:
2807      /* ffree %st7 ; fld %st(src) */
2808      p = do_ffree_st7(p);
2809      p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
2810      /* subl $4, %esp */
2811      *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
2812      /* fstps (%esp) */
2813      *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
2814      /* flds (%esp) */
2815      *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
2816      /* addl $4, %esp */
2817      *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
2818      /* fstp %st(1+dst) */
2819      p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
2820      goto done;
2821
2822   case Xin_FpCMov:
2823      /* jmp fwds if !condition */
2824      *p++ = toUChar(0x70 + (i->Xin.FpCMov.cond ^ 1));
2825      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2826      ptmp = p;
2827
2828      /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
2829      p = do_ffree_st7(p);
2830      p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
2831      p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
2832
2833      /* Fill in the jump offset. */
2834      *(ptmp-1) = toUChar(p - ptmp);
2835      goto done;
2836
2837   case Xin_FpLdCW:
2838      *p++ = 0xD9;
2839      p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdCW.addr);
2840      goto done;
2841
2842   case Xin_FpStSW_AX:
      /* note, this emits fnstsw %ax, not fstsw %ax (which is just
         fnstsw preceded by a wait) */
2844      *p++ = 0xDF;
2845      *p++ = 0xE0;
2846      goto done;
2847
2848   case Xin_FpCmp:
2849      /* gcmp %fL, %fR, %dst
2850         -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
2851            fnstsw %ax ; movl %eax, %dst
2852      */
2853      /* ffree %st7 */
2854      p = do_ffree_st7(p);
2855      /* fpush %fL */
2856      p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
2857      /* fucomp %(fR+1) */
2858      *p++ = 0xDD;
2859      *p++ = toUChar(0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR))));
2860      /* fnstsw %ax */
2861      *p++ = 0xDF;
2862      *p++ = 0xE0;
      /* movl %eax, %dst */
2864      *p++ = 0x89;
2865      p = doAMode_R(p, hregX86_EAX(), i->Xin.FpCmp.dst);
2866      goto done;
2867
2868   case Xin_SseConst: {
2869      UShort con = i->Xin.SseConst.con;
2870      p = push_word_from_tags(p, toUShort((con >> 12) & 0xF));
2871      p = push_word_from_tags(p, toUShort((con >> 8) & 0xF));
2872      p = push_word_from_tags(p, toUShort((con >> 4) & 0xF));
2873      p = push_word_from_tags(p, toUShort(con & 0xF));
2874      /* movl (%esp), %xmm-dst */
2875      *p++ = 0x0F;
2876      *p++ = 0x10;
2877      *p++ = toUChar(0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst)));
2878      *p++ = 0x24;
2879      /* addl $16, %esp */
2880      *p++ = 0x83;
2881      *p++ = 0xC4;
2882      *p++ = 0x10;
2883      goto done;
2884   }
2885
2886   case Xin_SseLdSt:
2887      *p++ = 0x0F;
2888      *p++ = toUChar(i->Xin.SseLdSt.isLoad ? 0x10 : 0x11);
2889      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdSt.reg)), i->Xin.SseLdSt.addr);
2890      goto done;
2891
2892   case Xin_SseLdzLO:
2893      vassert(i->Xin.SseLdzLO.sz == 4 || i->Xin.SseLdzLO.sz == 8);
2894      /* movs[sd] amode, %xmm-dst */
2895      *p++ = toUChar(i->Xin.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
2896      *p++ = 0x0F;
2897      *p++ = 0x10;
2898      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdzLO.reg)),
2899                       i->Xin.SseLdzLO.addr);
2900      goto done;
2901
2902   case Xin_Sse32Fx4:
2903      xtra = 0;
2904      *p++ = 0x0F;
2905      switch (i->Xin.Sse32Fx4.op) {
2906         case Xsse_ADDF:   *p++ = 0x58; break;
2907         case Xsse_DIVF:   *p++ = 0x5E; break;
2908         case Xsse_MAXF:   *p++ = 0x5F; break;
2909         case Xsse_MINF:   *p++ = 0x5D; break;
2910         case Xsse_MULF:   *p++ = 0x59; break;
2911         case Xsse_RCPF:   *p++ = 0x53; break;
2912         case Xsse_RSQRTF: *p++ = 0x52; break;
2913         case Xsse_SQRTF:  *p++ = 0x51; break;
2914         case Xsse_SUBF:   *p++ = 0x5C; break;
2915         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2916         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2917         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2918         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2919         default: goto bad;
2920      }
2921      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32Fx4.dst)),
2922                       fake(vregNo(i->Xin.Sse32Fx4.src)) );
2923      if (xtra & 0x100)
2924         *p++ = toUChar(xtra & 0xFF);
2925      goto done;
2926
2927   case Xin_Sse64Fx2:
2928      xtra = 0;
2929      *p++ = 0x66;
2930      *p++ = 0x0F;
2931      switch (i->Xin.Sse64Fx2.op) {
2932         case Xsse_ADDF:   *p++ = 0x58; break;
2933         case Xsse_DIVF:   *p++ = 0x5E; break;
2934         case Xsse_MAXF:   *p++ = 0x5F; break;
2935         case Xsse_MINF:   *p++ = 0x5D; break;
2936         case Xsse_MULF:   *p++ = 0x59; break;
2937         case Xsse_RCPF:   *p++ = 0x53; break;
2938         case Xsse_RSQRTF: *p++ = 0x52; break;
2939         case Xsse_SQRTF:  *p++ = 0x51; break;
2940         case Xsse_SUBF:   *p++ = 0x5C; break;
2941         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2942         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2943         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2944         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2945         default: goto bad;
2946      }
2947      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64Fx2.dst)),
2948                       fake(vregNo(i->Xin.Sse64Fx2.src)) );
2949      if (xtra & 0x100)
2950         *p++ = toUChar(xtra & 0xFF);
2951      goto done;
2952
2953   case Xin_Sse32FLo:
2954      xtra = 0;
2955      *p++ = 0xF3;
2956      *p++ = 0x0F;
2957      switch (i->Xin.Sse32FLo.op) {
2958         case Xsse_ADDF:   *p++ = 0x58; break;
2959         case Xsse_DIVF:   *p++ = 0x5E; break;
2960         case Xsse_MAXF:   *p++ = 0x5F; break;
2961         case Xsse_MINF:   *p++ = 0x5D; break;
2962         case Xsse_MULF:   *p++ = 0x59; break;
2963         case Xsse_RCPF:   *p++ = 0x53; break;
2964         case Xsse_RSQRTF: *p++ = 0x52; break;
2965         case Xsse_SQRTF:  *p++ = 0x51; break;
2966         case Xsse_SUBF:   *p++ = 0x5C; break;
2967         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2968         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2969         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2970         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2971         default: goto bad;
2972      }
2973      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32FLo.dst)),
2974                       fake(vregNo(i->Xin.Sse32FLo.src)) );
2975      if (xtra & 0x100)
2976         *p++ = toUChar(xtra & 0xFF);
2977      goto done;
2978
2979   case Xin_Sse64FLo:
2980      xtra = 0;
2981      *p++ = 0xF2;
2982      *p++ = 0x0F;
2983      switch (i->Xin.Sse64FLo.op) {
2984         case Xsse_ADDF:   *p++ = 0x58; break;
2985         case Xsse_DIVF:   *p++ = 0x5E; break;
2986         case Xsse_MAXF:   *p++ = 0x5F; break;
2987         case Xsse_MINF:   *p++ = 0x5D; break;
2988         case Xsse_MULF:   *p++ = 0x59; break;
2989         case Xsse_RCPF:   *p++ = 0x53; break;
2990         case Xsse_RSQRTF: *p++ = 0x52; break;
2991         case Xsse_SQRTF:  *p++ = 0x51; break;
2992         case Xsse_SUBF:   *p++ = 0x5C; break;
2993         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2994         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2995         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2996         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2997         default: goto bad;
2998      }
2999      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64FLo.dst)),
3000                       fake(vregNo(i->Xin.Sse64FLo.src)) );
3001      if (xtra & 0x100)
3002         *p++ = toUChar(xtra & 0xFF);
3003      goto done;
3004
3005   case Xin_SseReRg:
3006#     define XX(_n) *p++ = (_n)
3007      switch (i->Xin.SseReRg.op) {
3008         case Xsse_MOV:     /*movups*/ XX(0x0F); XX(0x10); break;
3009         case Xsse_OR:                 XX(0x0F); XX(0x56); break;
3010         case Xsse_XOR:                XX(0x0F); XX(0x57); break;
3011         case Xsse_AND:                XX(0x0F); XX(0x54); break;
3012         case Xsse_PACKSSD:  XX(0x66); XX(0x0F); XX(0x6B); break;
3013         case Xsse_PACKSSW:  XX(0x66); XX(0x0F); XX(0x63); break;
3014         case Xsse_PACKUSW:  XX(0x66); XX(0x0F); XX(0x67); break;
3015         case Xsse_ADD8:     XX(0x66); XX(0x0F); XX(0xFC); break;
3016         case Xsse_ADD16:    XX(0x66); XX(0x0F); XX(0xFD); break;
3017         case Xsse_ADD32:    XX(0x66); XX(0x0F); XX(0xFE); break;
3018         case Xsse_ADD64:    XX(0x66); XX(0x0F); XX(0xD4); break;
3019         case Xsse_QADD8S:   XX(0x66); XX(0x0F); XX(0xEC); break;
3020         case Xsse_QADD16S:  XX(0x66); XX(0x0F); XX(0xED); break;
3021         case Xsse_QADD8U:   XX(0x66); XX(0x0F); XX(0xDC); break;
3022         case Xsse_QADD16U:  XX(0x66); XX(0x0F); XX(0xDD); break;
3023         case Xsse_AVG8U:    XX(0x66); XX(0x0F); XX(0xE0); break;
3024         case Xsse_AVG16U:   XX(0x66); XX(0x0F); XX(0xE3); break;
3025         case Xsse_CMPEQ8:   XX(0x66); XX(0x0F); XX(0x74); break;
3026         case Xsse_CMPEQ16:  XX(0x66); XX(0x0F); XX(0x75); break;
3027         case Xsse_CMPEQ32:  XX(0x66); XX(0x0F); XX(0x76); break;
3028         case Xsse_CMPGT8S:  XX(0x66); XX(0x0F); XX(0x64); break;
3029         case Xsse_CMPGT16S: XX(0x66); XX(0x0F); XX(0x65); break;
3030         case Xsse_CMPGT32S: XX(0x66); XX(0x0F); XX(0x66); break;
3031         case Xsse_MAX16S:   XX(0x66); XX(0x0F); XX(0xEE); break;
3032         case Xsse_MAX8U:    XX(0x66); XX(0x0F); XX(0xDE); break;
3033         case Xsse_MIN16S:   XX(0x66); XX(0x0F); XX(0xEA); break;
3034         case Xsse_MIN8U:    XX(0x66); XX(0x0F); XX(0xDA); break;
3035         case Xsse_MULHI16U: XX(0x66); XX(0x0F); XX(0xE4); break;
3036         case Xsse_MULHI16S: XX(0x66); XX(0x0F); XX(0xE5); break;
3037         case Xsse_MUL16:    XX(0x66); XX(0x0F); XX(0xD5); break;
3038         case Xsse_SHL16:    XX(0x66); XX(0x0F); XX(0xF1); break;
3039         case Xsse_SHL32:    XX(0x66); XX(0x0F); XX(0xF2); break;
3040         case Xsse_SHL64:    XX(0x66); XX(0x0F); XX(0xF3); break;
3041         case Xsse_SAR16:    XX(0x66); XX(0x0F); XX(0xE1); break;
3042         case Xsse_SAR32:    XX(0x66); XX(0x0F); XX(0xE2); break;
3043         case Xsse_SHR16:    XX(0x66); XX(0x0F); XX(0xD1); break;
3044         case Xsse_SHR32:    XX(0x66); XX(0x0F); XX(0xD2); break;
3045         case Xsse_SHR64:    XX(0x66); XX(0x0F); XX(0xD3); break;
3046         case Xsse_SUB8:     XX(0x66); XX(0x0F); XX(0xF8); break;
3047         case Xsse_SUB16:    XX(0x66); XX(0x0F); XX(0xF9); break;
3048         case Xsse_SUB32:    XX(0x66); XX(0x0F); XX(0xFA); break;
3049         case Xsse_SUB64:    XX(0x66); XX(0x0F); XX(0xFB); break;
3050         case Xsse_QSUB8S:   XX(0x66); XX(0x0F); XX(0xE8); break;
3051         case Xsse_QSUB16S:  XX(0x66); XX(0x0F); XX(0xE9); break;
3052         case Xsse_QSUB8U:   XX(0x66); XX(0x0F); XX(0xD8); break;
3053         case Xsse_QSUB16U:  XX(0x66); XX(0x0F); XX(0xD9); break;
3054         case Xsse_UNPCKHB:  XX(0x66); XX(0x0F); XX(0x68); break;
3055         case Xsse_UNPCKHW:  XX(0x66); XX(0x0F); XX(0x69); break;
3056         case Xsse_UNPCKHD:  XX(0x66); XX(0x0F); XX(0x6A); break;
3057         case Xsse_UNPCKHQ:  XX(0x66); XX(0x0F); XX(0x6D); break;
3058         case Xsse_UNPCKLB:  XX(0x66); XX(0x0F); XX(0x60); break;
3059         case Xsse_UNPCKLW:  XX(0x66); XX(0x0F); XX(0x61); break;
3060         case Xsse_UNPCKLD:  XX(0x66); XX(0x0F); XX(0x62); break;
3061         case Xsse_UNPCKLQ:  XX(0x66); XX(0x0F); XX(0x6C); break;
3062         default: goto bad;
3063      }
3064      p = doAMode_R(p, fake(vregNo(i->Xin.SseReRg.dst)),
3065                       fake(vregNo(i->Xin.SseReRg.src)) );
3066#     undef XX
3067      goto done;
3068
3069   case Xin_SseCMov:
3070      /* jmp fwds if !condition */
3071      *p++ = toUChar(0x70 + (i->Xin.SseCMov.cond ^ 1));
3072      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3073      ptmp = p;
3074
3075      /* movaps %src, %dst */
3076      *p++ = 0x0F;
3077      *p++ = 0x28;
3078      p = doAMode_R(p, fake(vregNo(i->Xin.SseCMov.dst)),
3079                       fake(vregNo(i->Xin.SseCMov.src)) );
3080
3081      /* Fill in the jump offset. */
3082      *(ptmp-1) = toUChar(p - ptmp);
3083      goto done;
3084
3085   case Xin_SseShuf:
3086      *p++ = 0x66;
3087      *p++ = 0x0F;
3088      *p++ = 0x70;
3089      p = doAMode_R(p, fake(vregNo(i->Xin.SseShuf.dst)),
3090                       fake(vregNo(i->Xin.SseShuf.src)) );
3091      *p++ = (UChar)(i->Xin.SseShuf.order);
3092      goto done;
3093
3094   default:
3095      goto bad;
3096   }
3097
3098  bad:
3099   ppX86Instr(i, mode64);
3100   vpanic("emit_X86Instr");
3101   /*NOTREACHED*/
3102
3103  done:
3104   vassert(p - &buf[0] <= 32);
3105   return p - &buf[0];
3106
3107#  undef fake
3108}
3109
3110/*---------------------------------------------------------------*/
3111/*--- end                                     host_x86_defs.c ---*/
3112/*---------------------------------------------------------------*/
3113