/* host_x86_defs.c, revision 9bea4c13fca0e3bb4b719dcb3ed63d47d479294e */
2/*---------------------------------------------------------------*/
3/*--- begin                                   host_x86_defs.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2010 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex.h"
38#include "libvex_trc_values.h"
39
40#include "main_util.h"
41#include "host_generic_regs.h"
42#include "host_x86_defs.h"
43
44
45/* --------- Registers. --------- */
46
/* Print the name of a host register.  Virtual registers are printed
   generically via ppHReg; real registers get their x86 names.
   HRcFlt64 regs print as "%fakeN" (presumably placeholders for the
   x87 register file -- confirm against the register allocator). */
void ppHRegX86 ( HReg reg )
{
   Int r;
   static HChar* ireg32_names[8]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt32:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 8);
         vex_printf("%s", ireg32_names[r]);
         return;
      case HRcFlt64:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 6);  /* only 6 fake FP regs exist */
         vex_printf("%%fake%d", r);
         return;
      case HRcVec128:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 8);
         vex_printf("%%xmm%d", r);
         return;
      default:
         vpanic("ppHRegX86");
   }
}
78
/* Real 32-bit integer registers, numbered with the Intel encoding
   (eax=0 .. edi=7). */
HReg hregX86_EAX ( void ) { return mkHReg(0, HRcInt32, False); }
HReg hregX86_ECX ( void ) { return mkHReg(1, HRcInt32, False); }
HReg hregX86_EDX ( void ) { return mkHReg(2, HRcInt32, False); }
HReg hregX86_EBX ( void ) { return mkHReg(3, HRcInt32, False); }
HReg hregX86_ESP ( void ) { return mkHReg(4, HRcInt32, False); }
HReg hregX86_EBP ( void ) { return mkHReg(5, HRcInt32, False); }
HReg hregX86_ESI ( void ) { return mkHReg(6, HRcInt32, False); }
HReg hregX86_EDI ( void ) { return mkHReg(7, HRcInt32, False); }
87
/* The six "fake" FP registers (class HRcFlt64); see ppHRegX86, which
   prints them as %fake0 .. %fake5. */
HReg hregX86_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
HReg hregX86_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
HReg hregX86_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
HReg hregX86_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
HReg hregX86_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
HReg hregX86_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
94
/* The eight 128-bit SSE registers %xmm0 .. %xmm7 (class HRcVec128). */
HReg hregX86_XMM0 ( void ) { return mkHReg(0, HRcVec128, False); }
HReg hregX86_XMM1 ( void ) { return mkHReg(1, HRcVec128, False); }
HReg hregX86_XMM2 ( void ) { return mkHReg(2, HRcVec128, False); }
HReg hregX86_XMM3 ( void ) { return mkHReg(3, HRcVec128, False); }
HReg hregX86_XMM4 ( void ) { return mkHReg(4, HRcVec128, False); }
HReg hregX86_XMM5 ( void ) { return mkHReg(5, HRcVec128, False); }
HReg hregX86_XMM6 ( void ) { return mkHReg(6, HRcVec128, False); }
HReg hregX86_XMM7 ( void ) { return mkHReg(7, HRcVec128, False); }
103
104
105void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
106{
107   *nregs = 20;
108   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
109   (*arr)[0] = hregX86_EAX();
110   (*arr)[1] = hregX86_EBX();
111   (*arr)[2] = hregX86_ECX();
112   (*arr)[3] = hregX86_EDX();
113   (*arr)[4] = hregX86_ESI();
114   (*arr)[5] = hregX86_EDI();
115   (*arr)[6] = hregX86_FAKE0();
116   (*arr)[7] = hregX86_FAKE1();
117   (*arr)[8] = hregX86_FAKE2();
118   (*arr)[9] = hregX86_FAKE3();
119   (*arr)[10] = hregX86_FAKE4();
120   (*arr)[11] = hregX86_FAKE5();
121   (*arr)[12] = hregX86_XMM0();
122   (*arr)[13] = hregX86_XMM1();
123   (*arr)[14] = hregX86_XMM2();
124   (*arr)[15] = hregX86_XMM3();
125   (*arr)[16] = hregX86_XMM4();
126   (*arr)[17] = hregX86_XMM5();
127   (*arr)[18] = hregX86_XMM6();
128   (*arr)[19] = hregX86_XMM7();
129}
130
131
132/* --------- Condition codes, Intel encoding. --------- */
133
134HChar* showX86CondCode ( X86CondCode cond )
135{
136   switch (cond) {
137      case Xcc_O:      return "o";
138      case Xcc_NO:     return "no";
139      case Xcc_B:      return "b";
140      case Xcc_NB:     return "nb";
141      case Xcc_Z:      return "z";
142      case Xcc_NZ:     return "nz";
143      case Xcc_BE:     return "be";
144      case Xcc_NBE:    return "nbe";
145      case Xcc_S:      return "s";
146      case Xcc_NS:     return "ns";
147      case Xcc_P:      return "p";
148      case Xcc_NP:     return "np";
149      case Xcc_L:      return "l";
150      case Xcc_NL:     return "nl";
151      case Xcc_LE:     return "le";
152      case Xcc_NLE:    return "nle";
153      case Xcc_ALWAYS: return "ALWAYS";
154      default: vpanic("ppX86CondCode");
155   }
156}
157
158
159/* --------- X86AMode: memory address expressions. --------- */
160
/* Build a displacement+register amode: imm32(reg). */
X86AMode* X86AMode_IR ( UInt imm32, HReg reg ) {
   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
   am->tag = Xam_IR;
   am->Xam.IR.imm = imm32;
   am->Xam.IR.reg = reg;
   return am;
}
/* Build a scaled-index amode: imm32(base,index,1<<shift). */
X86AMode* X86AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
   X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
   am->tag = Xam_IRRS;
   am->Xam.IRRS.imm = imm32;
   am->Xam.IRRS.base = base;
   am->Xam.IRRS.index = indEx;
   am->Xam.IRRS.shift = shift;
   vassert(shift >= 0 && shift <= 3);  /* x86 scale is 1, 2, 4 or 8 */
   return am;
}
178
179X86AMode* dopyX86AMode ( X86AMode* am ) {
180   switch (am->tag) {
181      case Xam_IR:
182         return X86AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
183      case Xam_IRRS:
184         return X86AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
185                               am->Xam.IRRS.index, am->Xam.IRRS.shift );
186      default:
187         vpanic("dopyX86AMode");
188   }
189}
190
/* Print an amode in AT&T syntax: "disp(reg)" or
   "disp(base,index,scale)".  A zero displacement in the IR form is
   omitted entirely. */
void ppX86AMode ( X86AMode* am ) {
   switch (am->tag) {
      case Xam_IR:
         if (am->Xam.IR.imm == 0)
            vex_printf("(");
         else
            vex_printf("0x%x(", am->Xam.IR.imm);
         ppHRegX86(am->Xam.IR.reg);
         vex_printf(")");
         return;
      case Xam_IRRS:
         vex_printf("0x%x(", am->Xam.IRRS.imm);
         ppHRegX86(am->Xam.IRRS.base);
         vex_printf(",");
         ppHRegX86(am->Xam.IRRS.index);
         /* print the scale (1,2,4,8), not the raw shift amount */
         vex_printf(",%d)", 1 << am->Xam.IRRS.shift);
         return;
      default:
         vpanic("ppX86AMode");
   }
}
212
213static void addRegUsage_X86AMode ( HRegUsage* u, X86AMode* am ) {
214   switch (am->tag) {
215      case Xam_IR:
216         addHRegUse(u, HRmRead, am->Xam.IR.reg);
217         return;
218      case Xam_IRRS:
219         addHRegUse(u, HRmRead, am->Xam.IRRS.base);
220         addHRegUse(u, HRmRead, am->Xam.IRRS.index);
221         return;
222      default:
223         vpanic("addRegUsage_X86AMode");
224   }
225}
226
227static void mapRegs_X86AMode ( HRegRemap* m, X86AMode* am ) {
228   switch (am->tag) {
229      case Xam_IR:
230         am->Xam.IR.reg = lookupHRegRemap(m, am->Xam.IR.reg);
231         return;
232      case Xam_IRRS:
233         am->Xam.IRRS.base = lookupHRegRemap(m, am->Xam.IRRS.base);
234         am->Xam.IRRS.index = lookupHRegRemap(m, am->Xam.IRRS.index);
235         return;
236      default:
237         vpanic("mapRegs_X86AMode");
238   }
239}
240
241/* --------- Operand, which can be reg, immediate or memory. --------- */
242
/* Build an immediate operand. */
X86RMI* X86RMI_Imm ( UInt imm32 ) {
   X86RMI* op         = LibVEX_Alloc(sizeof(X86RMI));
   op->tag            = Xrmi_Imm;
   op->Xrmi.Imm.imm32 = imm32;
   return op;
}
/* Build a register operand. */
X86RMI* X86RMI_Reg ( HReg reg ) {
   X86RMI* op       = LibVEX_Alloc(sizeof(X86RMI));
   op->tag          = Xrmi_Reg;
   op->Xrmi.Reg.reg = reg;
   return op;
}
/* Build a memory operand; the amode is referenced, not copied. */
X86RMI* X86RMI_Mem ( X86AMode* am ) {
   X86RMI* op      = LibVEX_Alloc(sizeof(X86RMI));
   op->tag         = Xrmi_Mem;
   op->Xrmi.Mem.am = am;
   return op;
}
261
/* Print a reg/mem/immediate operand in AT&T syntax. */
void ppX86RMI ( X86RMI* op ) {
   switch (op->tag) {
      case Xrmi_Imm:
         vex_printf("$0x%x", op->Xrmi.Imm.imm32);
         return;
      case Xrmi_Reg:
         ppHRegX86(op->Xrmi.Reg.reg);
         return;
      case Xrmi_Mem:
         ppX86AMode(op->Xrmi.Mem.am);
         return;
     default:
         vpanic("ppX86RMI");
   }
}
277
278/* An X86RMI can only be used in a "read" context (what would it mean
279   to write or modify a literal?) and so we enumerate its registers
280   accordingly. */
281static void addRegUsage_X86RMI ( HRegUsage* u, X86RMI* op ) {
282   switch (op->tag) {
283      case Xrmi_Imm:
284         return;
285      case Xrmi_Reg:
286         addHRegUse(u, HRmRead, op->Xrmi.Reg.reg);
287         return;
288      case Xrmi_Mem:
289         addRegUsage_X86AMode(u, op->Xrmi.Mem.am);
290         return;
291      default:
292         vpanic("addRegUsage_X86RMI");
293   }
294}
295
296static void mapRegs_X86RMI ( HRegRemap* m, X86RMI* op ) {
297   switch (op->tag) {
298      case Xrmi_Imm:
299         return;
300      case Xrmi_Reg:
301         op->Xrmi.Reg.reg = lookupHRegRemap(m, op->Xrmi.Reg.reg);
302         return;
303      case Xrmi_Mem:
304         mapRegs_X86AMode(m, op->Xrmi.Mem.am);
305         return;
306      default:
307         vpanic("mapRegs_X86RMI");
308   }
309}
310
311
312/* --------- Operand, which can be reg or immediate only. --------- */
313
/* Build an immediate operand. */
X86RI* X86RI_Imm ( UInt imm32 ) {
   X86RI* op         = LibVEX_Alloc(sizeof(X86RI));
   op->tag           = Xri_Imm;
   op->Xri.Imm.imm32 = imm32;
   return op;
}
/* Build a register operand. */
X86RI* X86RI_Reg ( HReg reg ) {
   X86RI* op       = LibVEX_Alloc(sizeof(X86RI));
   op->tag         = Xri_Reg;
   op->Xri.Reg.reg = reg;
   return op;
}
326
327void ppX86RI ( X86RI* op ) {
328   switch (op->tag) {
329      case Xri_Imm:
330         vex_printf("$0x%x", op->Xri.Imm.imm32);
331         return;
332      case Xri_Reg:
333         ppHRegX86(op->Xri.Reg.reg);
334         return;
335     default:
336         vpanic("ppX86RI");
337   }
338}
339
340/* An X86RI can only be used in a "read" context (what would it mean
341   to write or modify a literal?) and so we enumerate its registers
342   accordingly. */
343static void addRegUsage_X86RI ( HRegUsage* u, X86RI* op ) {
344   switch (op->tag) {
345      case Xri_Imm:
346         return;
347      case Xri_Reg:
348         addHRegUse(u, HRmRead, op->Xri.Reg.reg);
349         return;
350      default:
351         vpanic("addRegUsage_X86RI");
352   }
353}
354
355static void mapRegs_X86RI ( HRegRemap* m, X86RI* op ) {
356   switch (op->tag) {
357      case Xri_Imm:
358         return;
359      case Xri_Reg:
360         op->Xri.Reg.reg = lookupHRegRemap(m, op->Xri.Reg.reg);
361         return;
362      default:
363         vpanic("mapRegs_X86RI");
364   }
365}
366
367
368/* --------- Operand, which can be reg or memory only. --------- */
369
/* Build a register operand. */
X86RM* X86RM_Reg ( HReg reg ) {
   X86RM* op       = LibVEX_Alloc(sizeof(X86RM));
   op->tag         = Xrm_Reg;
   op->Xrm.Reg.reg = reg;
   return op;
}
/* Build a memory operand; the amode is referenced, not copied. */
X86RM* X86RM_Mem ( X86AMode* am ) {
   X86RM* op      = LibVEX_Alloc(sizeof(X86RM));
   op->tag        = Xrm_Mem;
   op->Xrm.Mem.am = am;
   return op;
}
382
383void ppX86RM ( X86RM* op ) {
384   switch (op->tag) {
385      case Xrm_Mem:
386         ppX86AMode(op->Xrm.Mem.am);
387         return;
388      case Xrm_Reg:
389         ppHRegX86(op->Xrm.Reg.reg);
390         return;
391     default:
392         vpanic("ppX86RM");
393   }
394}
395
396/* Because an X86RM can be both a source or destination operand, we
397   have to supply a mode -- pertaining to the operand as a whole --
398   indicating how it's being used. */
/* Because an X86RM can be both a source or destination operand, we
   have to supply a mode -- pertaining to the operand as a whole --
   indicating how it's being used. */
static void addRegUsage_X86RM ( HRegUsage* u, X86RM* op, HRegMode mode ) {
   switch (op->tag) {
      case Xrm_Mem:
         /* Memory is read, written or modified.  So we just want to
            know the regs read by the amode. */
         addRegUsage_X86AMode(u, op->Xrm.Mem.am);
         return;
      case Xrm_Reg:
         /* reg is read, written or modified.  Add it in the
            appropriate way. */
         addHRegUse(u, mode, op->Xrm.Reg.reg);
         return;
     default:
         vpanic("addRegUsage_X86RM");
   }
}
415
416static void mapRegs_X86RM ( HRegRemap* m, X86RM* op )
417{
418   switch (op->tag) {
419      case Xrm_Mem:
420         mapRegs_X86AMode(m, op->Xrm.Mem.am);
421         return;
422      case Xrm_Reg:
423         op->Xrm.Reg.reg = lookupHRegRemap(m, op->Xrm.Reg.reg);
424         return;
425     default:
426         vpanic("mapRegs_X86RM");
427   }
428}
429
430
431/* --------- Instructions. --------- */
432
433HChar* showX86UnaryOp ( X86UnaryOp op ) {
434   switch (op) {
435      case Xun_NOT: return "not";
436      case Xun_NEG: return "neg";
437      default: vpanic("showX86UnaryOp");
438   }
439}
440
/* Return the mnemonic for a binary ALU op.  Strings are static. */
HChar* showX86AluOp ( X86AluOp op ) {
   switch (op) {
      case Xalu_MOV:  return "mov";
      case Xalu_CMP:  return "cmp";
      case Xalu_ADD:  return "add";
      case Xalu_SUB:  return "sub";
      case Xalu_ADC:  return "adc";
      case Xalu_SBB:  return "sbb";
      case Xalu_AND:  return "and";
      case Xalu_OR:   return "or";
      case Xalu_XOR:  return "xor";
      case Xalu_MUL:  return "mul";
      default: vpanic("showX86AluOp");
   }
}
456
457HChar* showX86ShiftOp ( X86ShiftOp op ) {
458   switch (op) {
459      case Xsh_SHL: return "shl";
460      case Xsh_SHR: return "shr";
461      case Xsh_SAR: return "sar";
462      default: vpanic("showX86ShiftOp");
463   }
464}
465
/* Return the mnemonic (sans the leading 'f') for an x87 FP op.
   Strings are static.  Note Xfp_NEG prints as "chs", matching the
   x87 fchs instruction. */
HChar* showX86FpOp ( X86FpOp op ) {
   switch (op) {
      case Xfp_ADD:    return "add";
      case Xfp_SUB:    return "sub";
      case Xfp_MUL:    return "mul";
      case Xfp_DIV:    return "div";
      case Xfp_SCALE:  return "scale";
      case Xfp_ATAN:   return "atan";
      case Xfp_YL2X:   return "yl2x";
      case Xfp_YL2XP1: return "yl2xp1";
      case Xfp_PREM:   return "prem";
      case Xfp_PREM1:  return "prem1";
      case Xfp_SQRT:   return "sqrt";
      case Xfp_ABS:    return "abs";
      case Xfp_NEG:    return "chs";
      case Xfp_MOV:    return "mov";
      case Xfp_SIN:    return "sin";
      case Xfp_COS:    return "cos";
      case Xfp_TAN:    return "tan";
      case Xfp_ROUND:  return "round";
      case Xfp_2XM1:   return "2xm1";
      default: vpanic("showX86FpOp");
   }
}
490
/* Return the mnemonic for an SSE op.  Strings are static.  The
   scalar/packed and single/double distinction is not encoded here;
   callers supply the size suffix.  Xsse_MOV prints as "mov(?!)" --
   presumably it is not expected to reach a printer (confirm against
   the Xsse_MOV vasserts in the constructors below). */
HChar* showX86SseOp ( X86SseOp op ) {
   switch (op) {
      case Xsse_MOV:      return "mov(?!)";
      case Xsse_ADDF:     return "add";
      case Xsse_SUBF:     return "sub";
      case Xsse_MULF:     return "mul";
      case Xsse_DIVF:     return "div";
      case Xsse_MAXF:     return "max";
      case Xsse_MINF:     return "min";
      case Xsse_CMPEQF:   return "cmpFeq";
      case Xsse_CMPLTF:   return "cmpFlt";
      case Xsse_CMPLEF:   return "cmpFle";
      case Xsse_CMPUNF:   return "cmpFun";
      case Xsse_RCPF:     return "rcp";
      case Xsse_RSQRTF:   return "rsqrt";
      case Xsse_SQRTF:    return "sqrt";
      case Xsse_AND:      return "and";
      case Xsse_OR:       return "or";
      case Xsse_XOR:      return "xor";
      case Xsse_ANDN:     return "andn";
      case Xsse_ADD8:     return "paddb";
      case Xsse_ADD16:    return "paddw";
      case Xsse_ADD32:    return "paddd";
      case Xsse_ADD64:    return "paddq";
      case Xsse_QADD8U:   return "paddusb";
      case Xsse_QADD16U:  return "paddusw";
      case Xsse_QADD8S:   return "paddsb";
      case Xsse_QADD16S:  return "paddsw";
      case Xsse_SUB8:     return "psubb";
      case Xsse_SUB16:    return "psubw";
      case Xsse_SUB32:    return "psubd";
      case Xsse_SUB64:    return "psubq";
      case Xsse_QSUB8U:   return "psubusb";
      case Xsse_QSUB16U:  return "psubusw";
      case Xsse_QSUB8S:   return "psubsb";
      case Xsse_QSUB16S:  return "psubsw";
      case Xsse_MUL16:    return "pmullw";
      case Xsse_MULHI16U: return "pmulhuw";
      case Xsse_MULHI16S: return "pmulhw";
      case Xsse_AVG8U:    return "pavgb";
      case Xsse_AVG16U:   return "pavgw";
      case Xsse_MAX16S:   return "pmaxw";
      case Xsse_MAX8U:    return "pmaxub";
      case Xsse_MIN16S:   return "pminw";
      case Xsse_MIN8U:    return "pminub";
      case Xsse_CMPEQ8:   return "pcmpeqb";
      case Xsse_CMPEQ16:  return "pcmpeqw";
      case Xsse_CMPEQ32:  return "pcmpeqd";
      case Xsse_CMPGT8S:  return "pcmpgtb";
      case Xsse_CMPGT16S: return "pcmpgtw";
      case Xsse_CMPGT32S: return "pcmpgtd";
      case Xsse_SHL16:    return "psllw";
      case Xsse_SHL32:    return "pslld";
      case Xsse_SHL64:    return "psllq";
      case Xsse_SHR16:    return "psrlw";
      case Xsse_SHR32:    return "psrld";
      case Xsse_SHR64:    return "psrlq";
      case Xsse_SAR16:    return "psraw";
      case Xsse_SAR32:    return "psrad";
      case Xsse_PACKSSD:  return "packssdw";
      case Xsse_PACKSSW:  return "packsswb";
      case Xsse_PACKUSW:  return "packuswb";
      case Xsse_UNPCKHB:  return "punpckhb";
      case Xsse_UNPCKHW:  return "punpckhw";
      case Xsse_UNPCKHD:  return "punpckhd";
      case Xsse_UNPCKHQ:  return "punpckhq";
      case Xsse_UNPCKLB:  return "punpcklb";
      case Xsse_UNPCKLW:  return "punpcklw";
      case Xsse_UNPCKLD:  return "punpckld";
      case Xsse_UNPCKLQ:  return "punpcklq";
      default: vpanic("showX86SseOp");
   }
}
564
/* Constructors for X86Instr.  Each allocates a new instruction node
   with LibVEX_Alloc, fills in the tag and fields, and vasserts any
   constraints on the operands. */

/* op src,dst (register destination). */
X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Alu32R;
   i->Xin.Alu32R.op  = op;
   i->Xin.Alu32R.src = src;
   i->Xin.Alu32R.dst = dst;
   return i;
}
/* op src,dst (memory destination).  MUL has no memory-destination
   form, hence the vassert. */
X86Instr* X86Instr_Alu32M ( X86AluOp op, X86RI* src, X86AMode* dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Alu32M;
   i->Xin.Alu32M.op  = op;
   i->Xin.Alu32M.src = src;
   i->Xin.Alu32M.dst = dst;
   vassert(op != Xalu_MUL);
   return i;
}
/* 32-bit shift; src == 0 denotes a shift amount in %cl (see the
   printer), otherwise src is an immediate count. */
X86Instr* X86Instr_Sh32 ( X86ShiftOp op, UInt src, HReg dst ) {
   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
   i->tag          = Xin_Sh32;
   i->Xin.Sh32.op  = op;
   i->Xin.Sh32.src = src;
   i->Xin.Sh32.dst = dst;
   return i;
}
/* testl $imm32,dst. */
X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Test32;
   i->Xin.Test32.imm32 = imm32;
   i->Xin.Test32.dst   = dst;
   return i;
}
/* not/neg on a register. */
X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_Unary32;
   i->Xin.Unary32.op  = op;
   i->Xin.Unary32.dst = dst;
   return i;
}
/* leal am,dst. */
X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_Lea32;
   i->Xin.Lea32.am    = am;
   i->Xin.Lea32.dst   = dst;
   return i;
}
/* Widening 32x32->64 multiply, signed or unsigned. */
X86Instr* X86Instr_MulL ( Bool syned, X86RM* src ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_MulL;
   i->Xin.MulL.syned  = syned;
   i->Xin.MulL.src    = src;
   return i;
}
/* Division, signed or unsigned. */
X86Instr* X86Instr_Div ( Bool syned, X86RM* src ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_Div;
   i->Xin.Div.syned = syned;
   i->Xin.Div.src   = src;
   return i;
}
/* Double-length shift (shld/shrd); only SHL and SHR are valid.
   amt == 0 denotes a shift amount in %cl (see the printer). */
X86Instr* X86Instr_Sh3232  ( X86ShiftOp op, UInt amt, HReg src, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Sh3232;
   i->Xin.Sh3232.op  = op;
   i->Xin.Sh3232.amt = amt;
   i->Xin.Sh3232.src = src;
   i->Xin.Sh3232.dst = dst;
   vassert(op == Xsh_SHL || op == Xsh_SHR);
   return i;
}
/* pushl src. */
X86Instr* X86Instr_Push( X86RMI* src ) {
   X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
   i->tag          = Xin_Push;
   i->Xin.Push.src = src;
   return i;
}
/* Call to a fixed address, possibly conditional; regparms (0..3) is
   the number of arguments passed in registers. */
X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_Call;
   i->Xin.Call.cond     = cond;
   i->Xin.Call.target   = target;
   i->Xin.Call.regparms = regparms;
   vassert(regparms >= 0 && regparms <= 3);
   return i;
}
/* Jump to dst with the given jump kind, possibly conditional. */
X86Instr* X86Instr_Goto ( IRJumpKind jk, X86CondCode cond, X86RI* dst ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_Goto;
   i->Xin.Goto.cond = cond;
   i->Xin.Goto.dst  = dst;
   i->Xin.Goto.jk   = jk;
   return i;
}
/* Conditional move; an unconditional cmov makes no sense, hence the
   vassert. */
X86Instr* X86Instr_CMov32  ( X86CondCode cond, X86RM* src, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_CMov32;
   i->Xin.CMov32.cond = cond;
   i->Xin.CMov32.src  = src;
   i->Xin.CMov32.dst  = dst;
   vassert(cond != Xcc_ALWAYS);
   return i;
}
/* Sign/zero-widening load of a 1- or 2-byte value into a 32-bit
   register. */
X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
                            X86AMode* src, HReg dst ) {
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_LoadEX;
   i->Xin.LoadEX.szSmall = szSmall;
   i->Xin.LoadEX.syned   = syned;
   i->Xin.LoadEX.src     = src;
   i->Xin.LoadEX.dst     = dst;
   vassert(szSmall == 1 || szSmall == 2);
   return i;
}
/* Narrow (1- or 2-byte) store; 4-byte stores go via Alu32M MOV. */
X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_Store;
   i->Xin.Store.sz  = sz;
   i->Xin.Store.src = src;
   i->Xin.Store.dst = dst;
   vassert(sz == 1 || sz == 2);
   return i;
}
/* Set dst to 0 or 1 depending on the condition. */
X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Set32;
   i->Xin.Set32.cond = cond;
   i->Xin.Set32.dst  = dst;
   return i;
}
/* Bit scan: forwards (bsf) if isFwds, else reverse (bsr). */
X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_Bsfr32;
   i->Xin.Bsfr32.isFwds = isFwds;
   i->Xin.Bsfr32.src    = src;
   i->Xin.Bsfr32.dst    = dst;
   return i;
}
/* Memory fence; hwcaps records which SSE level is available so the
   emitter can pick an appropriate instruction sequence. */
X86Instr* X86Instr_MFence ( UInt hwcaps ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_MFence;
   i->Xin.MFence.hwcaps = hwcaps;
   vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_SSE1
                            |VEX_HWCAPS_X86_SSE2
                            |VEX_HWCAPS_X86_SSE3
                            |VEX_HWCAPS_X86_LZCNT)));
   return i;
}
/* Atomic compare-and-swap of sz bytes at addr. */
X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_ACAS;
   i->Xin.ACAS.addr = addr;
   i->Xin.ACAS.sz   = sz;
   vassert(sz == 4 || sz == 2 || sz == 1);
   return i;
}
/* Double-width (8-byte) atomic compare-and-swap at addr. */
X86Instr* X86Instr_DACAS ( X86AMode* addr ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_DACAS;
   i->Xin.DACAS.addr = addr;
   return i;
}
726
/* Unary FP op: dst = op(src). */
X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_FpUnary;
   i->Xin.FpUnary.op  = op;
   i->Xin.FpUnary.src = src;
   i->Xin.FpUnary.dst = dst;
   return i;
}
/* Binary FP op: dst = srcL op srcR. */
X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_FpBinary;
   i->Xin.FpBinary.op   = op;
   i->Xin.FpBinary.srcL = srcL;
   i->Xin.FpBinary.srcR = srcR;
   i->Xin.FpBinary.dst  = dst;
   return i;
}
/* FP load/store of 4, 8 or 10 bytes (10 = 80-bit extended). */
X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_FpLdSt;
   i->Xin.FpLdSt.isLoad = isLoad;
   i->Xin.FpLdSt.sz     = sz;
   i->Xin.FpLdSt.reg    = reg;
   i->Xin.FpLdSt.addr   = addr;
   vassert(sz == 4 || sz == 8 || sz == 10);
   return i;
}
/* FP load/store with integer conversion (fild/fist family); the
   integer in memory is 2, 4 or 8 bytes. */
X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
                             HReg reg, X86AMode* addr ) {
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_FpLdStI;
   i->Xin.FpLdStI.isLoad = isLoad;
   i->Xin.FpLdStI.sz     = sz;
   i->Xin.FpLdStI.reg    = reg;
   i->Xin.FpLdStI.addr   = addr;
   vassert(sz == 2 || sz == 4 || sz == 8);
   return i;
}
/* Narrow an F64 value to F32 precision. */
X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Fp64to32;
   i->Xin.Fp64to32.src = src;
   i->Xin.Fp64to32.dst = dst;
   return i;
}
/* Conditional FP move; must be genuinely conditional. */
X86Instr* X86Instr_FpCMov ( X86CondCode cond, HReg src, HReg dst ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_FpCMov;
   i->Xin.FpCMov.cond = cond;
   i->Xin.FpCMov.src  = src;
   i->Xin.FpCMov.dst  = dst;
   vassert(cond != Xcc_ALWAYS);
   return i;
}
/* Load the x87 control word from memory. */
X86Instr* X86Instr_FpLdCW ( X86AMode* addr ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_FpLdCW;
   i->Xin.FpLdCW.addr   = addr;
   return i;
}
/* Store the x87 status word to %ax (fstsw %ax); no operands. */
X86Instr* X86Instr_FpStSW_AX ( void ) {
   X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
   i->tag      = Xin_FpStSW_AX;
   return i;
}
/* FP compare of srcL against srcR, result flags into dst. */
X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_FpCmp;
   i->Xin.FpCmp.srcL = srcL;
   i->Xin.FpCmp.srcR = srcR;
   i->Xin.FpCmp.dst  = dst;
   return i;
}
800
/* Load a vector constant (described by the 16-bit literal con) into
   an xmm register. */
X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
   X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                 = Xin_SseConst;
   i->Xin.SseConst.con    = con;
   i->Xin.SseConst.dst    = dst;
   vassert(hregClass(dst) == HRcVec128);
   return i;
}
/* 128-bit SSE load or store. */
X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg reg, X86AMode* addr ) {
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_SseLdSt;
   i->Xin.SseLdSt.isLoad = isLoad;
   i->Xin.SseLdSt.reg    = reg;
   i->Xin.SseLdSt.addr   = addr;
   return i;
}
/* Load 4 or 8 bytes into the low lane of an xmm register, zeroing
   the upper lanes. */
X86Instr* X86Instr_SseLdzLO  ( Int sz, HReg reg, X86AMode* addr )
{
   X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
   i->tag                = Xin_SseLdzLO;
   i->Xin.SseLdzLO.sz    = toUChar(sz);
   i->Xin.SseLdzLO.reg   = reg;
   i->Xin.SseLdzLO.addr  = addr;
   vassert(sz == 4 || sz == 8);
   return i;
}
/* Packed 4xF32 op; moves are handled elsewhere, hence the vassert. */
X86Instr* X86Instr_Sse32Fx4 ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse32Fx4;
   i->Xin.Sse32Fx4.op  = op;
   i->Xin.Sse32Fx4.src = src;
   i->Xin.Sse32Fx4.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
/* Scalar F32 op on the low lane only. */
X86Instr* X86Instr_Sse32FLo ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse32FLo;
   i->Xin.Sse32FLo.op  = op;
   i->Xin.Sse32FLo.src = src;
   i->Xin.Sse32FLo.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
/* Packed 2xF64 op. */
X86Instr* X86Instr_Sse64Fx2 ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse64Fx2;
   i->Xin.Sse64Fx2.op  = op;
   i->Xin.Sse64Fx2.src = src;
   i->Xin.Sse64Fx2.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
/* Scalar F64 op on the low lane only. */
X86Instr* X86Instr_Sse64FLo ( X86SseOp op, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_Sse64FLo;
   i->Xin.Sse64FLo.op  = op;
   i->Xin.Sse64FLo.src = src;
   i->Xin.Sse64FLo.dst = dst;
   vassert(op != Xsse_MOV);
   return i;
}
/* Register-to-register SSE op (re = E/source, rg = G/destination). */
X86Instr* X86Instr_SseReRg ( X86SseOp op, HReg re, HReg rg ) {
   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
   i->tag             = Xin_SseReRg;
   i->Xin.SseReRg.op  = op;
   i->Xin.SseReRg.src = re;
   i->Xin.SseReRg.dst = rg;
   return i;
}
/* Conditional SSE move; must be genuinely conditional. */
X86Instr* X86Instr_SseCMov ( X86CondCode cond, HReg src, HReg dst ) {
   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
   i->tag              = Xin_SseCMov;
   i->Xin.SseCMov.cond = cond;
   i->Xin.SseCMov.src  = src;
   i->Xin.SseCMov.dst  = dst;
   vassert(cond != Xcc_ALWAYS);
   return i;
}
/* Shuffle (pshufd-style); order is the 8-bit lane-order immediate. */
X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst ) {
   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
   i->tag               = Xin_SseShuf;
   i->Xin.SseShuf.order = order;
   i->Xin.SseShuf.src   = src;
   i->Xin.SseShuf.dst   = dst;
   vassert(order >= 0 && order <= 0xFF);
   return i;
}
889
890void ppX86Instr ( X86Instr* i, Bool mode64 ) {
891   vassert(mode64 == False);
892   switch (i->tag) {
893      case Xin_Alu32R:
894         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32R.op));
895         ppX86RMI(i->Xin.Alu32R.src);
896         vex_printf(",");
897         ppHRegX86(i->Xin.Alu32R.dst);
898         return;
899      case Xin_Alu32M:
900         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32M.op));
901         ppX86RI(i->Xin.Alu32M.src);
902         vex_printf(",");
903         ppX86AMode(i->Xin.Alu32M.dst);
904         return;
905      case Xin_Sh32:
906         vex_printf("%sl ", showX86ShiftOp(i->Xin.Sh32.op));
907         if (i->Xin.Sh32.src == 0)
908           vex_printf("%%cl,");
909         else
910            vex_printf("$%d,", (Int)i->Xin.Sh32.src);
911         ppHRegX86(i->Xin.Sh32.dst);
912         return;
913      case Xin_Test32:
914         vex_printf("testl $%d,", (Int)i->Xin.Test32.imm32);
915         ppX86RM(i->Xin.Test32.dst);
916         return;
917      case Xin_Unary32:
918         vex_printf("%sl ", showX86UnaryOp(i->Xin.Unary32.op));
919         ppHRegX86(i->Xin.Unary32.dst);
920         return;
921      case Xin_Lea32:
922         vex_printf("leal ");
923         ppX86AMode(i->Xin.Lea32.am);
924         vex_printf(",");
925         ppHRegX86(i->Xin.Lea32.dst);
926         return;
927      case Xin_MulL:
928         vex_printf("%cmull ", i->Xin.MulL.syned ? 's' : 'u');
929         ppX86RM(i->Xin.MulL.src);
930         return;
931      case Xin_Div:
932         vex_printf("%cdivl ", i->Xin.Div.syned ? 's' : 'u');
933         ppX86RM(i->Xin.Div.src);
934         return;
935      case Xin_Sh3232:
936         vex_printf("%sdl ", showX86ShiftOp(i->Xin.Sh3232.op));
937         if (i->Xin.Sh3232.amt == 0)
938           vex_printf(" %%cl,");
939         else
940            vex_printf(" $%d,", (Int)i->Xin.Sh3232.amt);
941         ppHRegX86(i->Xin.Sh3232.src);
942         vex_printf(",");
943         ppHRegX86(i->Xin.Sh3232.dst);
944         return;
945      case Xin_Push:
946         vex_printf("pushl ");
947         ppX86RMI(i->Xin.Push.src);
948         return;
949      case Xin_Call:
950         vex_printf("call%s[%d] ",
951                    i->Xin.Call.cond==Xcc_ALWAYS
952                       ? "" : showX86CondCode(i->Xin.Call.cond),
953                    i->Xin.Call.regparms);
954         vex_printf("0x%x", i->Xin.Call.target);
955         break;
956      case Xin_Goto:
957         if (i->Xin.Goto.cond != Xcc_ALWAYS) {
958            vex_printf("if (%%eflags.%s) { ",
959                       showX86CondCode(i->Xin.Goto.cond));
960	 }
961         if (i->Xin.Goto.jk != Ijk_Boring
962             && i->Xin.Goto.jk != Ijk_Call
963             && i->Xin.Goto.jk != Ijk_Ret) {
964            vex_printf("movl $");
965            ppIRJumpKind(i->Xin.Goto.jk);
966            vex_printf(",%%ebp ; ");
967         }
968         vex_printf("movl ");
969         ppX86RI(i->Xin.Goto.dst);
970         vex_printf(",%%eax ; movl $dispatcher_addr,%%edx ; jmp *%%edx");
971         if (i->Xin.Goto.cond != Xcc_ALWAYS) {
972            vex_printf(" }");
973	 }
974         return;
975      case Xin_CMov32:
976         vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
977         ppX86RM(i->Xin.CMov32.src);
978         vex_printf(",");
979         ppHRegX86(i->Xin.CMov32.dst);
980         return;
981      case Xin_LoadEX:
982         vex_printf("mov%c%cl ",
983                    i->Xin.LoadEX.syned ? 's' : 'z',
984                    i->Xin.LoadEX.szSmall==1 ? 'b' : 'w');
985         ppX86AMode(i->Xin.LoadEX.src);
986         vex_printf(",");
987         ppHRegX86(i->Xin.LoadEX.dst);
988         return;
989      case Xin_Store:
990         vex_printf("mov%c ", i->Xin.Store.sz==1 ? 'b' : 'w');
991         ppHRegX86(i->Xin.Store.src);
992         vex_printf(",");
993         ppX86AMode(i->Xin.Store.dst);
994         return;
995      case Xin_Set32:
996         vex_printf("setl%s ", showX86CondCode(i->Xin.Set32.cond));
997         ppHRegX86(i->Xin.Set32.dst);
998         return;
999      case Xin_Bsfr32:
1000         vex_printf("bs%cl ", i->Xin.Bsfr32.isFwds ? 'f' : 'r');
1001         ppHRegX86(i->Xin.Bsfr32.src);
1002         vex_printf(",");
1003         ppHRegX86(i->Xin.Bsfr32.dst);
1004         return;
1005      case Xin_MFence:
1006         vex_printf("mfence(%s)",
1007                    LibVEX_ppVexHwCaps(VexArchX86,i->Xin.MFence.hwcaps));
1008         return;
1009      case Xin_ACAS:
1010         vex_printf("lock cmpxchg%c ",
1011                     i->Xin.ACAS.sz==1 ? 'b'
1012                                       : i->Xin.ACAS.sz==2 ? 'w' : 'l');
1013         vex_printf("{%%eax->%%ebx},");
1014         ppX86AMode(i->Xin.ACAS.addr);
1015         return;
1016      case Xin_DACAS:
1017         vex_printf("lock cmpxchg8b {%%edx:%%eax->%%ecx:%%ebx},");
1018         ppX86AMode(i->Xin.DACAS.addr);
1019         return;
1020      case Xin_FpUnary:
1021         vex_printf("g%sD ", showX86FpOp(i->Xin.FpUnary.op));
1022         ppHRegX86(i->Xin.FpUnary.src);
1023         vex_printf(",");
1024         ppHRegX86(i->Xin.FpUnary.dst);
1025         break;
1026      case Xin_FpBinary:
1027         vex_printf("g%sD ", showX86FpOp(i->Xin.FpBinary.op));
1028         ppHRegX86(i->Xin.FpBinary.srcL);
1029         vex_printf(",");
1030         ppHRegX86(i->Xin.FpBinary.srcR);
1031         vex_printf(",");
1032         ppHRegX86(i->Xin.FpBinary.dst);
1033         break;
1034      case Xin_FpLdSt:
1035         if (i->Xin.FpLdSt.isLoad) {
1036            vex_printf("gld%c " ,  i->Xin.FpLdSt.sz==10 ? 'T'
1037                                   : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1038            ppX86AMode(i->Xin.FpLdSt.addr);
1039            vex_printf(", ");
1040            ppHRegX86(i->Xin.FpLdSt.reg);
1041         } else {
1042            vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
1043                                  : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1044            ppHRegX86(i->Xin.FpLdSt.reg);
1045            vex_printf(", ");
1046            ppX86AMode(i->Xin.FpLdSt.addr);
1047         }
1048         return;
1049      case Xin_FpLdStI:
1050         if (i->Xin.FpLdStI.isLoad) {
1051            vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1052                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1053            ppX86AMode(i->Xin.FpLdStI.addr);
1054            vex_printf(", ");
1055            ppHRegX86(i->Xin.FpLdStI.reg);
1056         } else {
1057            vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1058                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1059            ppHRegX86(i->Xin.FpLdStI.reg);
1060            vex_printf(", ");
1061            ppX86AMode(i->Xin.FpLdStI.addr);
1062         }
1063         return;
1064      case Xin_Fp64to32:
1065         vex_printf("gdtof ");
1066         ppHRegX86(i->Xin.Fp64to32.src);
1067         vex_printf(",");
1068         ppHRegX86(i->Xin.Fp64to32.dst);
1069         return;
1070      case Xin_FpCMov:
1071         vex_printf("gcmov%s ", showX86CondCode(i->Xin.FpCMov.cond));
1072         ppHRegX86(i->Xin.FpCMov.src);
1073         vex_printf(",");
1074         ppHRegX86(i->Xin.FpCMov.dst);
1075         return;
1076      case Xin_FpLdCW:
1077         vex_printf("fldcw ");
1078         ppX86AMode(i->Xin.FpLdCW.addr);
1079         return;
1080      case Xin_FpStSW_AX:
1081         vex_printf("fstsw %%ax");
1082         return;
1083      case Xin_FpCmp:
1084         vex_printf("gcmp ");
1085         ppHRegX86(i->Xin.FpCmp.srcL);
1086         vex_printf(",");
1087         ppHRegX86(i->Xin.FpCmp.srcR);
1088         vex_printf(",");
1089         ppHRegX86(i->Xin.FpCmp.dst);
1090         break;
1091      case Xin_SseConst:
1092         vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
1093         ppHRegX86(i->Xin.SseConst.dst);
1094         break;
1095      case Xin_SseLdSt:
1096         vex_printf("movups ");
1097         if (i->Xin.SseLdSt.isLoad) {
1098            ppX86AMode(i->Xin.SseLdSt.addr);
1099            vex_printf(",");
1100            ppHRegX86(i->Xin.SseLdSt.reg);
1101         } else {
1102            ppHRegX86(i->Xin.SseLdSt.reg);
1103            vex_printf(",");
1104            ppX86AMode(i->Xin.SseLdSt.addr);
1105         }
1106         return;
1107      case Xin_SseLdzLO:
1108         vex_printf("movs%s ", i->Xin.SseLdzLO.sz==4 ? "s" : "d");
1109         ppX86AMode(i->Xin.SseLdzLO.addr);
1110         vex_printf(",");
1111         ppHRegX86(i->Xin.SseLdzLO.reg);
1112         return;
1113      case Xin_Sse32Fx4:
1114         vex_printf("%sps ", showX86SseOp(i->Xin.Sse32Fx4.op));
1115         ppHRegX86(i->Xin.Sse32Fx4.src);
1116         vex_printf(",");
1117         ppHRegX86(i->Xin.Sse32Fx4.dst);
1118         return;
1119      case Xin_Sse32FLo:
1120         vex_printf("%sss ", showX86SseOp(i->Xin.Sse32FLo.op));
1121         ppHRegX86(i->Xin.Sse32FLo.src);
1122         vex_printf(",");
1123         ppHRegX86(i->Xin.Sse32FLo.dst);
1124         return;
1125      case Xin_Sse64Fx2:
1126         vex_printf("%spd ", showX86SseOp(i->Xin.Sse64Fx2.op));
1127         ppHRegX86(i->Xin.Sse64Fx2.src);
1128         vex_printf(",");
1129         ppHRegX86(i->Xin.Sse64Fx2.dst);
1130         return;
1131      case Xin_Sse64FLo:
1132         vex_printf("%ssd ", showX86SseOp(i->Xin.Sse64FLo.op));
1133         ppHRegX86(i->Xin.Sse64FLo.src);
1134         vex_printf(",");
1135         ppHRegX86(i->Xin.Sse64FLo.dst);
1136         return;
1137      case Xin_SseReRg:
1138         vex_printf("%s ", showX86SseOp(i->Xin.SseReRg.op));
1139         ppHRegX86(i->Xin.SseReRg.src);
1140         vex_printf(",");
1141         ppHRegX86(i->Xin.SseReRg.dst);
1142         return;
1143      case Xin_SseCMov:
1144         vex_printf("cmov%s ", showX86CondCode(i->Xin.SseCMov.cond));
1145         ppHRegX86(i->Xin.SseCMov.src);
1146         vex_printf(",");
1147         ppHRegX86(i->Xin.SseCMov.dst);
1148         return;
1149      case Xin_SseShuf:
1150         vex_printf("pshufd $0x%x,", i->Xin.SseShuf.order);
1151         ppHRegX86(i->Xin.SseShuf.src);
1152         vex_printf(",");
1153         ppHRegX86(i->Xin.SseShuf.dst);
1154         return;
1155
1156      default:
1157         vpanic("ppX86Instr");
1158   }
1159}
1160
1161/* --------- Helpers for register allocation. --------- */
1162
/* Report to the register allocator how instruction 'i' uses
   registers: every register mentioned is added to *u as read,
   written, or modified (read-and-written).  Real registers that the
   instruction uses implicitly (e.g. %eax/%edx for MulL/Div, %ecx for
   variable shifts, the caller-saved set for Call) are reported too.
   mode64 must be False (this is the 32-bit-only backend). */
void getRegUsage_X86Instr (HRegUsage* u, X86Instr* i, Bool mode64)
{
   Bool unary;
   vassert(mode64 == False);
   initHRegUsage(u);
   switch (i->tag) {
      case Xin_Alu32R:
         addRegUsage_X86RMI(u, i->Xin.Alu32R.src);
         /* MOV only writes dst; CMP only reads it; all other ALU
            ops read-modify-write dst. */
         if (i->Xin.Alu32R.op == Xalu_MOV) {
            addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst);
            return;
         }
         if (i->Xin.Alu32R.op == Xalu_CMP) {
            addHRegUse(u, HRmRead, i->Xin.Alu32R.dst);
            return;
         }
         addHRegUse(u, HRmModify, i->Xin.Alu32R.dst);
         return;
      case Xin_Alu32M:
         addRegUsage_X86RI(u, i->Xin.Alu32M.src);
         addRegUsage_X86AMode(u, i->Xin.Alu32M.dst);
         return;
      case Xin_Sh32:
         addHRegUse(u, HRmModify, i->Xin.Sh32.dst);
         /* shift amount 0 means "amount is in %cl" */
         if (i->Xin.Sh32.src == 0)
            addHRegUse(u, HRmRead, hregX86_ECX());
         return;
      case Xin_Test32:
         addRegUsage_X86RM(u, i->Xin.Test32.dst, HRmRead);
         return;
      case Xin_Unary32:
         addHRegUse(u, HRmModify, i->Xin.Unary32.dst);
         return;
      case Xin_Lea32:
         addRegUsage_X86AMode(u, i->Xin.Lea32.am);
         addHRegUse(u, HRmWrite, i->Xin.Lea32.dst);
         return;
      case Xin_MulL:
         /* mull: reads src and %eax, writes %edx:%eax. */
         addRegUsage_X86RM(u, i->Xin.MulL.src, HRmRead);
         addHRegUse(u, HRmModify, hregX86_EAX());
         addHRegUse(u, HRmWrite, hregX86_EDX());
         return;
      case Xin_Div:
         /* divl: divides %edx:%eax by src, quotient->%eax,
            remainder->%edx. */
         addRegUsage_X86RM(u, i->Xin.Div.src, HRmRead);
         addHRegUse(u, HRmModify, hregX86_EAX());
         addHRegUse(u, HRmModify, hregX86_EDX());
         return;
      case Xin_Sh3232:
         addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
         addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
         /* amount 0 means "amount is in %cl" */
         if (i->Xin.Sh3232.amt == 0)
            addHRegUse(u, HRmRead, hregX86_ECX());
         return;
      case Xin_Push:
         addRegUsage_X86RMI(u, i->Xin.Push.src);
         addHRegUse(u, HRmModify, hregX86_ESP());
         return;
      case Xin_Call:
         /* This is a bit subtle. */
         /* First off, claim it trashes all the caller-saved regs
            which fall within the register allocator's jurisdiction.
            These I believe to be %eax %ecx %edx and all the xmm
            registers. */
         addHRegUse(u, HRmWrite, hregX86_EAX());
         addHRegUse(u, HRmWrite, hregX86_ECX());
         addHRegUse(u, HRmWrite, hregX86_EDX());
         addHRegUse(u, HRmWrite, hregX86_XMM0());
         addHRegUse(u, HRmWrite, hregX86_XMM1());
         addHRegUse(u, HRmWrite, hregX86_XMM2());
         addHRegUse(u, HRmWrite, hregX86_XMM3());
         addHRegUse(u, HRmWrite, hregX86_XMM4());
         addHRegUse(u, HRmWrite, hregX86_XMM5());
         addHRegUse(u, HRmWrite, hregX86_XMM6());
         addHRegUse(u, HRmWrite, hregX86_XMM7());
         /* Now we have to state any parameter-carrying registers
            which might be read.  This depends on the regparmness. */
         switch (i->Xin.Call.regparms) {
            case 3: addHRegUse(u, HRmRead, hregX86_ECX()); /*fallthru*/
            case 2: addHRegUse(u, HRmRead, hregX86_EDX()); /*fallthru*/
            case 1: addHRegUse(u, HRmRead, hregX86_EAX()); break;
            case 0: break;
            default: vpanic("getRegUsage_X86Instr:Call:regparms");
         }
         /* Finally, there is the issue that the insn trashes a
            register because the literal target address has to be
            loaded into a register.  Fortunately, for the 0/1/2
            regparm case, we can use EAX, EDX and ECX respectively, so
            this does not cause any further damage.  For the 3-regparm
            case, we'll have to choose another register arbitrarily --
            since A, D and C are used for parameters -- and so we might
            as well choose EDI. */
         if (i->Xin.Call.regparms == 3)
            addHRegUse(u, HRmWrite, hregX86_EDI());
         /* Upshot of this is that the assembler really must observe
            the here-stated convention of which register to use as an
            address temporary, depending on the regparmness: 0==EAX,
            1==EDX, 2==ECX, 3==EDI. */
         return;
      case Xin_Goto:
         addRegUsage_X86RI(u, i->Xin.Goto.dst);
         addHRegUse(u, HRmWrite, hregX86_EAX()); /* used for next guest addr */
         addHRegUse(u, HRmWrite, hregX86_EDX()); /* used for dispatcher addr */
         if (i->Xin.Goto.jk != Ijk_Boring
             && i->Xin.Goto.jk != Ijk_Call
             && i->Xin.Goto.jk != Ijk_Ret)
            /* note, this is irrelevant since ebp is not actually
               available to the allocator.  But still .. */
            addHRegUse(u, HRmWrite, hregX86_EBP());
         return;
      case Xin_CMov32:
         addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
         addHRegUse(u, HRmModify, i->Xin.CMov32.dst);
         return;
      case Xin_LoadEX:
         addRegUsage_X86AMode(u, i->Xin.LoadEX.src);
         addHRegUse(u, HRmWrite, i->Xin.LoadEX.dst);
         return;
      case Xin_Store:
         addHRegUse(u, HRmRead, i->Xin.Store.src);
         addRegUsage_X86AMode(u, i->Xin.Store.dst);
         return;
      case Xin_Set32:
         addHRegUse(u, HRmWrite, i->Xin.Set32.dst);
         return;
      case Xin_Bsfr32:
         addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
         addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
         return;
      case Xin_MFence:
         return;
      case Xin_ACAS:
         /* lock cmpxchg: %eax holds expected/old value, %ebx the new
            one (per the convention shown by ppX86Instr). */
         addRegUsage_X86AMode(u, i->Xin.ACAS.addr);
         addHRegUse(u, HRmRead, hregX86_EBX());
         addHRegUse(u, HRmModify, hregX86_EAX());
         return;
      case Xin_DACAS:
         /* lock cmpxchg8b: %edx:%eax expected, %ecx:%ebx new. */
         addRegUsage_X86AMode(u, i->Xin.DACAS.addr);
         addHRegUse(u, HRmRead, hregX86_ECX());
         addHRegUse(u, HRmRead, hregX86_EBX());
         addHRegUse(u, HRmModify, hregX86_EDX());
         addHRegUse(u, HRmModify, hregX86_EAX());
         return;
      case Xin_FpUnary:
         addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
         addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
         return;
      case Xin_FpBinary:
         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
         addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
         return;
      case Xin_FpLdSt:
         addRegUsage_X86AMode(u, i->Xin.FpLdSt.addr);
         addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
                       i->Xin.FpLdSt.reg);
         return;
      case Xin_FpLdStI:
         addRegUsage_X86AMode(u, i->Xin.FpLdStI.addr);
         addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
                       i->Xin.FpLdStI.reg);
         return;
      case Xin_Fp64to32:
         addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
         addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
         return;
      case Xin_FpCMov:
         addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
         addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
         return;
      case Xin_FpLdCW:
         addRegUsage_X86AMode(u, i->Xin.FpLdCW.addr);
         return;
      case Xin_FpStSW_AX:
         addHRegUse(u, HRmWrite, hregX86_EAX());
         return;
      case Xin_FpCmp:
         /* the comparison result is transferred via %eax (fstsw) */
         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcL);
         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcR);
         addHRegUse(u, HRmWrite, i->Xin.FpCmp.dst);
         addHRegUse(u, HRmWrite, hregX86_EAX());
         return;
      case Xin_SseLdSt:
         addRegUsage_X86AMode(u, i->Xin.SseLdSt.addr);
         addHRegUse(u, i->Xin.SseLdSt.isLoad ? HRmWrite : HRmRead,
                       i->Xin.SseLdSt.reg);
         return;
      case Xin_SseLdzLO:
         addRegUsage_X86AMode(u, i->Xin.SseLdzLO.addr);
         addHRegUse(u, HRmWrite, i->Xin.SseLdzLO.reg);
         return;
      case Xin_SseConst:
         addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
         return;
      /* For the SSE FP cases below: unary ops (recip, rsqrt, sqrt)
         overwrite dst outright, whereas binary ops combine src into
         dst, i.e. read-modify-write it. */
      case Xin_Sse32Fx4:
         vassert(i->Xin.Sse32Fx4.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse32Fx4.op == Xsse_RCPF
                         || i->Xin.Sse32Fx4.op == Xsse_RSQRTF
                         || i->Xin.Sse32Fx4.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse32Fx4.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse32Fx4.dst);
         return;
      case Xin_Sse32FLo:
         vassert(i->Xin.Sse32FLo.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse32FLo.op == Xsse_RCPF
                         || i->Xin.Sse32FLo.op == Xsse_RSQRTF
                         || i->Xin.Sse32FLo.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse32FLo.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse32FLo.dst);
         return;
      case Xin_Sse64Fx2:
         vassert(i->Xin.Sse64Fx2.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse64Fx2.op == Xsse_RCPF
                         || i->Xin.Sse64Fx2.op == Xsse_RSQRTF
                         || i->Xin.Sse64Fx2.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse64Fx2.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse64Fx2.dst);
         return;
      case Xin_Sse64FLo:
         vassert(i->Xin.Sse64FLo.op != Xsse_MOV);
         unary = toBool( i->Xin.Sse64FLo.op == Xsse_RCPF
                         || i->Xin.Sse64FLo.op == Xsse_RSQRTF
                         || i->Xin.Sse64FLo.op == Xsse_SQRTF );
         addHRegUse(u, HRmRead, i->Xin.Sse64FLo.src);
         addHRegUse(u, unary ? HRmWrite : HRmModify,
                       i->Xin.Sse64FLo.dst);
         return;
      case Xin_SseReRg:
         if (i->Xin.SseReRg.op == Xsse_XOR
             && i->Xin.SseReRg.src == i->Xin.SseReRg.dst) {
            /* reg-alloc needs to understand 'xor r,r' as a write of r */
            /* (as opposed to a rite of passage :-) */
            addHRegUse(u, HRmWrite, i->Xin.SseReRg.dst);
         } else {
            addHRegUse(u, HRmRead, i->Xin.SseReRg.src);
            addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV
                             ? HRmWrite : HRmModify,
                          i->Xin.SseReRg.dst);
         }
         return;
      case Xin_SseCMov:
         addHRegUse(u, HRmRead,   i->Xin.SseCMov.src);
         addHRegUse(u, HRmModify, i->Xin.SseCMov.dst);
         return;
      case Xin_SseShuf:
         addHRegUse(u, HRmRead,  i->Xin.SseShuf.src);
         addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
         return;
      default:
         ppX86Instr(i, False);
         vpanic("getRegUsage_X86Instr");
   }
}
1418
1419/* local helper */
1420static void mapReg( HRegRemap* m, HReg* r )
1421{
1422   *r = lookupHRegRemap(m, *r);
1423}
1424
1425void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 )
1426{
1427   vassert(mode64 == False);
1428   switch (i->tag) {
1429      case Xin_Alu32R:
1430         mapRegs_X86RMI(m, i->Xin.Alu32R.src);
1431         mapReg(m, &i->Xin.Alu32R.dst);
1432         return;
1433      case Xin_Alu32M:
1434         mapRegs_X86RI(m, i->Xin.Alu32M.src);
1435         mapRegs_X86AMode(m, i->Xin.Alu32M.dst);
1436         return;
1437      case Xin_Sh32:
1438         mapReg(m, &i->Xin.Sh32.dst);
1439         return;
1440      case Xin_Test32:
1441         mapRegs_X86RM(m, i->Xin.Test32.dst);
1442         return;
1443      case Xin_Unary32:
1444         mapReg(m, &i->Xin.Unary32.dst);
1445         return;
1446      case Xin_Lea32:
1447         mapRegs_X86AMode(m, i->Xin.Lea32.am);
1448         mapReg(m, &i->Xin.Lea32.dst);
1449         return;
1450      case Xin_MulL:
1451         mapRegs_X86RM(m, i->Xin.MulL.src);
1452         return;
1453      case Xin_Div:
1454         mapRegs_X86RM(m, i->Xin.Div.src);
1455         return;
1456      case Xin_Sh3232:
1457         mapReg(m, &i->Xin.Sh3232.src);
1458         mapReg(m, &i->Xin.Sh3232.dst);
1459         return;
1460      case Xin_Push:
1461         mapRegs_X86RMI(m, i->Xin.Push.src);
1462         return;
1463      case Xin_Call:
1464         return;
1465      case Xin_Goto:
1466         mapRegs_X86RI(m, i->Xin.Goto.dst);
1467         return;
1468      case Xin_CMov32:
1469         mapRegs_X86RM(m, i->Xin.CMov32.src);
1470         mapReg(m, &i->Xin.CMov32.dst);
1471         return;
1472      case Xin_LoadEX:
1473         mapRegs_X86AMode(m, i->Xin.LoadEX.src);
1474         mapReg(m, &i->Xin.LoadEX.dst);
1475         return;
1476      case Xin_Store:
1477         mapReg(m, &i->Xin.Store.src);
1478         mapRegs_X86AMode(m, i->Xin.Store.dst);
1479         return;
1480      case Xin_Set32:
1481         mapReg(m, &i->Xin.Set32.dst);
1482         return;
1483      case Xin_Bsfr32:
1484         mapReg(m, &i->Xin.Bsfr32.src);
1485         mapReg(m, &i->Xin.Bsfr32.dst);
1486         return;
1487      case Xin_MFence:
1488         return;
1489      case Xin_ACAS:
1490         mapRegs_X86AMode(m, i->Xin.ACAS.addr);
1491         return;
1492      case Xin_DACAS:
1493         mapRegs_X86AMode(m, i->Xin.DACAS.addr);
1494         return;
1495      case Xin_FpUnary:
1496         mapReg(m, &i->Xin.FpUnary.src);
1497         mapReg(m, &i->Xin.FpUnary.dst);
1498         return;
1499      case Xin_FpBinary:
1500         mapReg(m, &i->Xin.FpBinary.srcL);
1501         mapReg(m, &i->Xin.FpBinary.srcR);
1502         mapReg(m, &i->Xin.FpBinary.dst);
1503         return;
1504      case Xin_FpLdSt:
1505         mapRegs_X86AMode(m, i->Xin.FpLdSt.addr);
1506         mapReg(m, &i->Xin.FpLdSt.reg);
1507         return;
1508      case Xin_FpLdStI:
1509         mapRegs_X86AMode(m, i->Xin.FpLdStI.addr);
1510         mapReg(m, &i->Xin.FpLdStI.reg);
1511         return;
1512      case Xin_Fp64to32:
1513         mapReg(m, &i->Xin.Fp64to32.src);
1514         mapReg(m, &i->Xin.Fp64to32.dst);
1515         return;
1516      case Xin_FpCMov:
1517         mapReg(m, &i->Xin.FpCMov.src);
1518         mapReg(m, &i->Xin.FpCMov.dst);
1519         return;
1520      case Xin_FpLdCW:
1521         mapRegs_X86AMode(m, i->Xin.FpLdCW.addr);
1522         return;
1523      case Xin_FpStSW_AX:
1524         return;
1525      case Xin_FpCmp:
1526         mapReg(m, &i->Xin.FpCmp.srcL);
1527         mapReg(m, &i->Xin.FpCmp.srcR);
1528         mapReg(m, &i->Xin.FpCmp.dst);
1529         return;
1530      case Xin_SseConst:
1531         mapReg(m, &i->Xin.SseConst.dst);
1532         return;
1533      case Xin_SseLdSt:
1534         mapReg(m, &i->Xin.SseLdSt.reg);
1535         mapRegs_X86AMode(m, i->Xin.SseLdSt.addr);
1536         break;
1537      case Xin_SseLdzLO:
1538         mapReg(m, &i->Xin.SseLdzLO.reg);
1539         mapRegs_X86AMode(m, i->Xin.SseLdzLO.addr);
1540         break;
1541      case Xin_Sse32Fx4:
1542         mapReg(m, &i->Xin.Sse32Fx4.src);
1543         mapReg(m, &i->Xin.Sse32Fx4.dst);
1544         return;
1545      case Xin_Sse32FLo:
1546         mapReg(m, &i->Xin.Sse32FLo.src);
1547         mapReg(m, &i->Xin.Sse32FLo.dst);
1548         return;
1549      case Xin_Sse64Fx2:
1550         mapReg(m, &i->Xin.Sse64Fx2.src);
1551         mapReg(m, &i->Xin.Sse64Fx2.dst);
1552         return;
1553      case Xin_Sse64FLo:
1554         mapReg(m, &i->Xin.Sse64FLo.src);
1555         mapReg(m, &i->Xin.Sse64FLo.dst);
1556         return;
1557      case Xin_SseReRg:
1558         mapReg(m, &i->Xin.SseReRg.src);
1559         mapReg(m, &i->Xin.SseReRg.dst);
1560         return;
1561      case Xin_SseCMov:
1562         mapReg(m, &i->Xin.SseCMov.src);
1563         mapReg(m, &i->Xin.SseCMov.dst);
1564         return;
1565      case Xin_SseShuf:
1566         mapReg(m, &i->Xin.SseShuf.src);
1567         mapReg(m, &i->Xin.SseShuf.dst);
1568         return;
1569      default:
1570         ppX86Instr(i, mode64);
1571         vpanic("mapRegs_X86Instr");
1572   }
1573}
1574
1575/* Figure out if i represents a reg-reg move, and if so assign the
1576   source and destination to *src and *dst.  If in doubt say No.  Used
1577   by the register allocator to do move coalescing.
1578*/
1579Bool isMove_X86Instr ( X86Instr* i, HReg* src, HReg* dst )
1580{
1581   /* Moves between integer regs */
1582   if (i->tag == Xin_Alu32R) {
1583      if (i->Xin.Alu32R.op != Xalu_MOV)
1584         return False;
1585      if (i->Xin.Alu32R.src->tag != Xrmi_Reg)
1586         return False;
1587      *src = i->Xin.Alu32R.src->Xrmi.Reg.reg;
1588      *dst = i->Xin.Alu32R.dst;
1589      return True;
1590   }
1591   /* Moves between FP regs */
1592   if (i->tag == Xin_FpUnary) {
1593      if (i->Xin.FpUnary.op != Xfp_MOV)
1594         return False;
1595      *src = i->Xin.FpUnary.src;
1596      *dst = i->Xin.FpUnary.dst;
1597      return True;
1598   }
1599   if (i->tag == Xin_SseReRg) {
1600      if (i->Xin.SseReRg.op != Xsse_MOV)
1601         return False;
1602      *src = i->Xin.SseReRg.src;
1603      *dst = i->Xin.SseReRg.dst;
1604      return True;
1605   }
1606   return False;
1607}
1608
1609
1610/* Generate x86 spill/reload instructions under the direction of the
1611   register allocator.  Note it's critical these don't write the
1612   condition codes. */
1613
1614void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1615                    HReg rreg, Int offsetB, Bool mode64 )
1616{
1617   X86AMode* am;
1618   vassert(offsetB >= 0);
1619   vassert(!hregIsVirtual(rreg));
1620   vassert(mode64 == False);
1621   *i1 = *i2 = NULL;
1622   am = X86AMode_IR(offsetB, hregX86_EBP());
1623   switch (hregClass(rreg)) {
1624      case HRcInt32:
1625         *i1 = X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
1626         return;
1627      case HRcFlt64:
1628         *i1 = X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
1629         return;
1630      case HRcVec128:
1631         *i1 = X86Instr_SseLdSt ( False/*store*/, rreg, am );
1632         return;
1633      default:
1634         ppHRegClass(hregClass(rreg));
1635         vpanic("genSpill_X86: unimplemented regclass");
1636   }
1637}
1638
1639void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1640                     HReg rreg, Int offsetB, Bool mode64 )
1641{
1642   X86AMode* am;
1643   vassert(offsetB >= 0);
1644   vassert(!hregIsVirtual(rreg));
1645   vassert(mode64 == False);
1646   *i1 = *i2 = NULL;
1647   am = X86AMode_IR(offsetB, hregX86_EBP());
1648   switch (hregClass(rreg)) {
1649      case HRcInt32:
1650         *i1 = X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
1651         return;
1652      case HRcFlt64:
1653         *i1 = X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
1654         return;
1655      case HRcVec128:
1656         *i1 = X86Instr_SseLdSt ( True/*load*/, rreg, am );
1657         return;
1658      default:
1659         ppHRegClass(hregClass(rreg));
1660         vpanic("genReload_X86: unimplemented regclass");
1661   }
1662}
1663
1664/* The given instruction reads the specified vreg exactly once, and
1665   that vreg is currently located at the given spill offset.  If
1666   possible, return a variant of the instruction to one which instead
1667   references the spill slot directly. */
1668
1669X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
1670{
1671   vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
1672
1673   /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
1674      Convert to: src=RMI_Mem, dst=Reg
1675   */
1676   if (i->tag == Xin_Alu32R
1677       && (i->Xin.Alu32R.op == Xalu_MOV || i->Xin.Alu32R.op == Xalu_OR
1678           || i->Xin.Alu32R.op == Xalu_XOR)
1679       && i->Xin.Alu32R.src->tag == Xrmi_Reg
1680       && i->Xin.Alu32R.src->Xrmi.Reg.reg == vreg) {
1681      vassert(i->Xin.Alu32R.dst != vreg);
1682      return X86Instr_Alu32R(
1683                i->Xin.Alu32R.op,
1684                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP())),
1685                i->Xin.Alu32R.dst
1686             );
1687   }
1688
1689   /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
1690      Convert to: src=RI_Imm, dst=Mem
1691   */
1692   if (i->tag == Xin_Alu32R
1693       && (i->Xin.Alu32R.op == Xalu_CMP)
1694       && i->Xin.Alu32R.src->tag == Xrmi_Imm
1695       && i->Xin.Alu32R.dst == vreg) {
1696      return X86Instr_Alu32M(
1697                i->Xin.Alu32R.op,
1698		X86RI_Imm( i->Xin.Alu32R.src->Xrmi.Imm.imm32 ),
1699                X86AMode_IR( spill_off, hregX86_EBP())
1700             );
1701   }
1702
1703   /* Deal with form: Push(RMI_Reg)
1704      Convert to: Push(RMI_Mem)
1705   */
1706   if (i->tag == Xin_Push
1707       && i->Xin.Push.src->tag == Xrmi_Reg
1708       && i->Xin.Push.src->Xrmi.Reg.reg == vreg) {
1709      return X86Instr_Push(
1710                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP()))
1711             );
1712   }
1713
1714   /* Deal with form: CMov32(src=RM_Reg, dst) where vreg == src
1715      Convert to CMov32(RM_Mem, dst) */
1716   if (i->tag == Xin_CMov32
1717       && i->Xin.CMov32.src->tag == Xrm_Reg
1718       && i->Xin.CMov32.src->Xrm.Reg.reg == vreg) {
1719      vassert(i->Xin.CMov32.dst != vreg);
1720      return X86Instr_CMov32(
1721                i->Xin.CMov32.cond,
1722                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() )),
1723                i->Xin.CMov32.dst
1724             );
1725   }
1726
1727   /* Deal with form: Test32(imm,RM_Reg vreg) -> Test32(imm,amode) */
1728   if (i->tag == Xin_Test32
1729       && i->Xin.Test32.dst->tag == Xrm_Reg
1730       && i->Xin.Test32.dst->Xrm.Reg.reg == vreg) {
1731      return X86Instr_Test32(
1732                i->Xin.Test32.imm32,
1733                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() ) )
1734             );
1735   }
1736
1737   return NULL;
1738}
1739
1740
1741/* --------- The x86 assembler (bleh.) --------- */
1742
1743static UChar iregNo ( HReg r )
1744{
1745   UInt n;
1746   vassert(hregClass(r) == HRcInt32);
1747   vassert(!hregIsVirtual(r));
1748   n = hregNumber(r);
1749   vassert(n <= 7);
1750   return toUChar(n);
1751}
1752
1753static UInt fregNo ( HReg r )
1754{
1755   UInt n;
1756   vassert(hregClass(r) == HRcFlt64);
1757   vassert(!hregIsVirtual(r));
1758   n = hregNumber(r);
1759   vassert(n <= 5);
1760   return n;
1761}
1762
1763static UInt vregNo ( HReg r )
1764{
1765   UInt n;
1766   vassert(hregClass(r) == HRcVec128);
1767   vassert(!hregIsVirtual(r));
1768   n = hregNumber(r);
1769   vassert(n <= 7);
1770   return n;
1771}
1772
1773static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
1774{
1775   return toUChar( ((mod & 3) << 6)
1776                   | ((reg & 7) << 3)
1777                   | (regmem & 7) );
1778}
1779
1780static UChar mkSIB ( Int shift, Int regindex, Int regbase )
1781{
1782   return toUChar( ((shift & 3) << 6)
1783                   | ((regindex & 7) << 3)
1784                   | (regbase & 7) );
1785}
1786
1787static UChar* emit32 ( UChar* p, UInt w32 )
1788{
1789   *p++ = toUChar( w32        & 0x000000FF);
1790   *p++ = toUChar((w32 >>  8) & 0x000000FF);
1791   *p++ = toUChar((w32 >> 16) & 0x000000FF);
1792   *p++ = toUChar((w32 >> 24) & 0x000000FF);
1793   return p;
1794}
1795
1796/* Does a sign-extend of the lowest 8 bits give
1797   the original number? */
1798static Bool fits8bits ( UInt w32 )
1799{
1800   Int i32 = (Int)w32;
1801   return toBool(i32 == ((i32 << 24) >> 24));
1802}
1803
1804
1805/* Forming mod-reg-rm bytes and scale-index-base bytes.
1806
1807     greg,  0(ereg)    |  ereg != ESP && ereg != EBP
1808                       =  00 greg ereg
1809
1810     greg,  d8(ereg)   |  ereg != ESP
1811                       =  01 greg ereg, d8
1812
1813     greg,  d32(ereg)  |  ereg != ESP
1814                       =  10 greg ereg, d32
1815
1816     greg,  d8(%esp)   =  01 greg 100, 0x24, d8
1817
1818     -----------------------------------------------
1819
1820     greg,  d8(base,index,scale)
1821               |  index != ESP
1822               =  01 greg 100, scale index base, d8
1823
1824     greg,  d32(base,index,scale)
1825               |  index != ESP
1826               =  10 greg 100, scale index base, d32
1827*/
1828static UChar* doAMode_M ( UChar* p, HReg greg, X86AMode* am )
1829{
1830   if (am->tag == Xam_IR) {
1831      if (am->Xam.IR.imm == 0
1832          && am->Xam.IR.reg != hregX86_ESP()
1833          && am->Xam.IR.reg != hregX86_EBP() ) {
1834         *p++ = mkModRegRM(0, iregNo(greg), iregNo(am->Xam.IR.reg));
1835         return p;
1836      }
1837      if (fits8bits(am->Xam.IR.imm)
1838          && am->Xam.IR.reg != hregX86_ESP()) {
1839         *p++ = mkModRegRM(1, iregNo(greg), iregNo(am->Xam.IR.reg));
1840         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1841         return p;
1842      }
1843      if (am->Xam.IR.reg != hregX86_ESP()) {
1844         *p++ = mkModRegRM(2, iregNo(greg), iregNo(am->Xam.IR.reg));
1845         p = emit32(p, am->Xam.IR.imm);
1846         return p;
1847      }
1848      if (am->Xam.IR.reg == hregX86_ESP()
1849          && fits8bits(am->Xam.IR.imm)) {
1850 	 *p++ = mkModRegRM(1, iregNo(greg), 4);
1851         *p++ = 0x24;
1852         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1853         return p;
1854      }
1855      ppX86AMode(am);
1856      vpanic("doAMode_M: can't emit amode IR");
1857      /*NOTREACHED*/
1858   }
1859   if (am->tag == Xam_IRRS) {
1860      if (fits8bits(am->Xam.IRRS.imm)
1861          && am->Xam.IRRS.index != hregX86_ESP()) {
1862         *p++ = mkModRegRM(1, iregNo(greg), 4);
1863         *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
1864                                          am->Xam.IRRS.base);
1865         *p++ = toUChar(am->Xam.IRRS.imm & 0xFF);
1866         return p;
1867      }
1868      if (am->Xam.IRRS.index != hregX86_ESP()) {
1869         *p++ = mkModRegRM(2, iregNo(greg), 4);
1870         *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
1871                                          am->Xam.IRRS.base);
1872         p = emit32(p, am->Xam.IRRS.imm);
1873         return p;
1874      }
1875      ppX86AMode(am);
1876      vpanic("doAMode_M: can't emit amode IRRS");
1877      /*NOTREACHED*/
1878   }
1879   vpanic("doAMode_M: unknown amode");
1880   /*NOTREACHED*/
1881}
1882
1883
1884/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
1885static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
1886{
1887   *p++ = mkModRegRM(3, iregNo(greg), iregNo(ereg));
1888   return p;
1889}
1890
1891
1892/* Emit ffree %st(7) */
1893static UChar* do_ffree_st7 ( UChar* p )
1894{
1895   *p++ = 0xDD;
1896   *p++ = 0xC7;
1897   return p;
1898}
1899
1900/* Emit fstp %st(i), 1 <= i <= 7 */
1901static UChar* do_fstp_st ( UChar* p, Int i )
1902{
1903   vassert(1 <= i && i <= 7);
1904   *p++ = 0xDD;
1905   *p++ = toUChar(0xD8+i);
1906   return p;
1907}
1908
1909/* Emit fld %st(i), 0 <= i <= 6 */
1910static UChar* do_fld_st ( UChar* p, Int i )
1911{
1912   vassert(0 <= i && i <= 6);
1913   *p++ = 0xD9;
1914   *p++ = toUChar(0xC0+i);
1915   return p;
1916}
1917
1918/* Emit f<op> %st(0) */
1919static UChar* do_fop1_st ( UChar* p, X86FpOp op )
1920{
1921   switch (op) {
1922      case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
1923      case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
1924      case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
1925      case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
1926      case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
1927      case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
1928      case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
1929      case Xfp_MOV:    break;
1930      case Xfp_TAN:    p = do_ffree_st7(p); /* since fptan pushes 1.0 */
1931                       *p++ = 0xD9; *p++ = 0xF2; /* fptan */
1932                       *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
1933                       break;
1934      default: vpanic("do_fop1_st: unknown op");
1935   }
1936   return p;
1937}
1938
1939/* Emit f<op> %st(i), 1 <= i <= 5 */
1940static UChar* do_fop2_st ( UChar* p, X86FpOp op, Int i )
1941{
1942#  define fake(_n) mkHReg((_n), HRcInt32, False)
1943   Int subopc;
1944   switch (op) {
1945      case Xfp_ADD: subopc = 0; break;
1946      case Xfp_SUB: subopc = 4; break;
1947      case Xfp_MUL: subopc = 1; break;
1948      case Xfp_DIV: subopc = 6; break;
1949      default: vpanic("do_fop2_st: unknown op");
1950   }
1951   *p++ = 0xD8;
1952   p    = doAMode_R(p, fake(subopc), fake(i));
1953   return p;
1954#  undef fake
1955}
1956
1957/* Push a 32-bit word on the stack.  The word depends on tags[3:0];
1958each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
1959*/
1960static UChar* push_word_from_tags ( UChar* p, UShort tags )
1961{
1962   UInt w;
1963   vassert(0 == (tags & ~0xF));
1964   if (tags == 0) {
1965      /* pushl $0x00000000 */
1966      *p++ = 0x6A;
1967      *p++ = 0x00;
1968   }
1969   else
1970   /* pushl $0xFFFFFFFF */
1971   if (tags == 0xF) {
1972      *p++ = 0x6A;
1973      *p++ = 0xFF;
1974   } else {
1975      vassert(0); /* awaiting test case */
1976      w = 0;
1977      if (tags & 1) w |= 0x000000FF;
1978      if (tags & 2) w |= 0x0000FF00;
1979      if (tags & 4) w |= 0x00FF0000;
1980      if (tags & 8) w |= 0xFF000000;
1981      *p++ = 0x68;
1982      p = emit32(p, w);
1983   }
1984   return p;
1985}
1986
1987/* Emit an instruction into buf and return the number of bytes used.
1988   Note that buf is not the insn's final place, and therefore it is
1989   imperative to emit position-independent code. */
1990
1991Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i,
1992                    Bool mode64, void* dispatch )
1993{
1994   UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
1995
1996   UInt   xtra;
1997   UChar* p = &buf[0];
1998   UChar* ptmp;
1999   vassert(nbuf >= 32);
2000   vassert(mode64 == False);
2001
2002   /* Wrap an integer as a int register, for use assembling
2003      GrpN insns, in which the greg field is used as a sub-opcode
2004      and does not really contain a register. */
2005#  define fake(_n) mkHReg((_n), HRcInt32, False)
2006
2007   /* vex_printf("asm  ");ppX86Instr(i, mode64); vex_printf("\n"); */
2008
2009   switch (i->tag) {
2010
2011   case Xin_Alu32R:
2012      /* Deal specially with MOV */
2013      if (i->Xin.Alu32R.op == Xalu_MOV) {
2014         switch (i->Xin.Alu32R.src->tag) {
2015            case Xrmi_Imm:
2016               *p++ = toUChar(0xB8 + iregNo(i->Xin.Alu32R.dst));
2017               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2018               goto done;
2019            case Xrmi_Reg:
2020               *p++ = 0x89;
2021               p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2022                                i->Xin.Alu32R.dst);
2023               goto done;
2024            case Xrmi_Mem:
2025               *p++ = 0x8B;
2026               p = doAMode_M(p, i->Xin.Alu32R.dst,
2027                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2028               goto done;
2029            default:
2030               goto bad;
2031         }
2032      }
2033      /* MUL */
2034      if (i->Xin.Alu32R.op == Xalu_MUL) {
2035         switch (i->Xin.Alu32R.src->tag) {
2036            case Xrmi_Reg:
2037               *p++ = 0x0F;
2038               *p++ = 0xAF;
2039               p = doAMode_R(p, i->Xin.Alu32R.dst,
2040                                i->Xin.Alu32R.src->Xrmi.Reg.reg);
2041               goto done;
2042            case Xrmi_Mem:
2043               *p++ = 0x0F;
2044               *p++ = 0xAF;
2045               p = doAMode_M(p, i->Xin.Alu32R.dst,
2046                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2047               goto done;
2048            case Xrmi_Imm:
2049               if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2050                  *p++ = 0x6B;
2051                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2052                  *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2053               } else {
2054                  *p++ = 0x69;
2055                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2056                  p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2057               }
2058               goto done;
2059            default:
2060               goto bad;
2061         }
2062      }
2063      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2064      opc = opc_rr = subopc_imm = opc_imma = 0;
2065      switch (i->Xin.Alu32R.op) {
2066         case Xalu_ADC: opc = 0x13; opc_rr = 0x11;
2067                        subopc_imm = 2; opc_imma = 0x15; break;
2068         case Xalu_ADD: opc = 0x03; opc_rr = 0x01;
2069                        subopc_imm = 0; opc_imma = 0x05; break;
2070         case Xalu_SUB: opc = 0x2B; opc_rr = 0x29;
2071                        subopc_imm = 5; opc_imma = 0x2D; break;
2072         case Xalu_SBB: opc = 0x1B; opc_rr = 0x19;
2073                        subopc_imm = 3; opc_imma = 0x1D; break;
2074         case Xalu_AND: opc = 0x23; opc_rr = 0x21;
2075                        subopc_imm = 4; opc_imma = 0x25; break;
2076         case Xalu_XOR: opc = 0x33; opc_rr = 0x31;
2077                        subopc_imm = 6; opc_imma = 0x35; break;
2078         case Xalu_OR:  opc = 0x0B; opc_rr = 0x09;
2079                        subopc_imm = 1; opc_imma = 0x0D; break;
2080         case Xalu_CMP: opc = 0x3B; opc_rr = 0x39;
2081                        subopc_imm = 7; opc_imma = 0x3D; break;
2082         default: goto bad;
2083      }
2084      switch (i->Xin.Alu32R.src->tag) {
2085         case Xrmi_Imm:
2086            if (i->Xin.Alu32R.dst == hregX86_EAX()
2087                && !fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2088               *p++ = toUChar(opc_imma);
2089               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2090            } else
2091            if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2092               *p++ = 0x83;
2093               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2094               *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2095            } else {
2096               *p++ = 0x81;
2097               p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
2098               p    = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2099            }
2100            goto done;
2101         case Xrmi_Reg:
2102            *p++ = toUChar(opc_rr);
2103            p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2104                             i->Xin.Alu32R.dst);
2105            goto done;
2106         case Xrmi_Mem:
2107            *p++ = toUChar(opc);
2108            p = doAMode_M(p, i->Xin.Alu32R.dst,
2109                             i->Xin.Alu32R.src->Xrmi.Mem.am);
2110            goto done;
2111         default:
2112            goto bad;
2113      }
2114      break;
2115
2116   case Xin_Alu32M:
2117      /* Deal specially with MOV */
2118      if (i->Xin.Alu32M.op == Xalu_MOV) {
2119         switch (i->Xin.Alu32M.src->tag) {
2120            case Xri_Reg:
2121               *p++ = 0x89;
2122               p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2123                                i->Xin.Alu32M.dst);
2124               goto done;
2125            case Xri_Imm:
2126               *p++ = 0xC7;
2127               p = doAMode_M(p, fake(0), i->Xin.Alu32M.dst);
2128               p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2129               goto done;
2130            default:
2131               goto bad;
2132         }
2133      }
2134      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
2135         allowed here. */
2136      opc = subopc_imm = opc_imma = 0;
2137      switch (i->Xin.Alu32M.op) {
2138         case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
2139         case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
2140         case Xalu_CMP: opc = 0x39; subopc_imm = 7; break;
2141         default: goto bad;
2142      }
2143      switch (i->Xin.Alu32M.src->tag) {
2144         case Xri_Reg:
2145            *p++ = toUChar(opc);
2146            p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2147                             i->Xin.Alu32M.dst);
2148            goto done;
2149         case Xri_Imm:
2150            if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
2151               *p++ = 0x83;
2152               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2153               *p++ = toUChar(0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32);
2154               goto done;
2155            } else {
2156               *p++ = 0x81;
2157               p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
2158               p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2159               goto done;
2160            }
2161         default:
2162            goto bad;
2163      }
2164      break;
2165
2166   case Xin_Sh32:
2167      opc_cl = opc_imm = subopc = 0;
2168      switch (i->Xin.Sh32.op) {
2169         case Xsh_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2170         case Xsh_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2171         case Xsh_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2172         default: goto bad;
2173      }
2174      if (i->Xin.Sh32.src == 0) {
2175         *p++ = toUChar(opc_cl);
2176         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2177      } else {
2178         *p++ = toUChar(opc_imm);
2179         p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
2180         *p++ = (UChar)(i->Xin.Sh32.src);
2181      }
2182      goto done;
2183
2184   case Xin_Test32:
2185      if (i->Xin.Test32.dst->tag == Xrm_Reg) {
2186         /* testl $imm32, %reg */
2187         *p++ = 0xF7;
2188         p = doAMode_R(p, fake(0), i->Xin.Test32.dst->Xrm.Reg.reg);
2189         p = emit32(p, i->Xin.Test32.imm32);
2190         goto done;
2191      } else {
2192         /* testl $imm32, amode */
2193         *p++ = 0xF7;
2194         p = doAMode_M(p, fake(0), i->Xin.Test32.dst->Xrm.Mem.am);
2195         p = emit32(p, i->Xin.Test32.imm32);
2196         goto done;
2197      }
2198
2199   case Xin_Unary32:
2200      if (i->Xin.Unary32.op == Xun_NOT) {
2201         *p++ = 0xF7;
2202         p = doAMode_R(p, fake(2), i->Xin.Unary32.dst);
2203         goto done;
2204      }
2205      if (i->Xin.Unary32.op == Xun_NEG) {
2206         *p++ = 0xF7;
2207         p = doAMode_R(p, fake(3), i->Xin.Unary32.dst);
2208         goto done;
2209      }
2210      break;
2211
2212   case Xin_Lea32:
2213      *p++ = 0x8D;
2214      p = doAMode_M(p, i->Xin.Lea32.dst, i->Xin.Lea32.am);
2215      goto done;
2216
2217   case Xin_MulL:
2218      subopc = i->Xin.MulL.syned ? 5 : 4;
2219      *p++ = 0xF7;
2220      switch (i->Xin.MulL.src->tag)  {
2221         case Xrm_Mem:
2222            p = doAMode_M(p, fake(subopc),
2223                             i->Xin.MulL.src->Xrm.Mem.am);
2224            goto done;
2225         case Xrm_Reg:
2226            p = doAMode_R(p, fake(subopc),
2227                             i->Xin.MulL.src->Xrm.Reg.reg);
2228            goto done;
2229         default:
2230            goto bad;
2231      }
2232      break;
2233
2234   case Xin_Div:
2235      subopc = i->Xin.Div.syned ? 7 : 6;
2236      *p++ = 0xF7;
2237      switch (i->Xin.Div.src->tag)  {
2238         case Xrm_Mem:
2239            p = doAMode_M(p, fake(subopc),
2240                             i->Xin.Div.src->Xrm.Mem.am);
2241            goto done;
2242         case Xrm_Reg:
2243            p = doAMode_R(p, fake(subopc),
2244                             i->Xin.Div.src->Xrm.Reg.reg);
2245            goto done;
2246         default:
2247            goto bad;
2248      }
2249      break;
2250
2251   case Xin_Sh3232:
2252      vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
2253      if (i->Xin.Sh3232.amt == 0) {
2254         /* shldl/shrdl by %cl */
2255         *p++ = 0x0F;
2256         if (i->Xin.Sh3232.op == Xsh_SHL) {
2257            *p++ = 0xA5;
2258         } else {
2259            *p++ = 0xAD;
2260         }
2261         p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
2262         goto done;
2263      }
2264      break;
2265
2266   case Xin_Push:
2267      switch (i->Xin.Push.src->tag) {
2268         case Xrmi_Mem:
2269            *p++ = 0xFF;
2270            p = doAMode_M(p, fake(6), i->Xin.Push.src->Xrmi.Mem.am);
2271            goto done;
2272         case Xrmi_Imm:
2273            *p++ = 0x68;
2274            p = emit32(p, i->Xin.Push.src->Xrmi.Imm.imm32);
2275            goto done;
2276         case Xrmi_Reg:
2277            *p++ = toUChar(0x50 + iregNo(i->Xin.Push.src->Xrmi.Reg.reg));
2278            goto done;
2279        default:
2280            goto bad;
2281      }
2282
2283   case Xin_Call:
2284      /* See detailed comment for Xin_Call in getRegUsage_X86Instr above
2285         for explanation of this. */
2286      switch (i->Xin.Call.regparms) {
2287         case 0: irno = iregNo(hregX86_EAX()); break;
2288         case 1: irno = iregNo(hregX86_EDX()); break;
2289         case 2: irno = iregNo(hregX86_ECX()); break;
2290         case 3: irno = iregNo(hregX86_EDI()); break;
2291         default: vpanic(" emit_X86Instr:call:regparms");
2292      }
2293      /* jump over the following two insns if the condition does not
2294         hold */
2295      if (i->Xin.Call.cond != Xcc_ALWAYS) {
2296         *p++ = toUChar(0x70 + (0xF & (i->Xin.Call.cond ^ 1)));
2297         *p++ = 0x07; /* 7 bytes in the next two insns */
2298      }
2299      /* movl $target, %tmp */
2300      *p++ = toUChar(0xB8 + irno);
2301      p = emit32(p, i->Xin.Call.target);
2302      /* call *%tmp */
2303      *p++ = 0xFF;
2304      *p++ = toUChar(0xD0 + irno);
2305      goto done;
2306
2307   case Xin_Goto:
2308      /* Use ptmp for backpatching conditional jumps. */
2309      ptmp = NULL;
2310
2311      /* First off, if this is conditional, create a conditional
2312	 jump over the rest of it. */
2313      if (i->Xin.Goto.cond != Xcc_ALWAYS) {
2314         /* jmp fwds if !condition */
2315         *p++ = toUChar(0x70 + (0xF & (i->Xin.Goto.cond ^ 1)));
2316         ptmp = p; /* fill in this bit later */
2317         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2318      }
2319
2320      /* If a non-boring, set %ebp (the guest state pointer)
2321         appropriately. */
2322      /* movl $magic_number, %ebp */
2323      switch (i->Xin.Goto.jk) {
2324         case Ijk_ClientReq:
2325            *p++ = 0xBD;
2326            p = emit32(p, VEX_TRC_JMP_CLIENTREQ); break;
2327         case Ijk_Sys_int128:
2328            *p++ = 0xBD;
2329            p = emit32(p, VEX_TRC_JMP_SYS_INT128); break;
2330         case Ijk_Sys_int129:
2331            *p++ = 0xBD;
2332            p = emit32(p, VEX_TRC_JMP_SYS_INT129); break;
2333         case Ijk_Sys_int130:
2334            *p++ = 0xBD;
2335            p = emit32(p, VEX_TRC_JMP_SYS_INT130); break;
2336         case Ijk_Yield:
2337            *p++ = 0xBD;
2338            p = emit32(p, VEX_TRC_JMP_YIELD); break;
2339         case Ijk_YieldNoRedir:
2340            *p++ = 0xBD;
2341            p = emit32(p, VEX_TRC_JMP_YIELD_NOREDIR); break;
2342         case Ijk_EmWarn:
2343            *p++ = 0xBD;
2344            p = emit32(p, VEX_TRC_JMP_EMWARN); break;
2345         case Ijk_MapFail:
2346            *p++ = 0xBD;
2347            p = emit32(p, VEX_TRC_JMP_MAPFAIL); break;
2348         case Ijk_NoDecode:
2349            *p++ = 0xBD;
2350            p = emit32(p, VEX_TRC_JMP_NODECODE); break;
2351         case Ijk_TInval:
2352            *p++ = 0xBD;
2353            p = emit32(p, VEX_TRC_JMP_TINVAL); break;
2354         case Ijk_NoRedir:
2355            *p++ = 0xBD;
2356            p = emit32(p, VEX_TRC_JMP_NOREDIR); break;
2357         case Ijk_Sys_sysenter:
2358            *p++ = 0xBD;
2359            p = emit32(p, VEX_TRC_JMP_SYS_SYSENTER); break;
2360         case Ijk_SigTRAP:
2361            *p++ = 0xBD;
2362            p = emit32(p, VEX_TRC_JMP_SIGTRAP); break;
2363         case Ijk_SigSEGV:
2364            *p++ = 0xBD;
2365            p = emit32(p, VEX_TRC_JMP_SIGSEGV); break;
2366         case Ijk_Ret:
2367	 case Ijk_Call:
2368         case Ijk_Boring:
2369            break;
2370         default:
2371            ppIRJumpKind(i->Xin.Goto.jk);
2372            vpanic("emit_X86Instr.Xin_Goto: unknown jump kind");
2373      }
2374
2375      /* Get the destination address into %eax */
2376      if (i->Xin.Goto.dst->tag == Xri_Imm) {
2377         /* movl $immediate, %eax */
2378         *p++ = 0xB8;
2379         p = emit32(p, i->Xin.Goto.dst->Xri.Imm.imm32);
2380      } else {
2381         vassert(i->Xin.Goto.dst->tag == Xri_Reg);
2382         /* movl %reg, %eax */
2383         if (i->Xin.Goto.dst->Xri.Reg.reg != hregX86_EAX()) {
2384            *p++ = 0x89;
2385            p = doAMode_R(p, i->Xin.Goto.dst->Xri.Reg.reg, hregX86_EAX());
2386         }
2387      }
2388
2389      /* Get the dispatcher address into %edx.  This has to happen
2390         after the load of %eax since %edx might be carrying the value
2391         destined for %eax immediately prior to this Xin_Goto. */
2392      vassert(sizeof(UInt) == sizeof(void*));
2393      vassert(dispatch != NULL);
2394      /* movl $imm32, %edx */
2395      *p++ = 0xBA;
2396      p = emit32(p, (UInt)Ptr_to_ULong(dispatch));
2397
2398      /* jmp *%edx */
2399      *p++ = 0xFF;
2400      *p++ = 0xE2;
2401
2402      /* Fix up the conditional jump, if there was one. */
2403      if (i->Xin.Goto.cond != Xcc_ALWAYS) {
2404         Int delta = p - ptmp;
2405	 vassert(delta > 0 && delta < 20);
2406         *ptmp = toUChar(delta-1);
2407      }
2408      goto done;
2409
2410   case Xin_CMov32:
2411      vassert(i->Xin.CMov32.cond != Xcc_ALWAYS);
2412
2413      /* This generates cmov, which is illegal on P54/P55. */
2414      /*
2415      *p++ = 0x0F;
2416      *p++ = toUChar(0x40 + (0xF & i->Xin.CMov32.cond));
2417      if (i->Xin.CMov32.src->tag == Xrm_Reg) {
2418         p = doAMode_R(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Reg.reg);
2419         goto done;
2420      }
2421      if (i->Xin.CMov32.src->tag == Xrm_Mem) {
2422         p = doAMode_M(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Mem.am);
2423         goto done;
2424      }
2425      */
2426
2427      /* Alternative version which works on any x86 variant. */
2428      /* jmp fwds if !condition */
2429      *p++ = toUChar(0x70 + (i->Xin.CMov32.cond ^ 1));
2430      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2431      ptmp = p;
2432
2433      switch (i->Xin.CMov32.src->tag) {
2434         case Xrm_Reg:
2435            /* Big sigh.  This is movl E -> G ... */
2436            *p++ = 0x89;
2437            p = doAMode_R(p, i->Xin.CMov32.src->Xrm.Reg.reg,
2438                             i->Xin.CMov32.dst);
2439
2440            break;
2441         case Xrm_Mem:
2442            /* ... whereas this is movl G -> E.  That's why the args
2443               to doAMode_R appear to be the wrong way round in the
2444               Xrm_Reg case. */
2445            *p++ = 0x8B;
2446            p = doAMode_M(p, i->Xin.CMov32.dst,
2447                             i->Xin.CMov32.src->Xrm.Mem.am);
2448            break;
2449         default:
2450            goto bad;
2451      }
2452      /* Fill in the jump offset. */
2453      *(ptmp-1) = toUChar(p - ptmp);
2454      goto done;
2455
2456      break;
2457
2458   case Xin_LoadEX:
2459      if (i->Xin.LoadEX.szSmall == 1 && !i->Xin.LoadEX.syned) {
2460         /* movzbl */
2461         *p++ = 0x0F;
2462         *p++ = 0xB6;
2463         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2464         goto done;
2465      }
2466      if (i->Xin.LoadEX.szSmall == 2 && !i->Xin.LoadEX.syned) {
2467         /* movzwl */
2468         *p++ = 0x0F;
2469         *p++ = 0xB7;
2470         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2471         goto done;
2472      }
2473      if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
2474         /* movsbl */
2475         *p++ = 0x0F;
2476         *p++ = 0xBE;
2477         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2478         goto done;
2479      }
2480      break;
2481
2482   case Xin_Set32:
2483      /* Make the destination register be 1 or 0, depending on whether
2484         the relevant condition holds.  We have to dodge and weave
2485         when the destination is %esi or %edi as we cannot directly
2486         emit the native 'setb %reg' for those.  Further complication:
2487         the top 24 bits of the destination should be forced to zero,
2488         but doing 'xor %r,%r' kills the flag(s) we are about to read.
2489         Sigh.  So start off my moving $0 into the dest. */
2490
2491      /* Do we need to swap in %eax? */
2492      if (iregNo(i->Xin.Set32.dst) >= 4) {
2493         /* xchg %eax, %dst */
2494         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2495         /* movl $0, %eax */
2496         *p++ =toUChar(0xB8 + iregNo(hregX86_EAX()));
2497         p = emit32(p, 0);
2498         /* setb lo8(%eax) */
2499         *p++ = 0x0F;
2500         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2501         p = doAMode_R(p, fake(0), hregX86_EAX());
2502         /* xchg %eax, %dst */
2503         *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
2504      } else {
2505         /* movl $0, %dst */
2506         *p++ = toUChar(0xB8 + iregNo(i->Xin.Set32.dst));
2507         p = emit32(p, 0);
2508         /* setb lo8(%dst) */
2509         *p++ = 0x0F;
2510         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2511         p = doAMode_R(p, fake(0), i->Xin.Set32.dst);
2512      }
2513      goto done;
2514
2515   case Xin_Bsfr32:
2516      *p++ = 0x0F;
2517      if (i->Xin.Bsfr32.isFwds) {
2518         *p++ = 0xBC;
2519      } else {
2520         *p++ = 0xBD;
2521      }
2522      p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
2523      goto done;
2524
2525   case Xin_MFence:
2526      /* see comment in hdefs.h re this insn */
2527      if (0) vex_printf("EMIT FENCE\n");
2528      if (i->Xin.MFence.hwcaps & (VEX_HWCAPS_X86_SSE3
2529                                  |VEX_HWCAPS_X86_SSE2)) {
2530         /* mfence */
2531         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
2532         goto done;
2533      }
2534      if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_SSE1) {
2535         /* sfence */
2536         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
2537         /* lock addl $0,0(%esp) */
2538         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2539         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2540         goto done;
2541      }
2542      if (i->Xin.MFence.hwcaps == 0/*baseline, no SSE*/) {
2543         /* lock addl $0,0(%esp) */
2544         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2545         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2546         goto done;
2547      }
2548      vpanic("emit_X86Instr:mfence:hwcaps");
2549      /*NOTREACHED*/
2550      break;
2551
2552   case Xin_ACAS:
2553      /* lock */
2554      *p++ = 0xF0;
2555      /* cmpxchg{b,w,l} %ebx,mem.  Expected-value in %eax, new value
2556         in %ebx.  The new-value register is hardwired to be %ebx
2557         since letting it be any integer register gives the problem
2558         that %sil and %dil are unaddressible on x86 and hence we
2559         would have to resort to the same kind of trickery as with
2560         byte-sized Xin.Store, just below.  Given that this isn't
2561         performance critical, it is simpler just to force the
2562         register operand to %ebx (could equally be %ecx or %edx).
2563         (Although %ebx is more consistent with cmpxchg8b.) */
2564      if (i->Xin.ACAS.sz == 2) *p++ = 0x66;
2565      *p++ = 0x0F;
2566      if (i->Xin.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
2567      p = doAMode_M(p, hregX86_EBX(), i->Xin.ACAS.addr);
2568      goto done;
2569
2570   case Xin_DACAS:
2571      /* lock */
2572      *p++ = 0xF0;
2573      /* cmpxchg8b m64.  Expected-value in %edx:%eax, new value
2574         in %ecx:%ebx.  All 4 regs are hardwired in the ISA, so
2575         aren't encoded in the insn. */
2576      *p++ = 0x0F;
2577      *p++ = 0xC7;
2578      p = doAMode_M(p, fake(1), i->Xin.DACAS.addr);
2579      goto done;
2580
2581   case Xin_Store:
2582      if (i->Xin.Store.sz == 2) {
2583         /* This case, at least, is simple, given that we can
2584            reference the low 16 bits of any integer register. */
2585         *p++ = 0x66;
2586         *p++ = 0x89;
2587         p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2588         goto done;
2589      }
2590
2591      if (i->Xin.Store.sz == 1) {
2592         /* We have to do complex dodging and weaving if src is not
2593            the low 8 bits of %eax/%ebx/%ecx/%edx. */
2594         if (iregNo(i->Xin.Store.src) < 4) {
2595            /* we're OK, can do it directly */
2596            *p++ = 0x88;
2597            p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2598           goto done;
2599         } else {
2600            /* Bleh.  This means the source is %edi or %esi.  Since
2601               the address mode can only mention three registers, at
2602               least one of %eax/%ebx/%ecx/%edx must be available to
2603               temporarily swap the source into, so the store can
2604               happen.  So we have to look at the regs mentioned
2605               in the amode. */
2606            HReg swap = INVALID_HREG;
2607            HReg  eax = hregX86_EAX(), ebx = hregX86_EBX(),
2608                  ecx = hregX86_ECX(), edx = hregX86_EDX();
2609            Bool a_ok = True, b_ok = True, c_ok = True, d_ok = True;
2610            HRegUsage u;
2611            Int j;
2612            initHRegUsage(&u);
2613            addRegUsage_X86AMode(&u,  i->Xin.Store.dst);
2614            for (j = 0; j < u.n_used; j++) {
2615               HReg r = u.hreg[j];
2616               if (r == eax) a_ok = False;
2617               if (r == ebx) b_ok = False;
2618               if (r == ecx) c_ok = False;
2619               if (r == edx) d_ok = False;
2620            }
2621            if (a_ok) swap = eax;
2622            if (b_ok) swap = ebx;
2623            if (c_ok) swap = ecx;
2624            if (d_ok) swap = edx;
2625            vassert(swap != INVALID_HREG);
2626            /* xchgl %source, %swap. Could do better if swap is %eax. */
2627            *p++ = 0x87;
2628            p = doAMode_R(p, i->Xin.Store.src, swap);
2629            /* movb lo8{%swap}, (dst) */
2630            *p++ = 0x88;
2631            p = doAMode_M(p, swap, i->Xin.Store.dst);
2632            /* xchgl %source, %swap. Could do better if swap is %eax. */
2633            *p++ = 0x87;
2634            p = doAMode_R(p, i->Xin.Store.src, swap);
2635            goto done;
2636         }
2637      } /* if (i->Xin.Store.sz == 1) */
2638      break;
2639
2640   case Xin_FpUnary:
2641      /* gop %src, %dst
2642         --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
2643      */
2644      p = do_ffree_st7(p);
2645      p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
2646      p = do_fop1_st(p, i->Xin.FpUnary.op);
2647      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
2648      goto done;
2649
2650   case Xin_FpBinary:
2651      if (i->Xin.FpBinary.op == Xfp_YL2X
2652          || i->Xin.FpBinary.op == Xfp_YL2XP1) {
2653         /* Have to do this specially. */
2654         /* ffree %st7 ; fld %st(srcL) ;
2655            ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
2656         p = do_ffree_st7(p);
2657         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2658         p = do_ffree_st7(p);
2659         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2660         *p++ = 0xD9;
2661         *p++ = toUChar(i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9);
2662         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2663         goto done;
2664      }
2665      if (i->Xin.FpBinary.op == Xfp_ATAN) {
2666         /* Have to do this specially. */
2667         /* ffree %st7 ; fld %st(srcL) ;
2668            ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
2669         p = do_ffree_st7(p);
2670         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2671         p = do_ffree_st7(p);
2672         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
2673         *p++ = 0xD9; *p++ = 0xF3;
2674         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2675         goto done;
2676      }
2677      if (i->Xin.FpBinary.op == Xfp_PREM
2678          || i->Xin.FpBinary.op == Xfp_PREM1
2679          || i->Xin.FpBinary.op == Xfp_SCALE) {
2680         /* Have to do this specially. */
2681         /* ffree %st7 ; fld %st(srcR) ;
2682            ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
2683            fincstp ; ffree %st7 */
2684         p = do_ffree_st7(p);
2685         p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
2686         p = do_ffree_st7(p);
2687         p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
2688         *p++ = 0xD9;
2689         switch (i->Xin.FpBinary.op) {
2690            case Xfp_PREM: *p++ = 0xF8; break;
2691            case Xfp_PREM1: *p++ = 0xF5; break;
2692            case Xfp_SCALE: *p++ =  0xFD; break;
2693            default: vpanic("emitX86Instr(FpBinary,PREM/PREM1/SCALE)");
2694         }
2695         p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
2696         *p++ = 0xD9; *p++ = 0xF7;
2697         p = do_ffree_st7(p);
2698         goto done;
2699      }
2700      /* General case */
2701      /* gop %srcL, %srcR, %dst
2702         --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
2703      */
2704      p = do_ffree_st7(p);
2705      p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
2706      p = do_fop2_st(p, i->Xin.FpBinary.op,
2707                        1+hregNumber(i->Xin.FpBinary.srcR));
2708      p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
2709      goto done;
2710
2711   case Xin_FpLdSt:
2712      if (i->Xin.FpLdSt.isLoad) {
2713         /* Load from memory into %fakeN.
2714            --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
2715         */
2716         p = do_ffree_st7(p);
2717         switch (i->Xin.FpLdSt.sz) {
2718            case 4:
2719               *p++ = 0xD9;
2720               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2721               break;
2722            case 8:
2723               *p++ = 0xDD;
2724               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
2725               break;
2726            case 10:
2727               *p++ = 0xDB;
2728               p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
2729               break;
2730            default:
2731               vpanic("emitX86Instr(FpLdSt,load)");
2732         }
2733         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
2734         goto done;
2735      } else {
2736         /* Store from %fakeN into memory.
2737            --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
2738         */
2739         p = do_ffree_st7(p);
2740         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
2741         switch (i->Xin.FpLdSt.sz) {
2742            case 4:
2743               *p++ = 0xD9;
2744               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2745               break;
2746            case 8:
2747               *p++ = 0xDD;
2748               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
2749               break;
2750            case 10:
2751               *p++ = 0xDB;
2752               p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
2753               break;
2754            default:
2755               vpanic("emitX86Instr(FpLdSt,store)");
2756         }
2757         goto done;
2758      }
2759      break;
2760
2761   case Xin_FpLdStI:
2762      if (i->Xin.FpLdStI.isLoad) {
2763         /* Load from memory into %fakeN, converting from an int.
2764            --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
2765         */
2766         switch (i->Xin.FpLdStI.sz) {
2767            case 8:  opc = 0xDF; subopc_imm = 5; break;
2768            case 4:  opc = 0xDB; subopc_imm = 0; break;
2769            case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
2770            default: vpanic("emitX86Instr(Xin_FpLdStI-load)");
2771         }
2772         p = do_ffree_st7(p);
2773         *p++ = toUChar(opc);
2774         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2775         p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
2776         goto done;
2777      } else {
2778         /* Store from %fakeN into memory, converting to an int.
2779            --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
2780         */
2781         switch (i->Xin.FpLdStI.sz) {
2782            case 8:  opc = 0xDF; subopc_imm = 7; break;
2783            case 4:  opc = 0xDB; subopc_imm = 3; break;
2784            case 2:  opc = 0xDF; subopc_imm = 3; break;
2785            default: vpanic("emitX86Instr(Xin_FpLdStI-store)");
2786         }
2787         p = do_ffree_st7(p);
2788         p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
2789         *p++ = toUChar(opc);
2790         p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
2791         goto done;
2792      }
2793      break;
2794
2795   case Xin_Fp64to32:
2796      /* ffree %st7 ; fld %st(src) */
2797      p = do_ffree_st7(p);
2798      p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
2799      /* subl $4, %esp */
2800      *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
2801      /* fstps (%esp) */
2802      *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
2803      /* flds (%esp) */
2804      *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
2805      /* addl $4, %esp */
2806      *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
2807      /* fstp %st(1+dst) */
2808      p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
2809      goto done;
2810
2811   case Xin_FpCMov:
2812      /* jmp fwds if !condition */
2813      *p++ = toUChar(0x70 + (i->Xin.FpCMov.cond ^ 1));
2814      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2815      ptmp = p;
2816
2817      /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
2818      p = do_ffree_st7(p);
2819      p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
2820      p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
2821
2822      /* Fill in the jump offset. */
2823      *(ptmp-1) = toUChar(p - ptmp);
2824      goto done;
2825
2826   case Xin_FpLdCW:
2827      *p++ = 0xD9;
2828      p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdCW.addr);
2829      goto done;
2830
2831   case Xin_FpStSW_AX:
2832      /* note, this emits fnstsw %ax, not fstsw %ax */
2833      *p++ = 0xDF;
2834      *p++ = 0xE0;
2835      goto done;
2836
2837   case Xin_FpCmp:
2838      /* gcmp %fL, %fR, %dst
2839         -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
2840            fnstsw %ax ; movl %eax, %dst
2841      */
2842      /* ffree %st7 */
2843      p = do_ffree_st7(p);
2844      /* fpush %fL */
2845      p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
2846      /* fucomp %(fR+1) */
2847      *p++ = 0xDD;
2848      *p++ = toUChar(0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR))));
2849      /* fnstsw %ax */
2850      *p++ = 0xDF;
2851      *p++ = 0xE0;
2852      /*  movl %eax, %dst */
2853      *p++ = 0x89;
2854      p = doAMode_R(p, hregX86_EAX(), i->Xin.FpCmp.dst);
2855      goto done;
2856
2857   case Xin_SseConst: {
2858      UShort con = i->Xin.SseConst.con;
2859      p = push_word_from_tags(p, toUShort((con >> 12) & 0xF));
2860      p = push_word_from_tags(p, toUShort((con >> 8) & 0xF));
2861      p = push_word_from_tags(p, toUShort((con >> 4) & 0xF));
2862      p = push_word_from_tags(p, toUShort(con & 0xF));
2863      /* movl (%esp), %xmm-dst */
2864      *p++ = 0x0F;
2865      *p++ = 0x10;
2866      *p++ = toUChar(0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst)));
2867      *p++ = 0x24;
2868      /* addl $16, %esp */
2869      *p++ = 0x83;
2870      *p++ = 0xC4;
2871      *p++ = 0x10;
2872      goto done;
2873   }
2874
2875   case Xin_SseLdSt:
2876      *p++ = 0x0F;
2877      *p++ = toUChar(i->Xin.SseLdSt.isLoad ? 0x10 : 0x11);
2878      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdSt.reg)), i->Xin.SseLdSt.addr);
2879      goto done;
2880
2881   case Xin_SseLdzLO:
2882      vassert(i->Xin.SseLdzLO.sz == 4 || i->Xin.SseLdzLO.sz == 8);
2883      /* movs[sd] amode, %xmm-dst */
2884      *p++ = toUChar(i->Xin.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
2885      *p++ = 0x0F;
2886      *p++ = 0x10;
2887      p = doAMode_M(p, fake(vregNo(i->Xin.SseLdzLO.reg)),
2888                       i->Xin.SseLdzLO.addr);
2889      goto done;
2890
2891   case Xin_Sse32Fx4:
2892      xtra = 0;
2893      *p++ = 0x0F;
2894      switch (i->Xin.Sse32Fx4.op) {
2895         case Xsse_ADDF:   *p++ = 0x58; break;
2896         case Xsse_DIVF:   *p++ = 0x5E; break;
2897         case Xsse_MAXF:   *p++ = 0x5F; break;
2898         case Xsse_MINF:   *p++ = 0x5D; break;
2899         case Xsse_MULF:   *p++ = 0x59; break;
2900         case Xsse_RCPF:   *p++ = 0x53; break;
2901         case Xsse_RSQRTF: *p++ = 0x52; break;
2902         case Xsse_SQRTF:  *p++ = 0x51; break;
2903         case Xsse_SUBF:   *p++ = 0x5C; break;
2904         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2905         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2906         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2907         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2908         default: goto bad;
2909      }
2910      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32Fx4.dst)),
2911                       fake(vregNo(i->Xin.Sse32Fx4.src)) );
2912      if (xtra & 0x100)
2913         *p++ = toUChar(xtra & 0xFF);
2914      goto done;
2915
2916   case Xin_Sse64Fx2:
2917      xtra = 0;
2918      *p++ = 0x66;
2919      *p++ = 0x0F;
2920      switch (i->Xin.Sse64Fx2.op) {
2921         case Xsse_ADDF:   *p++ = 0x58; break;
2922         case Xsse_DIVF:   *p++ = 0x5E; break;
2923         case Xsse_MAXF:   *p++ = 0x5F; break;
2924         case Xsse_MINF:   *p++ = 0x5D; break;
2925         case Xsse_MULF:   *p++ = 0x59; break;
2926         case Xsse_RCPF:   *p++ = 0x53; break;
2927         case Xsse_RSQRTF: *p++ = 0x52; break;
2928         case Xsse_SQRTF:  *p++ = 0x51; break;
2929         case Xsse_SUBF:   *p++ = 0x5C; break;
2930         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2931         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2932         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2933         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2934         default: goto bad;
2935      }
2936      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64Fx2.dst)),
2937                       fake(vregNo(i->Xin.Sse64Fx2.src)) );
2938      if (xtra & 0x100)
2939         *p++ = toUChar(xtra & 0xFF);
2940      goto done;
2941
2942   case Xin_Sse32FLo:
2943      xtra = 0;
2944      *p++ = 0xF3;
2945      *p++ = 0x0F;
2946      switch (i->Xin.Sse32FLo.op) {
2947         case Xsse_ADDF:   *p++ = 0x58; break;
2948         case Xsse_DIVF:   *p++ = 0x5E; break;
2949         case Xsse_MAXF:   *p++ = 0x5F; break;
2950         case Xsse_MINF:   *p++ = 0x5D; break;
2951         case Xsse_MULF:   *p++ = 0x59; break;
2952         case Xsse_RCPF:   *p++ = 0x53; break;
2953         case Xsse_RSQRTF: *p++ = 0x52; break;
2954         case Xsse_SQRTF:  *p++ = 0x51; break;
2955         case Xsse_SUBF:   *p++ = 0x5C; break;
2956         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2957         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2958         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2959         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2960         default: goto bad;
2961      }
2962      p = doAMode_R(p, fake(vregNo(i->Xin.Sse32FLo.dst)),
2963                       fake(vregNo(i->Xin.Sse32FLo.src)) );
2964      if (xtra & 0x100)
2965         *p++ = toUChar(xtra & 0xFF);
2966      goto done;
2967
2968   case Xin_Sse64FLo:
2969      xtra = 0;
2970      *p++ = 0xF2;
2971      *p++ = 0x0F;
2972      switch (i->Xin.Sse64FLo.op) {
2973         case Xsse_ADDF:   *p++ = 0x58; break;
2974         case Xsse_DIVF:   *p++ = 0x5E; break;
2975         case Xsse_MAXF:   *p++ = 0x5F; break;
2976         case Xsse_MINF:   *p++ = 0x5D; break;
2977         case Xsse_MULF:   *p++ = 0x59; break;
2978         case Xsse_RCPF:   *p++ = 0x53; break;
2979         case Xsse_RSQRTF: *p++ = 0x52; break;
2980         case Xsse_SQRTF:  *p++ = 0x51; break;
2981         case Xsse_SUBF:   *p++ = 0x5C; break;
2982         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
2983         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
2984         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
2985         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
2986         default: goto bad;
2987      }
2988      p = doAMode_R(p, fake(vregNo(i->Xin.Sse64FLo.dst)),
2989                       fake(vregNo(i->Xin.Sse64FLo.src)) );
2990      if (xtra & 0x100)
2991         *p++ = toUChar(xtra & 0xFF);
2992      goto done;
2993
2994   case Xin_SseReRg:
2995#     define XX(_n) *p++ = (_n)
2996      switch (i->Xin.SseReRg.op) {
2997         case Xsse_MOV:     /*movups*/ XX(0x0F); XX(0x10); break;
2998         case Xsse_OR:                 XX(0x0F); XX(0x56); break;
2999         case Xsse_XOR:                XX(0x0F); XX(0x57); break;
3000         case Xsse_AND:                XX(0x0F); XX(0x54); break;
3001         case Xsse_PACKSSD:  XX(0x66); XX(0x0F); XX(0x6B); break;
3002         case Xsse_PACKSSW:  XX(0x66); XX(0x0F); XX(0x63); break;
3003         case Xsse_PACKUSW:  XX(0x66); XX(0x0F); XX(0x67); break;
3004         case Xsse_ADD8:     XX(0x66); XX(0x0F); XX(0xFC); break;
3005         case Xsse_ADD16:    XX(0x66); XX(0x0F); XX(0xFD); break;
3006         case Xsse_ADD32:    XX(0x66); XX(0x0F); XX(0xFE); break;
3007         case Xsse_ADD64:    XX(0x66); XX(0x0F); XX(0xD4); break;
3008         case Xsse_QADD8S:   XX(0x66); XX(0x0F); XX(0xEC); break;
3009         case Xsse_QADD16S:  XX(0x66); XX(0x0F); XX(0xED); break;
3010         case Xsse_QADD8U:   XX(0x66); XX(0x0F); XX(0xDC); break;
3011         case Xsse_QADD16U:  XX(0x66); XX(0x0F); XX(0xDD); break;
3012         case Xsse_AVG8U:    XX(0x66); XX(0x0F); XX(0xE0); break;
3013         case Xsse_AVG16U:   XX(0x66); XX(0x0F); XX(0xE3); break;
3014         case Xsse_CMPEQ8:   XX(0x66); XX(0x0F); XX(0x74); break;
3015         case Xsse_CMPEQ16:  XX(0x66); XX(0x0F); XX(0x75); break;
3016         case Xsse_CMPEQ32:  XX(0x66); XX(0x0F); XX(0x76); break;
3017         case Xsse_CMPGT8S:  XX(0x66); XX(0x0F); XX(0x64); break;
3018         case Xsse_CMPGT16S: XX(0x66); XX(0x0F); XX(0x65); break;
3019         case Xsse_CMPGT32S: XX(0x66); XX(0x0F); XX(0x66); break;
3020         case Xsse_MAX16S:   XX(0x66); XX(0x0F); XX(0xEE); break;
3021         case Xsse_MAX8U:    XX(0x66); XX(0x0F); XX(0xDE); break;
3022         case Xsse_MIN16S:   XX(0x66); XX(0x0F); XX(0xEA); break;
3023         case Xsse_MIN8U:    XX(0x66); XX(0x0F); XX(0xDA); break;
3024         case Xsse_MULHI16U: XX(0x66); XX(0x0F); XX(0xE4); break;
3025         case Xsse_MULHI16S: XX(0x66); XX(0x0F); XX(0xE5); break;
3026         case Xsse_MUL16:    XX(0x66); XX(0x0F); XX(0xD5); break;
3027         case Xsse_SHL16:    XX(0x66); XX(0x0F); XX(0xF1); break;
3028         case Xsse_SHL32:    XX(0x66); XX(0x0F); XX(0xF2); break;
3029         case Xsse_SHL64:    XX(0x66); XX(0x0F); XX(0xF3); break;
3030         case Xsse_SAR16:    XX(0x66); XX(0x0F); XX(0xE1); break;
3031         case Xsse_SAR32:    XX(0x66); XX(0x0F); XX(0xE2); break;
3032         case Xsse_SHR16:    XX(0x66); XX(0x0F); XX(0xD1); break;
3033         case Xsse_SHR32:    XX(0x66); XX(0x0F); XX(0xD2); break;
3034         case Xsse_SHR64:    XX(0x66); XX(0x0F); XX(0xD3); break;
3035         case Xsse_SUB8:     XX(0x66); XX(0x0F); XX(0xF8); break;
3036         case Xsse_SUB16:    XX(0x66); XX(0x0F); XX(0xF9); break;
3037         case Xsse_SUB32:    XX(0x66); XX(0x0F); XX(0xFA); break;
3038         case Xsse_SUB64:    XX(0x66); XX(0x0F); XX(0xFB); break;
3039         case Xsse_QSUB8S:   XX(0x66); XX(0x0F); XX(0xE8); break;
3040         case Xsse_QSUB16S:  XX(0x66); XX(0x0F); XX(0xE9); break;
3041         case Xsse_QSUB8U:   XX(0x66); XX(0x0F); XX(0xD8); break;
3042         case Xsse_QSUB16U:  XX(0x66); XX(0x0F); XX(0xD9); break;
3043         case Xsse_UNPCKHB:  XX(0x66); XX(0x0F); XX(0x68); break;
3044         case Xsse_UNPCKHW:  XX(0x66); XX(0x0F); XX(0x69); break;
3045         case Xsse_UNPCKHD:  XX(0x66); XX(0x0F); XX(0x6A); break;
3046         case Xsse_UNPCKHQ:  XX(0x66); XX(0x0F); XX(0x6D); break;
3047         case Xsse_UNPCKLB:  XX(0x66); XX(0x0F); XX(0x60); break;
3048         case Xsse_UNPCKLW:  XX(0x66); XX(0x0F); XX(0x61); break;
3049         case Xsse_UNPCKLD:  XX(0x66); XX(0x0F); XX(0x62); break;
3050         case Xsse_UNPCKLQ:  XX(0x66); XX(0x0F); XX(0x6C); break;
3051         default: goto bad;
3052      }
3053      p = doAMode_R(p, fake(vregNo(i->Xin.SseReRg.dst)),
3054                       fake(vregNo(i->Xin.SseReRg.src)) );
3055#     undef XX
3056      goto done;
3057
3058   case Xin_SseCMov:
3059      /* jmp fwds if !condition */
3060      *p++ = toUChar(0x70 + (i->Xin.SseCMov.cond ^ 1));
3061      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3062      ptmp = p;
3063
3064      /* movaps %src, %dst */
3065      *p++ = 0x0F;
3066      *p++ = 0x28;
3067      p = doAMode_R(p, fake(vregNo(i->Xin.SseCMov.dst)),
3068                       fake(vregNo(i->Xin.SseCMov.src)) );
3069
3070      /* Fill in the jump offset. */
3071      *(ptmp-1) = toUChar(p - ptmp);
3072      goto done;
3073
3074   case Xin_SseShuf:
3075      *p++ = 0x66;
3076      *p++ = 0x0F;
3077      *p++ = 0x70;
3078      p = doAMode_R(p, fake(vregNo(i->Xin.SseShuf.dst)),
3079                       fake(vregNo(i->Xin.SseShuf.src)) );
3080      *p++ = (UChar)(i->Xin.SseShuf.order);
3081      goto done;
3082
3083   default:
3084      goto bad;
3085   }
3086
3087  bad:
3088   ppX86Instr(i, mode64);
3089   vpanic("emit_X86Instr");
3090   /*NOTREACHED*/
3091
3092  done:
3093   vassert(p - &buf[0] <= 32);
3094   return p - &buf[0];
3095
3096#  undef fake
3097}
3098
3099/*---------------------------------------------------------------*/
3100/*--- end                                     host_x86_defs.c ---*/
3101/*---------------------------------------------------------------*/
3102