1
2/*---------------------------------------------------------------*/
3/*--- begin                                   host_x86_defs.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2017 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex.h"
38#include "libvex_trc_values.h"
39
40#include "main_util.h"
41#include "host_generic_regs.h"
42#include "host_x86_defs.h"
43
44
45/* --------- Registers. --------- */
46
47const RRegUniverse* getRRegUniverse_X86 ( void )
48{
49   /* The real-register universe is a big constant, so we just want to
50      initialise it once. */
51   static RRegUniverse rRegUniverse_X86;
52   static Bool         rRegUniverse_X86_initted = False;
53
54   /* Handy shorthand, nothing more */
55   RRegUniverse* ru = &rRegUniverse_X86;
56
57   /* This isn't thread-safe.  Sigh. */
58   if (LIKELY(rRegUniverse_X86_initted))
59      return ru;
60
61   RRegUniverse__init(ru);
62
63   /* Add the registers.  The initial segment of this array must be
64      those available for allocation by reg-alloc, and those that
65      follow are not available for allocation. */
66   ru->regs[ru->size++] = hregX86_EAX();
67   ru->regs[ru->size++] = hregX86_EBX();
68   ru->regs[ru->size++] = hregX86_ECX();
69   ru->regs[ru->size++] = hregX86_EDX();
70   ru->regs[ru->size++] = hregX86_ESI();
71   ru->regs[ru->size++] = hregX86_EDI();
72   ru->regs[ru->size++] = hregX86_FAKE0();
73   ru->regs[ru->size++] = hregX86_FAKE1();
74   ru->regs[ru->size++] = hregX86_FAKE2();
75   ru->regs[ru->size++] = hregX86_FAKE3();
76   ru->regs[ru->size++] = hregX86_FAKE4();
77   ru->regs[ru->size++] = hregX86_FAKE5();
78   ru->regs[ru->size++] = hregX86_XMM0();
79   ru->regs[ru->size++] = hregX86_XMM1();
80   ru->regs[ru->size++] = hregX86_XMM2();
81   ru->regs[ru->size++] = hregX86_XMM3();
82   ru->regs[ru->size++] = hregX86_XMM4();
83   ru->regs[ru->size++] = hregX86_XMM5();
84   ru->regs[ru->size++] = hregX86_XMM6();
85   ru->regs[ru->size++] = hregX86_XMM7();
86   ru->allocable = ru->size;
87   /* And other regs, not available to the allocator. */
88   ru->regs[ru->size++] = hregX86_ESP();
89   ru->regs[ru->size++] = hregX86_EBP();
90
91   rRegUniverse_X86_initted = True;
92
93   RRegUniverse__check_is_sane(ru);
94   return ru;
95}
96
97
98void ppHRegX86 ( HReg reg )
99{
100   Int r;
101   static const HChar* ireg32_names[8]
102     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" };
103   /* Be generic for all virtual regs. */
104   if (hregIsVirtual(reg)) {
105      ppHReg(reg);
106      return;
107   }
108   /* But specific for real regs. */
109   switch (hregClass(reg)) {
110      case HRcInt32:
111         r = hregEncoding(reg);
112         vassert(r >= 0 && r < 8);
113         vex_printf("%s", ireg32_names[r]);
114         return;
115      case HRcFlt64:
116         r = hregEncoding(reg);
117         vassert(r >= 0 && r < 6);
118         vex_printf("%%fake%d", r);
119         return;
120      case HRcVec128:
121         r = hregEncoding(reg);
122         vassert(r >= 0 && r < 8);
123         vex_printf("%%xmm%d", r);
124         return;
125      default:
126         vpanic("ppHRegX86");
127   }
128}
129
130
131/* --------- Condition codes, Intel encoding. --------- */
132
133const HChar* showX86CondCode ( X86CondCode cond )
134{
135   switch (cond) {
136      case Xcc_O:      return "o";
137      case Xcc_NO:     return "no";
138      case Xcc_B:      return "b";
139      case Xcc_NB:     return "nb";
140      case Xcc_Z:      return "z";
141      case Xcc_NZ:     return "nz";
142      case Xcc_BE:     return "be";
143      case Xcc_NBE:    return "nbe";
144      case Xcc_S:      return "s";
145      case Xcc_NS:     return "ns";
146      case Xcc_P:      return "p";
147      case Xcc_NP:     return "np";
148      case Xcc_L:      return "l";
149      case Xcc_NL:     return "nl";
150      case Xcc_LE:     return "le";
151      case Xcc_NLE:    return "nle";
152      case Xcc_ALWAYS: return "ALWAYS";
153      default: vpanic("ppX86CondCode");
154   }
155}
156
157
158/* --------- X86AMode: memory address expressions. --------- */
159
160X86AMode* X86AMode_IR ( UInt imm32, HReg reg ) {
161   X86AMode* am = LibVEX_Alloc_inline(sizeof(X86AMode));
162   am->tag = Xam_IR;
163   am->Xam.IR.imm = imm32;
164   am->Xam.IR.reg = reg;
165   return am;
166}
167X86AMode* X86AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
168   X86AMode* am = LibVEX_Alloc_inline(sizeof(X86AMode));
169   am->tag = Xam_IRRS;
170   am->Xam.IRRS.imm = imm32;
171   am->Xam.IRRS.base = base;
172   am->Xam.IRRS.index = indEx;
173   am->Xam.IRRS.shift = shift;
174   vassert(shift >= 0 && shift <= 3);
175   return am;
176}
177
178X86AMode* dopyX86AMode ( X86AMode* am ) {
179   switch (am->tag) {
180      case Xam_IR:
181         return X86AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
182      case Xam_IRRS:
183         return X86AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
184                               am->Xam.IRRS.index, am->Xam.IRRS.shift );
185      default:
186         vpanic("dopyX86AMode");
187   }
188}
189
190void ppX86AMode ( X86AMode* am ) {
191   switch (am->tag) {
192      case Xam_IR:
193         if (am->Xam.IR.imm == 0)
194            vex_printf("(");
195         else
196            vex_printf("0x%x(", am->Xam.IR.imm);
197         ppHRegX86(am->Xam.IR.reg);
198         vex_printf(")");
199         return;
200      case Xam_IRRS:
201         vex_printf("0x%x(", am->Xam.IRRS.imm);
202         ppHRegX86(am->Xam.IRRS.base);
203         vex_printf(",");
204         ppHRegX86(am->Xam.IRRS.index);
205         vex_printf(",%d)", 1 << am->Xam.IRRS.shift);
206         return;
207      default:
208         vpanic("ppX86AMode");
209   }
210}
211
212static void addRegUsage_X86AMode ( HRegUsage* u, X86AMode* am ) {
213   switch (am->tag) {
214      case Xam_IR:
215         addHRegUse(u, HRmRead, am->Xam.IR.reg);
216         return;
217      case Xam_IRRS:
218         addHRegUse(u, HRmRead, am->Xam.IRRS.base);
219         addHRegUse(u, HRmRead, am->Xam.IRRS.index);
220         return;
221      default:
222         vpanic("addRegUsage_X86AMode");
223   }
224}
225
226static void mapRegs_X86AMode ( HRegRemap* m, X86AMode* am ) {
227   switch (am->tag) {
228      case Xam_IR:
229         am->Xam.IR.reg = lookupHRegRemap(m, am->Xam.IR.reg);
230         return;
231      case Xam_IRRS:
232         am->Xam.IRRS.base = lookupHRegRemap(m, am->Xam.IRRS.base);
233         am->Xam.IRRS.index = lookupHRegRemap(m, am->Xam.IRRS.index);
234         return;
235      default:
236         vpanic("mapRegs_X86AMode");
237   }
238}
239
240/* --------- Operand, which can be reg, immediate or memory. --------- */
241
242X86RMI* X86RMI_Imm ( UInt imm32 ) {
243   X86RMI* op         = LibVEX_Alloc_inline(sizeof(X86RMI));
244   op->tag            = Xrmi_Imm;
245   op->Xrmi.Imm.imm32 = imm32;
246   return op;
247}
248X86RMI* X86RMI_Reg ( HReg reg ) {
249   X86RMI* op       = LibVEX_Alloc_inline(sizeof(X86RMI));
250   op->tag          = Xrmi_Reg;
251   op->Xrmi.Reg.reg = reg;
252   return op;
253}
254X86RMI* X86RMI_Mem ( X86AMode* am ) {
255   X86RMI* op      = LibVEX_Alloc_inline(sizeof(X86RMI));
256   op->tag         = Xrmi_Mem;
257   op->Xrmi.Mem.am = am;
258   return op;
259}
260
261void ppX86RMI ( X86RMI* op ) {
262   switch (op->tag) {
263      case Xrmi_Imm:
264         vex_printf("$0x%x", op->Xrmi.Imm.imm32);
265         return;
266      case Xrmi_Reg:
267         ppHRegX86(op->Xrmi.Reg.reg);
268         return;
269      case Xrmi_Mem:
270         ppX86AMode(op->Xrmi.Mem.am);
271         return;
272     default:
273         vpanic("ppX86RMI");
274   }
275}
276
277/* An X86RMI can only be used in a "read" context (what would it mean
278   to write or modify a literal?) and so we enumerate its registers
279   accordingly. */
280static void addRegUsage_X86RMI ( HRegUsage* u, X86RMI* op ) {
281   switch (op->tag) {
282      case Xrmi_Imm:
283         return;
284      case Xrmi_Reg:
285         addHRegUse(u, HRmRead, op->Xrmi.Reg.reg);
286         return;
287      case Xrmi_Mem:
288         addRegUsage_X86AMode(u, op->Xrmi.Mem.am);
289         return;
290      default:
291         vpanic("addRegUsage_X86RMI");
292   }
293}
294
295static void mapRegs_X86RMI ( HRegRemap* m, X86RMI* op ) {
296   switch (op->tag) {
297      case Xrmi_Imm:
298         return;
299      case Xrmi_Reg:
300         op->Xrmi.Reg.reg = lookupHRegRemap(m, op->Xrmi.Reg.reg);
301         return;
302      case Xrmi_Mem:
303         mapRegs_X86AMode(m, op->Xrmi.Mem.am);
304         return;
305      default:
306         vpanic("mapRegs_X86RMI");
307   }
308}
309
310
311/* --------- Operand, which can be reg or immediate only. --------- */
312
313X86RI* X86RI_Imm ( UInt imm32 ) {
314   X86RI* op         = LibVEX_Alloc_inline(sizeof(X86RI));
315   op->tag           = Xri_Imm;
316   op->Xri.Imm.imm32 = imm32;
317   return op;
318}
319X86RI* X86RI_Reg ( HReg reg ) {
320   X86RI* op       = LibVEX_Alloc_inline(sizeof(X86RI));
321   op->tag         = Xri_Reg;
322   op->Xri.Reg.reg = reg;
323   return op;
324}
325
326void ppX86RI ( X86RI* op ) {
327   switch (op->tag) {
328      case Xri_Imm:
329         vex_printf("$0x%x", op->Xri.Imm.imm32);
330         return;
331      case Xri_Reg:
332         ppHRegX86(op->Xri.Reg.reg);
333         return;
334     default:
335         vpanic("ppX86RI");
336   }
337}
338
339/* An X86RI can only be used in a "read" context (what would it mean
340   to write or modify a literal?) and so we enumerate its registers
341   accordingly. */
342static void addRegUsage_X86RI ( HRegUsage* u, X86RI* op ) {
343   switch (op->tag) {
344      case Xri_Imm:
345         return;
346      case Xri_Reg:
347         addHRegUse(u, HRmRead, op->Xri.Reg.reg);
348         return;
349      default:
350         vpanic("addRegUsage_X86RI");
351   }
352}
353
354static void mapRegs_X86RI ( HRegRemap* m, X86RI* op ) {
355   switch (op->tag) {
356      case Xri_Imm:
357         return;
358      case Xri_Reg:
359         op->Xri.Reg.reg = lookupHRegRemap(m, op->Xri.Reg.reg);
360         return;
361      default:
362         vpanic("mapRegs_X86RI");
363   }
364}
365
366
367/* --------- Operand, which can be reg or memory only. --------- */
368
369X86RM* X86RM_Reg ( HReg reg ) {
370   X86RM* op       = LibVEX_Alloc_inline(sizeof(X86RM));
371   op->tag         = Xrm_Reg;
372   op->Xrm.Reg.reg = reg;
373   return op;
374}
375X86RM* X86RM_Mem ( X86AMode* am ) {
376   X86RM* op      = LibVEX_Alloc_inline(sizeof(X86RM));
377   op->tag        = Xrm_Mem;
378   op->Xrm.Mem.am = am;
379   return op;
380}
381
382void ppX86RM ( X86RM* op ) {
383   switch (op->tag) {
384      case Xrm_Mem:
385         ppX86AMode(op->Xrm.Mem.am);
386         return;
387      case Xrm_Reg:
388         ppHRegX86(op->Xrm.Reg.reg);
389         return;
390     default:
391         vpanic("ppX86RM");
392   }
393}
394
395/* Because an X86RM can be both a source or destination operand, we
396   have to supply a mode -- pertaining to the operand as a whole --
397   indicating how it's being used. */
398static void addRegUsage_X86RM ( HRegUsage* u, X86RM* op, HRegMode mode ) {
399   switch (op->tag) {
400      case Xrm_Mem:
401         /* Memory is read, written or modified.  So we just want to
402            know the regs read by the amode. */
403         addRegUsage_X86AMode(u, op->Xrm.Mem.am);
404         return;
405      case Xrm_Reg:
406         /* reg is read, written or modified.  Add it in the
407            appropriate way. */
408         addHRegUse(u, mode, op->Xrm.Reg.reg);
409         return;
410     default:
411         vpanic("addRegUsage_X86RM");
412   }
413}
414
415static void mapRegs_X86RM ( HRegRemap* m, X86RM* op )
416{
417   switch (op->tag) {
418      case Xrm_Mem:
419         mapRegs_X86AMode(m, op->Xrm.Mem.am);
420         return;
421      case Xrm_Reg:
422         op->Xrm.Reg.reg = lookupHRegRemap(m, op->Xrm.Reg.reg);
423         return;
424     default:
425         vpanic("mapRegs_X86RM");
426   }
427}
428
429
430/* --------- Instructions. --------- */
431
432const HChar* showX86UnaryOp ( X86UnaryOp op ) {
433   switch (op) {
434      case Xun_NOT: return "not";
435      case Xun_NEG: return "neg";
436      default: vpanic("showX86UnaryOp");
437   }
438}
439
440const HChar* showX86AluOp ( X86AluOp op ) {
441   switch (op) {
442      case Xalu_MOV:  return "mov";
443      case Xalu_CMP:  return "cmp";
444      case Xalu_ADD:  return "add";
445      case Xalu_SUB:  return "sub";
446      case Xalu_ADC:  return "adc";
447      case Xalu_SBB:  return "sbb";
448      case Xalu_AND:  return "and";
449      case Xalu_OR:   return "or";
450      case Xalu_XOR:  return "xor";
451      case Xalu_MUL:  return "mul";
452      default: vpanic("showX86AluOp");
453   }
454}
455
456const HChar* showX86ShiftOp ( X86ShiftOp op ) {
457   switch (op) {
458      case Xsh_SHL: return "shl";
459      case Xsh_SHR: return "shr";
460      case Xsh_SAR: return "sar";
461      default: vpanic("showX86ShiftOp");
462   }
463}
464
465const HChar* showX86FpOp ( X86FpOp op ) {
466   switch (op) {
467      case Xfp_ADD:    return "add";
468      case Xfp_SUB:    return "sub";
469      case Xfp_MUL:    return "mul";
470      case Xfp_DIV:    return "div";
471      case Xfp_SCALE:  return "scale";
472      case Xfp_ATAN:   return "atan";
473      case Xfp_YL2X:   return "yl2x";
474      case Xfp_YL2XP1: return "yl2xp1";
475      case Xfp_PREM:   return "prem";
476      case Xfp_PREM1:  return "prem1";
477      case Xfp_SQRT:   return "sqrt";
478      case Xfp_ABS:    return "abs";
479      case Xfp_NEG:    return "chs";
480      case Xfp_MOV:    return "mov";
481      case Xfp_SIN:    return "sin";
482      case Xfp_COS:    return "cos";
483      case Xfp_TAN:    return "tan";
484      case Xfp_ROUND:  return "round";
485      case Xfp_2XM1:   return "2xm1";
486      default: vpanic("showX86FpOp");
487   }
488}
489
490const HChar* showX86SseOp ( X86SseOp op ) {
491   switch (op) {
492      case Xsse_MOV:      return "mov(?!)";
493      case Xsse_ADDF:     return "add";
494      case Xsse_SUBF:     return "sub";
495      case Xsse_MULF:     return "mul";
496      case Xsse_DIVF:     return "div";
497      case Xsse_MAXF:     return "max";
498      case Xsse_MINF:     return "min";
499      case Xsse_CMPEQF:   return "cmpFeq";
500      case Xsse_CMPLTF:   return "cmpFlt";
501      case Xsse_CMPLEF:   return "cmpFle";
502      case Xsse_CMPUNF:   return "cmpFun";
503      case Xsse_RCPF:     return "rcp";
504      case Xsse_RSQRTF:   return "rsqrt";
505      case Xsse_SQRTF:    return "sqrt";
506      case Xsse_AND:      return "and";
507      case Xsse_OR:       return "or";
508      case Xsse_XOR:      return "xor";
509      case Xsse_ANDN:     return "andn";
510      case Xsse_ADD8:     return "paddb";
511      case Xsse_ADD16:    return "paddw";
512      case Xsse_ADD32:    return "paddd";
513      case Xsse_ADD64:    return "paddq";
514      case Xsse_QADD8U:   return "paddusb";
515      case Xsse_QADD16U:  return "paddusw";
516      case Xsse_QADD8S:   return "paddsb";
517      case Xsse_QADD16S:  return "paddsw";
518      case Xsse_SUB8:     return "psubb";
519      case Xsse_SUB16:    return "psubw";
520      case Xsse_SUB32:    return "psubd";
521      case Xsse_SUB64:    return "psubq";
522      case Xsse_QSUB8U:   return "psubusb";
523      case Xsse_QSUB16U:  return "psubusw";
524      case Xsse_QSUB8S:   return "psubsb";
525      case Xsse_QSUB16S:  return "psubsw";
526      case Xsse_MUL16:    return "pmullw";
527      case Xsse_MULHI16U: return "pmulhuw";
528      case Xsse_MULHI16S: return "pmulhw";
529      case Xsse_AVG8U:    return "pavgb";
530      case Xsse_AVG16U:   return "pavgw";
531      case Xsse_MAX16S:   return "pmaxw";
532      case Xsse_MAX8U:    return "pmaxub";
533      case Xsse_MIN16S:   return "pminw";
534      case Xsse_MIN8U:    return "pminub";
535      case Xsse_CMPEQ8:   return "pcmpeqb";
536      case Xsse_CMPEQ16:  return "pcmpeqw";
537      case Xsse_CMPEQ32:  return "pcmpeqd";
538      case Xsse_CMPGT8S:  return "pcmpgtb";
539      case Xsse_CMPGT16S: return "pcmpgtw";
540      case Xsse_CMPGT32S: return "pcmpgtd";
541      case Xsse_SHL16:    return "psllw";
542      case Xsse_SHL32:    return "pslld";
543      case Xsse_SHL64:    return "psllq";
544      case Xsse_SHR16:    return "psrlw";
545      case Xsse_SHR32:    return "psrld";
546      case Xsse_SHR64:    return "psrlq";
547      case Xsse_SAR16:    return "psraw";
548      case Xsse_SAR32:    return "psrad";
549      case Xsse_PACKSSD:  return "packssdw";
550      case Xsse_PACKSSW:  return "packsswb";
551      case Xsse_PACKUSW:  return "packuswb";
552      case Xsse_UNPCKHB:  return "punpckhb";
553      case Xsse_UNPCKHW:  return "punpckhw";
554      case Xsse_UNPCKHD:  return "punpckhd";
555      case Xsse_UNPCKHQ:  return "punpckhq";
556      case Xsse_UNPCKLB:  return "punpcklb";
557      case Xsse_UNPCKLW:  return "punpcklw";
558      case Xsse_UNPCKLD:  return "punpckld";
559      case Xsse_UNPCKLQ:  return "punpcklq";
560      default: vpanic("showX86SseOp");
561   }
562}
563
564X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
565   X86Instr* i       = LibVEX_Alloc_inline(sizeof(X86Instr));
566   i->tag            = Xin_Alu32R;
567   i->Xin.Alu32R.op  = op;
568   i->Xin.Alu32R.src = src;
569   i->Xin.Alu32R.dst = dst;
570   return i;
571}
572X86Instr* X86Instr_Alu32M ( X86AluOp op, X86RI* src, X86AMode* dst ) {
573   X86Instr* i       = LibVEX_Alloc_inline(sizeof(X86Instr));
574   i->tag            = Xin_Alu32M;
575   i->Xin.Alu32M.op  = op;
576   i->Xin.Alu32M.src = src;
577   i->Xin.Alu32M.dst = dst;
578   vassert(op != Xalu_MUL);
579   return i;
580}
581X86Instr* X86Instr_Sh32 ( X86ShiftOp op, UInt src, HReg dst ) {
582   X86Instr* i     = LibVEX_Alloc_inline(sizeof(X86Instr));
583   i->tag          = Xin_Sh32;
584   i->Xin.Sh32.op  = op;
585   i->Xin.Sh32.src = src;
586   i->Xin.Sh32.dst = dst;
587   return i;
588}
589X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst ) {
590   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
591   i->tag              = Xin_Test32;
592   i->Xin.Test32.imm32 = imm32;
593   i->Xin.Test32.dst   = dst;
594   return i;
595}
596X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst ) {
597   X86Instr* i        = LibVEX_Alloc_inline(sizeof(X86Instr));
598   i->tag             = Xin_Unary32;
599   i->Xin.Unary32.op  = op;
600   i->Xin.Unary32.dst = dst;
601   return i;
602}
603X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst ) {
604   X86Instr* i        = LibVEX_Alloc_inline(sizeof(X86Instr));
605   i->tag             = Xin_Lea32;
606   i->Xin.Lea32.am    = am;
607   i->Xin.Lea32.dst   = dst;
608   return i;
609}
610X86Instr* X86Instr_MulL ( Bool syned, X86RM* src ) {
611   X86Instr* i        = LibVEX_Alloc_inline(sizeof(X86Instr));
612   i->tag             = Xin_MulL;
613   i->Xin.MulL.syned  = syned;
614   i->Xin.MulL.src    = src;
615   return i;
616}
617X86Instr* X86Instr_Div ( Bool syned, X86RM* src ) {
618   X86Instr* i      = LibVEX_Alloc_inline(sizeof(X86Instr));
619   i->tag           = Xin_Div;
620   i->Xin.Div.syned = syned;
621   i->Xin.Div.src   = src;
622   return i;
623}
624X86Instr* X86Instr_Sh3232  ( X86ShiftOp op, UInt amt, HReg src, HReg dst ) {
625   X86Instr* i       = LibVEX_Alloc_inline(sizeof(X86Instr));
626   i->tag            = Xin_Sh3232;
627   i->Xin.Sh3232.op  = op;
628   i->Xin.Sh3232.amt = amt;
629   i->Xin.Sh3232.src = src;
630   i->Xin.Sh3232.dst = dst;
631   vassert(op == Xsh_SHL || op == Xsh_SHR);
632   return i;
633}
634X86Instr* X86Instr_Push( X86RMI* src ) {
635   X86Instr* i     = LibVEX_Alloc_inline(sizeof(X86Instr));
636   i->tag          = Xin_Push;
637   i->Xin.Push.src = src;
638   return i;
639}
640X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms,
641                          RetLoc rloc ) {
642   X86Instr* i          = LibVEX_Alloc_inline(sizeof(X86Instr));
643   i->tag               = Xin_Call;
644   i->Xin.Call.cond     = cond;
645   i->Xin.Call.target   = target;
646   i->Xin.Call.regparms = regparms;
647   i->Xin.Call.rloc     = rloc;
648   vassert(regparms >= 0 && regparms <= 3);
649   vassert(is_sane_RetLoc(rloc));
650   return i;
651}
652X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP,
653                             X86CondCode cond, Bool toFastEP ) {
654   X86Instr* i             = LibVEX_Alloc_inline(sizeof(X86Instr));
655   i->tag                  = Xin_XDirect;
656   i->Xin.XDirect.dstGA    = dstGA;
657   i->Xin.XDirect.amEIP    = amEIP;
658   i->Xin.XDirect.cond     = cond;
659   i->Xin.XDirect.toFastEP = toFastEP;
660   return i;
661}
662X86Instr* X86Instr_XIndir ( HReg dstGA, X86AMode* amEIP,
663                            X86CondCode cond ) {
664   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
665   i->tag              = Xin_XIndir;
666   i->Xin.XIndir.dstGA = dstGA;
667   i->Xin.XIndir.amEIP = amEIP;
668   i->Xin.XIndir.cond  = cond;
669   return i;
670}
671X86Instr* X86Instr_XAssisted ( HReg dstGA, X86AMode* amEIP,
672                               X86CondCode cond, IRJumpKind jk ) {
673   X86Instr* i            = LibVEX_Alloc_inline(sizeof(X86Instr));
674   i->tag                 = Xin_XAssisted;
675   i->Xin.XAssisted.dstGA = dstGA;
676   i->Xin.XAssisted.amEIP = amEIP;
677   i->Xin.XAssisted.cond  = cond;
678   i->Xin.XAssisted.jk    = jk;
679   return i;
680}
681X86Instr* X86Instr_CMov32  ( X86CondCode cond, X86RM* src, HReg dst ) {
682   X86Instr* i        = LibVEX_Alloc_inline(sizeof(X86Instr));
683   i->tag             = Xin_CMov32;
684   i->Xin.CMov32.cond = cond;
685   i->Xin.CMov32.src  = src;
686   i->Xin.CMov32.dst  = dst;
687   vassert(cond != Xcc_ALWAYS);
688   return i;
689}
690X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
691                            X86AMode* src, HReg dst ) {
692   X86Instr* i           = LibVEX_Alloc_inline(sizeof(X86Instr));
693   i->tag                = Xin_LoadEX;
694   i->Xin.LoadEX.szSmall = szSmall;
695   i->Xin.LoadEX.syned   = syned;
696   i->Xin.LoadEX.src     = src;
697   i->Xin.LoadEX.dst     = dst;
698   vassert(szSmall == 1 || szSmall == 2);
699   return i;
700}
701X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
702   X86Instr* i      = LibVEX_Alloc_inline(sizeof(X86Instr));
703   i->tag           = Xin_Store;
704   i->Xin.Store.sz  = sz;
705   i->Xin.Store.src = src;
706   i->Xin.Store.dst = dst;
707   vassert(sz == 1 || sz == 2);
708   return i;
709}
710X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst ) {
711   X86Instr* i       = LibVEX_Alloc_inline(sizeof(X86Instr));
712   i->tag            = Xin_Set32;
713   i->Xin.Set32.cond = cond;
714   i->Xin.Set32.dst  = dst;
715   return i;
716}
717X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst ) {
718   X86Instr* i          = LibVEX_Alloc_inline(sizeof(X86Instr));
719   i->tag               = Xin_Bsfr32;
720   i->Xin.Bsfr32.isFwds = isFwds;
721   i->Xin.Bsfr32.src    = src;
722   i->Xin.Bsfr32.dst    = dst;
723   return i;
724}
725X86Instr* X86Instr_MFence ( UInt hwcaps ) {
726   X86Instr* i          = LibVEX_Alloc_inline(sizeof(X86Instr));
727   i->tag               = Xin_MFence;
728   i->Xin.MFence.hwcaps = hwcaps;
729   vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_MMXEXT
730                            |VEX_HWCAPS_X86_SSE1
731                            |VEX_HWCAPS_X86_SSE2
732                            |VEX_HWCAPS_X86_SSE3
733                            |VEX_HWCAPS_X86_LZCNT)));
734   return i;
735}
736X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz ) {
737   X86Instr* i      = LibVEX_Alloc_inline(sizeof(X86Instr));
738   i->tag           = Xin_ACAS;
739   i->Xin.ACAS.addr = addr;
740   i->Xin.ACAS.sz   = sz;
741   vassert(sz == 4 || sz == 2 || sz == 1);
742   return i;
743}
744X86Instr* X86Instr_DACAS ( X86AMode* addr ) {
745   X86Instr* i       = LibVEX_Alloc_inline(sizeof(X86Instr));
746   i->tag            = Xin_DACAS;
747   i->Xin.DACAS.addr = addr;
748   return i;
749}
750
751X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
752   X86Instr* i        = LibVEX_Alloc_inline(sizeof(X86Instr));
753   i->tag             = Xin_FpUnary;
754   i->Xin.FpUnary.op  = op;
755   i->Xin.FpUnary.src = src;
756   i->Xin.FpUnary.dst = dst;
757   return i;
758}
759X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
760   X86Instr* i          = LibVEX_Alloc_inline(sizeof(X86Instr));
761   i->tag               = Xin_FpBinary;
762   i->Xin.FpBinary.op   = op;
763   i->Xin.FpBinary.srcL = srcL;
764   i->Xin.FpBinary.srcR = srcR;
765   i->Xin.FpBinary.dst  = dst;
766   return i;
767}
768X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
769   X86Instr* i          = LibVEX_Alloc_inline(sizeof(X86Instr));
770   i->tag               = Xin_FpLdSt;
771   i->Xin.FpLdSt.isLoad = isLoad;
772   i->Xin.FpLdSt.sz     = sz;
773   i->Xin.FpLdSt.reg    = reg;
774   i->Xin.FpLdSt.addr   = addr;
775   vassert(sz == 4 || sz == 8 || sz == 10);
776   return i;
777}
778X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
779                             HReg reg, X86AMode* addr ) {
780   X86Instr* i           = LibVEX_Alloc_inline(sizeof(X86Instr));
781   i->tag                = Xin_FpLdStI;
782   i->Xin.FpLdStI.isLoad = isLoad;
783   i->Xin.FpLdStI.sz     = sz;
784   i->Xin.FpLdStI.reg    = reg;
785   i->Xin.FpLdStI.addr   = addr;
786   vassert(sz == 2 || sz == 4 || sz == 8);
787   return i;
788}
789X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst ) {
790   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
791   i->tag              = Xin_Fp64to32;
792   i->Xin.Fp64to32.src = src;
793   i->Xin.Fp64to32.dst = dst;
794   return i;
795}
796X86Instr* X86Instr_FpCMov ( X86CondCode cond, HReg src, HReg dst ) {
797   X86Instr* i        = LibVEX_Alloc_inline(sizeof(X86Instr));
798   i->tag             = Xin_FpCMov;
799   i->Xin.FpCMov.cond = cond;
800   i->Xin.FpCMov.src  = src;
801   i->Xin.FpCMov.dst  = dst;
802   vassert(cond != Xcc_ALWAYS);
803   return i;
804}
805X86Instr* X86Instr_FpLdCW ( X86AMode* addr ) {
806   X86Instr* i          = LibVEX_Alloc_inline(sizeof(X86Instr));
807   i->tag               = Xin_FpLdCW;
808   i->Xin.FpLdCW.addr   = addr;
809   return i;
810}
811X86Instr* X86Instr_FpStSW_AX ( void ) {
812   X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr));
813   i->tag      = Xin_FpStSW_AX;
814   return i;
815}
816X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst ) {
817   X86Instr* i       = LibVEX_Alloc_inline(sizeof(X86Instr));
818   i->tag            = Xin_FpCmp;
819   i->Xin.FpCmp.srcL = srcL;
820   i->Xin.FpCmp.srcR = srcR;
821   i->Xin.FpCmp.dst  = dst;
822   return i;
823}
824X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
825   X86Instr* i            = LibVEX_Alloc_inline(sizeof(X86Instr));
826   i->tag                 = Xin_SseConst;
827   i->Xin.SseConst.con    = con;
828   i->Xin.SseConst.dst    = dst;
829   vassert(hregClass(dst) == HRcVec128);
830   return i;
831}
832X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg reg, X86AMode* addr ) {
833   X86Instr* i           = LibVEX_Alloc_inline(sizeof(X86Instr));
834   i->tag                = Xin_SseLdSt;
835   i->Xin.SseLdSt.isLoad = isLoad;
836   i->Xin.SseLdSt.reg    = reg;
837   i->Xin.SseLdSt.addr   = addr;
838   return i;
839}
840X86Instr* X86Instr_SseLdzLO  ( Int sz, HReg reg, X86AMode* addr )
841{
842   X86Instr* i           = LibVEX_Alloc_inline(sizeof(X86Instr));
843   i->tag                = Xin_SseLdzLO;
844   i->Xin.SseLdzLO.sz    = toUChar(sz);
845   i->Xin.SseLdzLO.reg   = reg;
846   i->Xin.SseLdzLO.addr  = addr;
847   vassert(sz == 4 || sz == 8);
848   return i;
849}
850X86Instr* X86Instr_Sse32Fx4 ( X86SseOp op, HReg src, HReg dst ) {
851   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
852   i->tag              = Xin_Sse32Fx4;
853   i->Xin.Sse32Fx4.op  = op;
854   i->Xin.Sse32Fx4.src = src;
855   i->Xin.Sse32Fx4.dst = dst;
856   vassert(op != Xsse_MOV);
857   return i;
858}
859X86Instr* X86Instr_Sse32FLo ( X86SseOp op, HReg src, HReg dst ) {
860   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
861   i->tag              = Xin_Sse32FLo;
862   i->Xin.Sse32FLo.op  = op;
863   i->Xin.Sse32FLo.src = src;
864   i->Xin.Sse32FLo.dst = dst;
865   vassert(op != Xsse_MOV);
866   return i;
867}
868X86Instr* X86Instr_Sse64Fx2 ( X86SseOp op, HReg src, HReg dst ) {
869   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
870   i->tag              = Xin_Sse64Fx2;
871   i->Xin.Sse64Fx2.op  = op;
872   i->Xin.Sse64Fx2.src = src;
873   i->Xin.Sse64Fx2.dst = dst;
874   vassert(op != Xsse_MOV);
875   return i;
876}
877X86Instr* X86Instr_Sse64FLo ( X86SseOp op, HReg src, HReg dst ) {
878   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
879   i->tag              = Xin_Sse64FLo;
880   i->Xin.Sse64FLo.op  = op;
881   i->Xin.Sse64FLo.src = src;
882   i->Xin.Sse64FLo.dst = dst;
883   vassert(op != Xsse_MOV);
884   return i;
885}
886X86Instr* X86Instr_SseReRg ( X86SseOp op, HReg re, HReg rg ) {
887   X86Instr* i        = LibVEX_Alloc_inline(sizeof(X86Instr));
888   i->tag             = Xin_SseReRg;
889   i->Xin.SseReRg.op  = op;
890   i->Xin.SseReRg.src = re;
891   i->Xin.SseReRg.dst = rg;
892   return i;
893}
894X86Instr* X86Instr_SseCMov ( X86CondCode cond, HReg src, HReg dst ) {
895   X86Instr* i         = LibVEX_Alloc_inline(sizeof(X86Instr));
896   i->tag              = Xin_SseCMov;
897   i->Xin.SseCMov.cond = cond;
898   i->Xin.SseCMov.src  = src;
899   i->Xin.SseCMov.dst  = dst;
900   vassert(cond != Xcc_ALWAYS);
901   return i;
902}
903X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst ) {
904   X86Instr* i          = LibVEX_Alloc_inline(sizeof(X86Instr));
905   i->tag               = Xin_SseShuf;
906   i->Xin.SseShuf.order = order;
907   i->Xin.SseShuf.src   = src;
908   i->Xin.SseShuf.dst   = dst;
909   vassert(order >= 0 && order <= 0xFF);
910   return i;
911}
912X86Instr* X86Instr_EvCheck ( X86AMode* amCounter,
913                             X86AMode* amFailAddr ) {
914   X86Instr* i               = LibVEX_Alloc_inline(sizeof(X86Instr));
915   i->tag                    = Xin_EvCheck;
916   i->Xin.EvCheck.amCounter  = amCounter;
917   i->Xin.EvCheck.amFailAddr = amFailAddr;
918   return i;
919}
920X86Instr* X86Instr_ProfInc ( void ) {
921   X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr));
922   i->tag      = Xin_ProfInc;
923   return i;
924}
925
926void ppX86Instr ( const X86Instr* i, Bool mode64 ) {
927   vassert(mode64 == False);
928   switch (i->tag) {
929      case Xin_Alu32R:
930         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32R.op));
931         ppX86RMI(i->Xin.Alu32R.src);
932         vex_printf(",");
933         ppHRegX86(i->Xin.Alu32R.dst);
934         return;
935      case Xin_Alu32M:
936         vex_printf("%sl ", showX86AluOp(i->Xin.Alu32M.op));
937         ppX86RI(i->Xin.Alu32M.src);
938         vex_printf(",");
939         ppX86AMode(i->Xin.Alu32M.dst);
940         return;
941      case Xin_Sh32:
942         vex_printf("%sl ", showX86ShiftOp(i->Xin.Sh32.op));
943         if (i->Xin.Sh32.src == 0)
944           vex_printf("%%cl,");
945         else
946            vex_printf("$%d,", (Int)i->Xin.Sh32.src);
947         ppHRegX86(i->Xin.Sh32.dst);
948         return;
949      case Xin_Test32:
950         vex_printf("testl $%d,", (Int)i->Xin.Test32.imm32);
951         ppX86RM(i->Xin.Test32.dst);
952         return;
953      case Xin_Unary32:
954         vex_printf("%sl ", showX86UnaryOp(i->Xin.Unary32.op));
955         ppHRegX86(i->Xin.Unary32.dst);
956         return;
957      case Xin_Lea32:
958         vex_printf("leal ");
959         ppX86AMode(i->Xin.Lea32.am);
960         vex_printf(",");
961         ppHRegX86(i->Xin.Lea32.dst);
962         return;
963      case Xin_MulL:
964         vex_printf("%cmull ", i->Xin.MulL.syned ? 's' : 'u');
965         ppX86RM(i->Xin.MulL.src);
966         return;
967      case Xin_Div:
968         vex_printf("%cdivl ", i->Xin.Div.syned ? 's' : 'u');
969         ppX86RM(i->Xin.Div.src);
970         return;
971      case Xin_Sh3232:
972         vex_printf("%sdl ", showX86ShiftOp(i->Xin.Sh3232.op));
973         if (i->Xin.Sh3232.amt == 0)
974           vex_printf(" %%cl,");
975         else
976            vex_printf(" $%d,", (Int)i->Xin.Sh3232.amt);
977         ppHRegX86(i->Xin.Sh3232.src);
978         vex_printf(",");
979         ppHRegX86(i->Xin.Sh3232.dst);
980         return;
981      case Xin_Push:
982         vex_printf("pushl ");
983         ppX86RMI(i->Xin.Push.src);
984         return;
985      case Xin_Call:
986         vex_printf("call%s[%d,",
987                    i->Xin.Call.cond==Xcc_ALWAYS
988                       ? "" : showX86CondCode(i->Xin.Call.cond),
989                    i->Xin.Call.regparms);
990         ppRetLoc(i->Xin.Call.rloc);
991         vex_printf("] 0x%x", i->Xin.Call.target);
992         break;
993      case Xin_XDirect:
994         vex_printf("(xDirect) ");
995         vex_printf("if (%%eflags.%s) { ",
996                    showX86CondCode(i->Xin.XDirect.cond));
997         vex_printf("movl $0x%x,", i->Xin.XDirect.dstGA);
998         ppX86AMode(i->Xin.XDirect.amEIP);
999         vex_printf("; ");
1000         vex_printf("movl $disp_cp_chain_me_to_%sEP,%%edx; call *%%edx }",
1001                    i->Xin.XDirect.toFastEP ? "fast" : "slow");
1002         return;
1003      case Xin_XIndir:
1004         vex_printf("(xIndir) ");
1005         vex_printf("if (%%eflags.%s) { movl ",
1006                    showX86CondCode(i->Xin.XIndir.cond));
1007         ppHRegX86(i->Xin.XIndir.dstGA);
1008         vex_printf(",");
1009         ppX86AMode(i->Xin.XIndir.amEIP);
1010         vex_printf("; movl $disp_indir,%%edx; jmp *%%edx }");
1011         return;
1012      case Xin_XAssisted:
1013         vex_printf("(xAssisted) ");
1014         vex_printf("if (%%eflags.%s) { ",
1015                    showX86CondCode(i->Xin.XAssisted.cond));
1016         vex_printf("movl ");
1017         ppHRegX86(i->Xin.XAssisted.dstGA);
1018         vex_printf(",");
1019         ppX86AMode(i->Xin.XAssisted.amEIP);
1020         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%ebp",
1021                    (Int)i->Xin.XAssisted.jk);
1022         vex_printf("; movl $disp_assisted,%%edx; jmp *%%edx }");
1023         return;
1024      case Xin_CMov32:
1025         vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
1026         ppX86RM(i->Xin.CMov32.src);
1027         vex_printf(",");
1028         ppHRegX86(i->Xin.CMov32.dst);
1029         return;
1030      case Xin_LoadEX:
1031         vex_printf("mov%c%cl ",
1032                    i->Xin.LoadEX.syned ? 's' : 'z',
1033                    i->Xin.LoadEX.szSmall==1 ? 'b' : 'w');
1034         ppX86AMode(i->Xin.LoadEX.src);
1035         vex_printf(",");
1036         ppHRegX86(i->Xin.LoadEX.dst);
1037         return;
1038      case Xin_Store:
1039         vex_printf("mov%c ", i->Xin.Store.sz==1 ? 'b' : 'w');
1040         ppHRegX86(i->Xin.Store.src);
1041         vex_printf(",");
1042         ppX86AMode(i->Xin.Store.dst);
1043         return;
1044      case Xin_Set32:
1045         vex_printf("setl%s ", showX86CondCode(i->Xin.Set32.cond));
1046         ppHRegX86(i->Xin.Set32.dst);
1047         return;
1048      case Xin_Bsfr32:
1049         vex_printf("bs%cl ", i->Xin.Bsfr32.isFwds ? 'f' : 'r');
1050         ppHRegX86(i->Xin.Bsfr32.src);
1051         vex_printf(",");
1052         ppHRegX86(i->Xin.Bsfr32.dst);
1053         return;
1054      case Xin_MFence:
1055         vex_printf("mfence(%s)",
1056                    LibVEX_ppVexHwCaps(VexArchX86,i->Xin.MFence.hwcaps));
1057         return;
1058      case Xin_ACAS:
1059         vex_printf("lock cmpxchg%c ",
1060                     i->Xin.ACAS.sz==1 ? 'b'
1061                                       : i->Xin.ACAS.sz==2 ? 'w' : 'l');
1062         vex_printf("{%%eax->%%ebx},");
1063         ppX86AMode(i->Xin.ACAS.addr);
1064         return;
1065      case Xin_DACAS:
1066         vex_printf("lock cmpxchg8b {%%edx:%%eax->%%ecx:%%ebx},");
1067         ppX86AMode(i->Xin.DACAS.addr);
1068         return;
1069      case Xin_FpUnary:
1070         vex_printf("g%sD ", showX86FpOp(i->Xin.FpUnary.op));
1071         ppHRegX86(i->Xin.FpUnary.src);
1072         vex_printf(",");
1073         ppHRegX86(i->Xin.FpUnary.dst);
1074         break;
1075      case Xin_FpBinary:
1076         vex_printf("g%sD ", showX86FpOp(i->Xin.FpBinary.op));
1077         ppHRegX86(i->Xin.FpBinary.srcL);
1078         vex_printf(",");
1079         ppHRegX86(i->Xin.FpBinary.srcR);
1080         vex_printf(",");
1081         ppHRegX86(i->Xin.FpBinary.dst);
1082         break;
1083      case Xin_FpLdSt:
1084         if (i->Xin.FpLdSt.isLoad) {
1085            vex_printf("gld%c " ,  i->Xin.FpLdSt.sz==10 ? 'T'
1086                                   : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1087            ppX86AMode(i->Xin.FpLdSt.addr);
1088            vex_printf(", ");
1089            ppHRegX86(i->Xin.FpLdSt.reg);
1090         } else {
1091            vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
1092                                  : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
1093            ppHRegX86(i->Xin.FpLdSt.reg);
1094            vex_printf(", ");
1095            ppX86AMode(i->Xin.FpLdSt.addr);
1096         }
1097         return;
1098      case Xin_FpLdStI:
1099         if (i->Xin.FpLdStI.isLoad) {
1100            vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1101                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1102            ppX86AMode(i->Xin.FpLdStI.addr);
1103            vex_printf(", ");
1104            ppHRegX86(i->Xin.FpLdStI.reg);
1105         } else {
1106            vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
1107                                  i->Xin.FpLdStI.sz==4 ? "l" : "w");
1108            ppHRegX86(i->Xin.FpLdStI.reg);
1109            vex_printf(", ");
1110            ppX86AMode(i->Xin.FpLdStI.addr);
1111         }
1112         return;
1113      case Xin_Fp64to32:
1114         vex_printf("gdtof ");
1115         ppHRegX86(i->Xin.Fp64to32.src);
1116         vex_printf(",");
1117         ppHRegX86(i->Xin.Fp64to32.dst);
1118         return;
1119      case Xin_FpCMov:
1120         vex_printf("gcmov%s ", showX86CondCode(i->Xin.FpCMov.cond));
1121         ppHRegX86(i->Xin.FpCMov.src);
1122         vex_printf(",");
1123         ppHRegX86(i->Xin.FpCMov.dst);
1124         return;
1125      case Xin_FpLdCW:
1126         vex_printf("fldcw ");
1127         ppX86AMode(i->Xin.FpLdCW.addr);
1128         return;
1129      case Xin_FpStSW_AX:
1130         vex_printf("fstsw %%ax");
1131         return;
1132      case Xin_FpCmp:
1133         vex_printf("gcmp ");
1134         ppHRegX86(i->Xin.FpCmp.srcL);
1135         vex_printf(",");
1136         ppHRegX86(i->Xin.FpCmp.srcR);
1137         vex_printf(",");
1138         ppHRegX86(i->Xin.FpCmp.dst);
1139         break;
1140      case Xin_SseConst:
1141         vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
1142         ppHRegX86(i->Xin.SseConst.dst);
1143         break;
1144      case Xin_SseLdSt:
1145         vex_printf("movups ");
1146         if (i->Xin.SseLdSt.isLoad) {
1147            ppX86AMode(i->Xin.SseLdSt.addr);
1148            vex_printf(",");
1149            ppHRegX86(i->Xin.SseLdSt.reg);
1150         } else {
1151            ppHRegX86(i->Xin.SseLdSt.reg);
1152            vex_printf(",");
1153            ppX86AMode(i->Xin.SseLdSt.addr);
1154         }
1155         return;
1156      case Xin_SseLdzLO:
1157         vex_printf("movs%s ", i->Xin.SseLdzLO.sz==4 ? "s" : "d");
1158         ppX86AMode(i->Xin.SseLdzLO.addr);
1159         vex_printf(",");
1160         ppHRegX86(i->Xin.SseLdzLO.reg);
1161         return;
1162      case Xin_Sse32Fx4:
1163         vex_printf("%sps ", showX86SseOp(i->Xin.Sse32Fx4.op));
1164         ppHRegX86(i->Xin.Sse32Fx4.src);
1165         vex_printf(",");
1166         ppHRegX86(i->Xin.Sse32Fx4.dst);
1167         return;
1168      case Xin_Sse32FLo:
1169         vex_printf("%sss ", showX86SseOp(i->Xin.Sse32FLo.op));
1170         ppHRegX86(i->Xin.Sse32FLo.src);
1171         vex_printf(",");
1172         ppHRegX86(i->Xin.Sse32FLo.dst);
1173         return;
1174      case Xin_Sse64Fx2:
1175         vex_printf("%spd ", showX86SseOp(i->Xin.Sse64Fx2.op));
1176         ppHRegX86(i->Xin.Sse64Fx2.src);
1177         vex_printf(",");
1178         ppHRegX86(i->Xin.Sse64Fx2.dst);
1179         return;
1180      case Xin_Sse64FLo:
1181         vex_printf("%ssd ", showX86SseOp(i->Xin.Sse64FLo.op));
1182         ppHRegX86(i->Xin.Sse64FLo.src);
1183         vex_printf(",");
1184         ppHRegX86(i->Xin.Sse64FLo.dst);
1185         return;
1186      case Xin_SseReRg:
1187         vex_printf("%s ", showX86SseOp(i->Xin.SseReRg.op));
1188         ppHRegX86(i->Xin.SseReRg.src);
1189         vex_printf(",");
1190         ppHRegX86(i->Xin.SseReRg.dst);
1191         return;
1192      case Xin_SseCMov:
1193         vex_printf("cmov%s ", showX86CondCode(i->Xin.SseCMov.cond));
1194         ppHRegX86(i->Xin.SseCMov.src);
1195         vex_printf(",");
1196         ppHRegX86(i->Xin.SseCMov.dst);
1197         return;
1198      case Xin_SseShuf:
1199         vex_printf("pshufd $0x%x,", (UInt)i->Xin.SseShuf.order);
1200         ppHRegX86(i->Xin.SseShuf.src);
1201         vex_printf(",");
1202         ppHRegX86(i->Xin.SseShuf.dst);
1203         return;
1204      case Xin_EvCheck:
1205         vex_printf("(evCheck) decl ");
1206         ppX86AMode(i->Xin.EvCheck.amCounter);
1207         vex_printf("; jns nofail; jmp *");
1208         ppX86AMode(i->Xin.EvCheck.amFailAddr);
1209         vex_printf("; nofail:");
1210         return;
1211      case Xin_ProfInc:
1212         vex_printf("(profInc) addl $1,NotKnownYet; "
1213                    "adcl $0,NotKnownYet+4");
1214         return;
1215      default:
1216         vpanic("ppX86Instr");
1217   }
1218}
1219
1220/* --------- Helpers for register allocation. --------- */
1221
1222void getRegUsage_X86Instr (HRegUsage* u, const X86Instr* i, Bool mode64)
1223{
1224   Bool unary;
1225   vassert(mode64 == False);
1226   initHRegUsage(u);
1227   switch (i->tag) {
1228      case Xin_Alu32R:
1229         addRegUsage_X86RMI(u, i->Xin.Alu32R.src);
1230         if (i->Xin.Alu32R.op == Xalu_MOV) {
1231            addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst);
1232            return;
1233         }
1234         if (i->Xin.Alu32R.op == Xalu_CMP) {
1235            addHRegUse(u, HRmRead, i->Xin.Alu32R.dst);
1236            return;
1237         }
1238         addHRegUse(u, HRmModify, i->Xin.Alu32R.dst);
1239         return;
1240      case Xin_Alu32M:
1241         addRegUsage_X86RI(u, i->Xin.Alu32M.src);
1242         addRegUsage_X86AMode(u, i->Xin.Alu32M.dst);
1243         return;
1244      case Xin_Sh32:
1245         addHRegUse(u, HRmModify, i->Xin.Sh32.dst);
1246         if (i->Xin.Sh32.src == 0)
1247            addHRegUse(u, HRmRead, hregX86_ECX());
1248         return;
1249      case Xin_Test32:
1250         addRegUsage_X86RM(u, i->Xin.Test32.dst, HRmRead);
1251         return;
1252      case Xin_Unary32:
1253         addHRegUse(u, HRmModify, i->Xin.Unary32.dst);
1254         return;
1255      case Xin_Lea32:
1256         addRegUsage_X86AMode(u, i->Xin.Lea32.am);
1257         addHRegUse(u, HRmWrite, i->Xin.Lea32.dst);
1258         return;
1259      case Xin_MulL:
1260         addRegUsage_X86RM(u, i->Xin.MulL.src, HRmRead);
1261         addHRegUse(u, HRmModify, hregX86_EAX());
1262         addHRegUse(u, HRmWrite, hregX86_EDX());
1263         return;
1264      case Xin_Div:
1265         addRegUsage_X86RM(u, i->Xin.Div.src, HRmRead);
1266         addHRegUse(u, HRmModify, hregX86_EAX());
1267         addHRegUse(u, HRmModify, hregX86_EDX());
1268         return;
1269      case Xin_Sh3232:
1270         addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
1271         addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
1272         if (i->Xin.Sh3232.amt == 0)
1273            addHRegUse(u, HRmRead, hregX86_ECX());
1274         return;
1275      case Xin_Push:
1276         addRegUsage_X86RMI(u, i->Xin.Push.src);
1277         addHRegUse(u, HRmModify, hregX86_ESP());
1278         return;
1279      case Xin_Call:
1280         /* This is a bit subtle. */
1281         /* First off, claim it trashes all the caller-saved regs
1282            which fall within the register allocator's jurisdiction.
1283            These I believe to be %eax %ecx %edx and all the xmm
1284            registers. */
1285         addHRegUse(u, HRmWrite, hregX86_EAX());
1286         addHRegUse(u, HRmWrite, hregX86_ECX());
1287         addHRegUse(u, HRmWrite, hregX86_EDX());
1288         addHRegUse(u, HRmWrite, hregX86_XMM0());
1289         addHRegUse(u, HRmWrite, hregX86_XMM1());
1290         addHRegUse(u, HRmWrite, hregX86_XMM2());
1291         addHRegUse(u, HRmWrite, hregX86_XMM3());
1292         addHRegUse(u, HRmWrite, hregX86_XMM4());
1293         addHRegUse(u, HRmWrite, hregX86_XMM5());
1294         addHRegUse(u, HRmWrite, hregX86_XMM6());
1295         addHRegUse(u, HRmWrite, hregX86_XMM7());
1296         /* Now we have to state any parameter-carrying registers
1297            which might be read.  This depends on the regparmness. */
1298         switch (i->Xin.Call.regparms) {
1299            case 3: addHRegUse(u, HRmRead, hregX86_ECX()); /*fallthru*/
1300            case 2: addHRegUse(u, HRmRead, hregX86_EDX()); /*fallthru*/
1301            case 1: addHRegUse(u, HRmRead, hregX86_EAX()); break;
1302            case 0: break;
1303            default: vpanic("getRegUsage_X86Instr:Call:regparms");
1304         }
1305         /* Finally, there is the issue that the insn trashes a
1306            register because the literal target address has to be
1307            loaded into a register.  Fortunately, for the 0/1/2
1308            regparm case, we can use EAX, EDX and ECX respectively, so
1309            this does not cause any further damage.  For the 3-regparm
1310            case, we'll have to choose another register arbitrarily --
1311            since A, D and C are used for parameters -- and so we might
1312            as well choose EDI. */
1313         if (i->Xin.Call.regparms == 3)
1314            addHRegUse(u, HRmWrite, hregX86_EDI());
1315         /* Upshot of this is that the assembler really must observe
1316            the here-stated convention of which register to use as an
1317            address temporary, depending on the regparmness: 0==EAX,
1318            1==EDX, 2==ECX, 3==EDI. */
1319         return;
1320      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
1321         conditionally exit the block.  Hence we only need to list (1)
1322         the registers that they read, and (2) the registers that they
1323         write in the case where the block is not exited.  (2) is
1324         empty, hence only (1) is relevant here. */
1325      case Xin_XDirect:
1326         addRegUsage_X86AMode(u, i->Xin.XDirect.amEIP);
1327         return;
1328      case Xin_XIndir:
1329         addHRegUse(u, HRmRead, i->Xin.XIndir.dstGA);
1330         addRegUsage_X86AMode(u, i->Xin.XIndir.amEIP);
1331         return;
1332      case Xin_XAssisted:
1333         addHRegUse(u, HRmRead, i->Xin.XAssisted.dstGA);
1334         addRegUsage_X86AMode(u, i->Xin.XAssisted.amEIP);
1335         return;
1336      case Xin_CMov32:
1337         addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
1338         addHRegUse(u, HRmModify, i->Xin.CMov32.dst);
1339         return;
1340      case Xin_LoadEX:
1341         addRegUsage_X86AMode(u, i->Xin.LoadEX.src);
1342         addHRegUse(u, HRmWrite, i->Xin.LoadEX.dst);
1343         return;
1344      case Xin_Store:
1345         addHRegUse(u, HRmRead, i->Xin.Store.src);
1346         addRegUsage_X86AMode(u, i->Xin.Store.dst);
1347         return;
1348      case Xin_Set32:
1349         addHRegUse(u, HRmWrite, i->Xin.Set32.dst);
1350         return;
1351      case Xin_Bsfr32:
1352         addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
1353         addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
1354         return;
1355      case Xin_MFence:
1356         return;
1357      case Xin_ACAS:
1358         addRegUsage_X86AMode(u, i->Xin.ACAS.addr);
1359         addHRegUse(u, HRmRead, hregX86_EBX());
1360         addHRegUse(u, HRmModify, hregX86_EAX());
1361         return;
1362      case Xin_DACAS:
1363         addRegUsage_X86AMode(u, i->Xin.DACAS.addr);
1364         addHRegUse(u, HRmRead, hregX86_ECX());
1365         addHRegUse(u, HRmRead, hregX86_EBX());
1366         addHRegUse(u, HRmModify, hregX86_EDX());
1367         addHRegUse(u, HRmModify, hregX86_EAX());
1368         return;
1369      case Xin_FpUnary:
1370         addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
1371         addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
1372         return;
1373      case Xin_FpBinary:
1374         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
1375         addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
1376         addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
1377         return;
1378      case Xin_FpLdSt:
1379         addRegUsage_X86AMode(u, i->Xin.FpLdSt.addr);
1380         addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
1381                       i->Xin.FpLdSt.reg);
1382         return;
1383      case Xin_FpLdStI:
1384         addRegUsage_X86AMode(u, i->Xin.FpLdStI.addr);
1385         addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
1386                       i->Xin.FpLdStI.reg);
1387         return;
1388      case Xin_Fp64to32:
1389         addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
1390         addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
1391         return;
1392      case Xin_FpCMov:
1393         addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
1394         addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
1395         return;
1396      case Xin_FpLdCW:
1397         addRegUsage_X86AMode(u, i->Xin.FpLdCW.addr);
1398         return;
1399      case Xin_FpStSW_AX:
1400         addHRegUse(u, HRmWrite, hregX86_EAX());
1401         return;
1402      case Xin_FpCmp:
1403         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcL);
1404         addHRegUse(u, HRmRead, i->Xin.FpCmp.srcR);
1405         addHRegUse(u, HRmWrite, i->Xin.FpCmp.dst);
1406         addHRegUse(u, HRmWrite, hregX86_EAX());
1407         return;
1408      case Xin_SseLdSt:
1409         addRegUsage_X86AMode(u, i->Xin.SseLdSt.addr);
1410         addHRegUse(u, i->Xin.SseLdSt.isLoad ? HRmWrite : HRmRead,
1411                       i->Xin.SseLdSt.reg);
1412         return;
1413      case Xin_SseLdzLO:
1414         addRegUsage_X86AMode(u, i->Xin.SseLdzLO.addr);
1415         addHRegUse(u, HRmWrite, i->Xin.SseLdzLO.reg);
1416         return;
1417      case Xin_SseConst:
1418         addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
1419         return;
1420      case Xin_Sse32Fx4:
1421         vassert(i->Xin.Sse32Fx4.op != Xsse_MOV);
1422         unary = toBool( i->Xin.Sse32Fx4.op == Xsse_RCPF
1423                         || i->Xin.Sse32Fx4.op == Xsse_RSQRTF
1424                         || i->Xin.Sse32Fx4.op == Xsse_SQRTF );
1425         addHRegUse(u, HRmRead, i->Xin.Sse32Fx4.src);
1426         addHRegUse(u, unary ? HRmWrite : HRmModify,
1427                       i->Xin.Sse32Fx4.dst);
1428         return;
1429      case Xin_Sse32FLo:
1430         vassert(i->Xin.Sse32FLo.op != Xsse_MOV);
1431         unary = toBool( i->Xin.Sse32FLo.op == Xsse_RCPF
1432                         || i->Xin.Sse32FLo.op == Xsse_RSQRTF
1433                         || i->Xin.Sse32FLo.op == Xsse_SQRTF );
1434         addHRegUse(u, HRmRead, i->Xin.Sse32FLo.src);
1435         addHRegUse(u, unary ? HRmWrite : HRmModify,
1436                       i->Xin.Sse32FLo.dst);
1437         return;
1438      case Xin_Sse64Fx2:
1439         vassert(i->Xin.Sse64Fx2.op != Xsse_MOV);
1440         unary = toBool( i->Xin.Sse64Fx2.op == Xsse_RCPF
1441                         || i->Xin.Sse64Fx2.op == Xsse_RSQRTF
1442                         || i->Xin.Sse64Fx2.op == Xsse_SQRTF );
1443         addHRegUse(u, HRmRead, i->Xin.Sse64Fx2.src);
1444         addHRegUse(u, unary ? HRmWrite : HRmModify,
1445                       i->Xin.Sse64Fx2.dst);
1446         return;
1447      case Xin_Sse64FLo:
1448         vassert(i->Xin.Sse64FLo.op != Xsse_MOV);
1449         unary = toBool( i->Xin.Sse64FLo.op == Xsse_RCPF
1450                         || i->Xin.Sse64FLo.op == Xsse_RSQRTF
1451                         || i->Xin.Sse64FLo.op == Xsse_SQRTF );
1452         addHRegUse(u, HRmRead, i->Xin.Sse64FLo.src);
1453         addHRegUse(u, unary ? HRmWrite : HRmModify,
1454                       i->Xin.Sse64FLo.dst);
1455         return;
1456      case Xin_SseReRg:
1457         if (i->Xin.SseReRg.op == Xsse_XOR
1458             && sameHReg(i->Xin.SseReRg.src, i->Xin.SseReRg.dst)) {
1459            /* reg-alloc needs to understand 'xor r,r' as a write of r */
1460            /* (as opposed to a rite of passage :-) */
1461            addHRegUse(u, HRmWrite, i->Xin.SseReRg.dst);
1462         } else {
1463            addHRegUse(u, HRmRead, i->Xin.SseReRg.src);
1464            addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV
1465                             ? HRmWrite : HRmModify,
1466                          i->Xin.SseReRg.dst);
1467         }
1468         return;
1469      case Xin_SseCMov:
1470         addHRegUse(u, HRmRead,   i->Xin.SseCMov.src);
1471         addHRegUse(u, HRmModify, i->Xin.SseCMov.dst);
1472         return;
1473      case Xin_SseShuf:
1474         addHRegUse(u, HRmRead,  i->Xin.SseShuf.src);
1475         addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
1476         return;
1477      case Xin_EvCheck:
1478         /* We expect both amodes only to mention %ebp, so this is in
1479            fact pointless, since %ebp isn't allocatable, but anyway.. */
1480         addRegUsage_X86AMode(u, i->Xin.EvCheck.amCounter);
1481         addRegUsage_X86AMode(u, i->Xin.EvCheck.amFailAddr);
1482         return;
1483      case Xin_ProfInc:
1484         /* does not use any registers. */
1485         return;
1486      default:
1487         ppX86Instr(i, False);
1488         vpanic("getRegUsage_X86Instr");
1489   }
1490}
1491
1492/* local helper */
1493static void mapReg( HRegRemap* m, HReg* r )
1494{
1495   *r = lookupHRegRemap(m, *r);
1496}
1497
1498void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 )
1499{
1500   vassert(mode64 == False);
1501   switch (i->tag) {
1502      case Xin_Alu32R:
1503         mapRegs_X86RMI(m, i->Xin.Alu32R.src);
1504         mapReg(m, &i->Xin.Alu32R.dst);
1505         return;
1506      case Xin_Alu32M:
1507         mapRegs_X86RI(m, i->Xin.Alu32M.src);
1508         mapRegs_X86AMode(m, i->Xin.Alu32M.dst);
1509         return;
1510      case Xin_Sh32:
1511         mapReg(m, &i->Xin.Sh32.dst);
1512         return;
1513      case Xin_Test32:
1514         mapRegs_X86RM(m, i->Xin.Test32.dst);
1515         return;
1516      case Xin_Unary32:
1517         mapReg(m, &i->Xin.Unary32.dst);
1518         return;
1519      case Xin_Lea32:
1520         mapRegs_X86AMode(m, i->Xin.Lea32.am);
1521         mapReg(m, &i->Xin.Lea32.dst);
1522         return;
1523      case Xin_MulL:
1524         mapRegs_X86RM(m, i->Xin.MulL.src);
1525         return;
1526      case Xin_Div:
1527         mapRegs_X86RM(m, i->Xin.Div.src);
1528         return;
1529      case Xin_Sh3232:
1530         mapReg(m, &i->Xin.Sh3232.src);
1531         mapReg(m, &i->Xin.Sh3232.dst);
1532         return;
1533      case Xin_Push:
1534         mapRegs_X86RMI(m, i->Xin.Push.src);
1535         return;
1536      case Xin_Call:
1537         return;
1538      case Xin_XDirect:
1539         mapRegs_X86AMode(m, i->Xin.XDirect.amEIP);
1540         return;
1541      case Xin_XIndir:
1542         mapReg(m, &i->Xin.XIndir.dstGA);
1543         mapRegs_X86AMode(m, i->Xin.XIndir.amEIP);
1544         return;
1545      case Xin_XAssisted:
1546         mapReg(m, &i->Xin.XAssisted.dstGA);
1547         mapRegs_X86AMode(m, i->Xin.XAssisted.amEIP);
1548         return;
1549      case Xin_CMov32:
1550         mapRegs_X86RM(m, i->Xin.CMov32.src);
1551         mapReg(m, &i->Xin.CMov32.dst);
1552         return;
1553      case Xin_LoadEX:
1554         mapRegs_X86AMode(m, i->Xin.LoadEX.src);
1555         mapReg(m, &i->Xin.LoadEX.dst);
1556         return;
1557      case Xin_Store:
1558         mapReg(m, &i->Xin.Store.src);
1559         mapRegs_X86AMode(m, i->Xin.Store.dst);
1560         return;
1561      case Xin_Set32:
1562         mapReg(m, &i->Xin.Set32.dst);
1563         return;
1564      case Xin_Bsfr32:
1565         mapReg(m, &i->Xin.Bsfr32.src);
1566         mapReg(m, &i->Xin.Bsfr32.dst);
1567         return;
1568      case Xin_MFence:
1569         return;
1570      case Xin_ACAS:
1571         mapRegs_X86AMode(m, i->Xin.ACAS.addr);
1572         return;
1573      case Xin_DACAS:
1574         mapRegs_X86AMode(m, i->Xin.DACAS.addr);
1575         return;
1576      case Xin_FpUnary:
1577         mapReg(m, &i->Xin.FpUnary.src);
1578         mapReg(m, &i->Xin.FpUnary.dst);
1579         return;
1580      case Xin_FpBinary:
1581         mapReg(m, &i->Xin.FpBinary.srcL);
1582         mapReg(m, &i->Xin.FpBinary.srcR);
1583         mapReg(m, &i->Xin.FpBinary.dst);
1584         return;
1585      case Xin_FpLdSt:
1586         mapRegs_X86AMode(m, i->Xin.FpLdSt.addr);
1587         mapReg(m, &i->Xin.FpLdSt.reg);
1588         return;
1589      case Xin_FpLdStI:
1590         mapRegs_X86AMode(m, i->Xin.FpLdStI.addr);
1591         mapReg(m, &i->Xin.FpLdStI.reg);
1592         return;
1593      case Xin_Fp64to32:
1594         mapReg(m, &i->Xin.Fp64to32.src);
1595         mapReg(m, &i->Xin.Fp64to32.dst);
1596         return;
1597      case Xin_FpCMov:
1598         mapReg(m, &i->Xin.FpCMov.src);
1599         mapReg(m, &i->Xin.FpCMov.dst);
1600         return;
1601      case Xin_FpLdCW:
1602         mapRegs_X86AMode(m, i->Xin.FpLdCW.addr);
1603         return;
1604      case Xin_FpStSW_AX:
1605         return;
1606      case Xin_FpCmp:
1607         mapReg(m, &i->Xin.FpCmp.srcL);
1608         mapReg(m, &i->Xin.FpCmp.srcR);
1609         mapReg(m, &i->Xin.FpCmp.dst);
1610         return;
1611      case Xin_SseConst:
1612         mapReg(m, &i->Xin.SseConst.dst);
1613         return;
1614      case Xin_SseLdSt:
1615         mapReg(m, &i->Xin.SseLdSt.reg);
1616         mapRegs_X86AMode(m, i->Xin.SseLdSt.addr);
1617         break;
1618      case Xin_SseLdzLO:
1619         mapReg(m, &i->Xin.SseLdzLO.reg);
1620         mapRegs_X86AMode(m, i->Xin.SseLdzLO.addr);
1621         break;
1622      case Xin_Sse32Fx4:
1623         mapReg(m, &i->Xin.Sse32Fx4.src);
1624         mapReg(m, &i->Xin.Sse32Fx4.dst);
1625         return;
1626      case Xin_Sse32FLo:
1627         mapReg(m, &i->Xin.Sse32FLo.src);
1628         mapReg(m, &i->Xin.Sse32FLo.dst);
1629         return;
1630      case Xin_Sse64Fx2:
1631         mapReg(m, &i->Xin.Sse64Fx2.src);
1632         mapReg(m, &i->Xin.Sse64Fx2.dst);
1633         return;
1634      case Xin_Sse64FLo:
1635         mapReg(m, &i->Xin.Sse64FLo.src);
1636         mapReg(m, &i->Xin.Sse64FLo.dst);
1637         return;
1638      case Xin_SseReRg:
1639         mapReg(m, &i->Xin.SseReRg.src);
1640         mapReg(m, &i->Xin.SseReRg.dst);
1641         return;
1642      case Xin_SseCMov:
1643         mapReg(m, &i->Xin.SseCMov.src);
1644         mapReg(m, &i->Xin.SseCMov.dst);
1645         return;
1646      case Xin_SseShuf:
1647         mapReg(m, &i->Xin.SseShuf.src);
1648         mapReg(m, &i->Xin.SseShuf.dst);
1649         return;
1650      case Xin_EvCheck:
1651         /* We expect both amodes only to mention %ebp, so this is in
1652            fact pointless, since %ebp isn't allocatable, but anyway.. */
1653         mapRegs_X86AMode(m, i->Xin.EvCheck.amCounter);
1654         mapRegs_X86AMode(m, i->Xin.EvCheck.amFailAddr);
1655         return;
1656      case Xin_ProfInc:
1657         /* does not use any registers. */
1658         return;
1659
1660      default:
1661         ppX86Instr(i, mode64);
1662         vpanic("mapRegs_X86Instr");
1663   }
1664}
1665
1666/* Figure out if i represents a reg-reg move, and if so assign the
1667   source and destination to *src and *dst.  If in doubt say No.  Used
1668   by the register allocator to do move coalescing.
1669*/
1670Bool isMove_X86Instr ( const X86Instr* i, HReg* src, HReg* dst )
1671{
1672   /* Moves between integer regs */
1673   if (i->tag == Xin_Alu32R) {
1674      if (i->Xin.Alu32R.op != Xalu_MOV)
1675         return False;
1676      if (i->Xin.Alu32R.src->tag != Xrmi_Reg)
1677         return False;
1678      *src = i->Xin.Alu32R.src->Xrmi.Reg.reg;
1679      *dst = i->Xin.Alu32R.dst;
1680      return True;
1681   }
1682   /* Moves between FP regs */
1683   if (i->tag == Xin_FpUnary) {
1684      if (i->Xin.FpUnary.op != Xfp_MOV)
1685         return False;
1686      *src = i->Xin.FpUnary.src;
1687      *dst = i->Xin.FpUnary.dst;
1688      return True;
1689   }
1690   if (i->tag == Xin_SseReRg) {
1691      if (i->Xin.SseReRg.op != Xsse_MOV)
1692         return False;
1693      *src = i->Xin.SseReRg.src;
1694      *dst = i->Xin.SseReRg.dst;
1695      return True;
1696   }
1697   return False;
1698}
1699
1700
1701/* Generate x86 spill/reload instructions under the direction of the
1702   register allocator.  Note it's critical these don't write the
1703   condition codes. */
1704
1705void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1706                    HReg rreg, Int offsetB, Bool mode64 )
1707{
1708   X86AMode* am;
1709   vassert(offsetB >= 0);
1710   vassert(!hregIsVirtual(rreg));
1711   vassert(mode64 == False);
1712   *i1 = *i2 = NULL;
1713   am = X86AMode_IR(offsetB, hregX86_EBP());
1714   switch (hregClass(rreg)) {
1715      case HRcInt32:
1716         *i1 = X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
1717         return;
1718      case HRcFlt64:
1719         *i1 = X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
1720         return;
1721      case HRcVec128:
1722         *i1 = X86Instr_SseLdSt ( False/*store*/, rreg, am );
1723         return;
1724      default:
1725         ppHRegClass(hregClass(rreg));
1726         vpanic("genSpill_X86: unimplemented regclass");
1727   }
1728}
1729
1730void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1731                     HReg rreg, Int offsetB, Bool mode64 )
1732{
1733   X86AMode* am;
1734   vassert(offsetB >= 0);
1735   vassert(!hregIsVirtual(rreg));
1736   vassert(mode64 == False);
1737   *i1 = *i2 = NULL;
1738   am = X86AMode_IR(offsetB, hregX86_EBP());
1739   switch (hregClass(rreg)) {
1740      case HRcInt32:
1741         *i1 = X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
1742         return;
1743      case HRcFlt64:
1744         *i1 = X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
1745         return;
1746      case HRcVec128:
1747         *i1 = X86Instr_SseLdSt ( True/*load*/, rreg, am );
1748         return;
1749      default:
1750         ppHRegClass(hregClass(rreg));
1751         vpanic("genReload_X86: unimplemented regclass");
1752   }
1753}
1754
1755/* The given instruction reads the specified vreg exactly once, and
1756   that vreg is currently located at the given spill offset.  If
1757   possible, return a variant of the instruction to one which instead
1758   references the spill slot directly. */
1759
1760X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
1761{
1762   vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
1763
1764   /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
1765      Convert to: src=RMI_Mem, dst=Reg
1766   */
1767   if (i->tag == Xin_Alu32R
1768       && (i->Xin.Alu32R.op == Xalu_MOV || i->Xin.Alu32R.op == Xalu_OR
1769           || i->Xin.Alu32R.op == Xalu_XOR)
1770       && i->Xin.Alu32R.src->tag == Xrmi_Reg
1771       && sameHReg(i->Xin.Alu32R.src->Xrmi.Reg.reg, vreg)) {
1772      vassert(! sameHReg(i->Xin.Alu32R.dst, vreg));
1773      return X86Instr_Alu32R(
1774                i->Xin.Alu32R.op,
1775                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP())),
1776                i->Xin.Alu32R.dst
1777             );
1778   }
1779
1780   /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
1781      Convert to: src=RI_Imm, dst=Mem
1782   */
1783   if (i->tag == Xin_Alu32R
1784       && (i->Xin.Alu32R.op == Xalu_CMP)
1785       && i->Xin.Alu32R.src->tag == Xrmi_Imm
1786       && sameHReg(i->Xin.Alu32R.dst, vreg)) {
1787      return X86Instr_Alu32M(
1788                i->Xin.Alu32R.op,
1789		X86RI_Imm( i->Xin.Alu32R.src->Xrmi.Imm.imm32 ),
1790                X86AMode_IR( spill_off, hregX86_EBP())
1791             );
1792   }
1793
1794   /* Deal with form: Push(RMI_Reg)
1795      Convert to: Push(RMI_Mem)
1796   */
1797   if (i->tag == Xin_Push
1798       && i->Xin.Push.src->tag == Xrmi_Reg
1799       && sameHReg(i->Xin.Push.src->Xrmi.Reg.reg, vreg)) {
1800      return X86Instr_Push(
1801                X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP()))
1802             );
1803   }
1804
1805   /* Deal with form: CMov32(src=RM_Reg, dst) where vreg == src
1806      Convert to CMov32(RM_Mem, dst) */
1807   if (i->tag == Xin_CMov32
1808       && i->Xin.CMov32.src->tag == Xrm_Reg
1809       && sameHReg(i->Xin.CMov32.src->Xrm.Reg.reg, vreg)) {
1810      vassert(! sameHReg(i->Xin.CMov32.dst, vreg));
1811      return X86Instr_CMov32(
1812                i->Xin.CMov32.cond,
1813                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() )),
1814                i->Xin.CMov32.dst
1815             );
1816   }
1817
1818   /* Deal with form: Test32(imm,RM_Reg vreg) -> Test32(imm,amode) */
1819   if (i->tag == Xin_Test32
1820       && i->Xin.Test32.dst->tag == Xrm_Reg
1821       && sameHReg(i->Xin.Test32.dst->Xrm.Reg.reg, vreg)) {
1822      return X86Instr_Test32(
1823                i->Xin.Test32.imm32,
1824                X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() ) )
1825             );
1826   }
1827
1828   return NULL;
1829}
1830
1831
1832/* --------- The x86 assembler (bleh.) --------- */
1833
1834inline static UInt iregEnc ( HReg r )
1835{
1836   UInt n;
1837   vassert(hregClass(r) == HRcInt32);
1838   vassert(!hregIsVirtual(r));
1839   n = hregEncoding(r);
1840   vassert(n <= 7);
1841   return n;
1842}
1843
1844inline static UInt fregEnc ( HReg r )
1845{
1846   UInt n;
1847   vassert(hregClass(r) == HRcFlt64);
1848   vassert(!hregIsVirtual(r));
1849   n = hregEncoding(r);
1850   vassert(n <= 5);
1851   return n;
1852}
1853
1854inline static UInt vregEnc ( HReg r )
1855{
1856   UInt n;
1857   vassert(hregClass(r) == HRcVec128);
1858   vassert(!hregIsVirtual(r));
1859   n = hregEncoding(r);
1860   vassert(n <= 7);
1861   return n;
1862}
1863
1864inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
1865{
1866   vassert(mod < 4);
1867   vassert((reg|regmem) < 8);
1868   return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
1869}
1870
1871inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
1872{
1873   vassert(shift < 4);
1874   vassert((regindex|regbase) < 8);
1875   return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
1876}
1877
1878static UChar* emit32 ( UChar* p, UInt w32 )
1879{
1880   *p++ = toUChar( w32        & 0x000000FF);
1881   *p++ = toUChar((w32 >>  8) & 0x000000FF);
1882   *p++ = toUChar((w32 >> 16) & 0x000000FF);
1883   *p++ = toUChar((w32 >> 24) & 0x000000FF);
1884   return p;
1885}
1886
1887/* Does a sign-extend of the lowest 8 bits give
1888   the original number? */
1889static Bool fits8bits ( UInt w32 )
1890{
1891   Int i32 = (Int)w32;
1892   return toBool(i32 == ((Int)(w32 << 24) >> 24));
1893}
1894
1895
1896/* Forming mod-reg-rm bytes and scale-index-base bytes.
1897
1898     greg,  0(ereg)    |  ereg != ESP && ereg != EBP
1899                       =  00 greg ereg
1900
1901     greg,  d8(ereg)   |  ereg != ESP
1902                       =  01 greg ereg, d8
1903
1904     greg,  d32(ereg)  |  ereg != ESP
1905                       =  10 greg ereg, d32
1906
1907     greg,  d8(%esp)   =  01 greg 100, 0x24, d8
1908
1909     -----------------------------------------------
1910
1911     greg,  d8(base,index,scale)
1912               |  index != ESP
1913               =  01 greg 100, scale index base, d8
1914
1915     greg,  d32(base,index,scale)
1916               |  index != ESP
1917               =  10 greg 100, scale index base, d32
1918*/
1919static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc, X86AMode* am )
1920{
1921   if (am->tag == Xam_IR) {
1922      if (am->Xam.IR.imm == 0
1923          && ! sameHReg(am->Xam.IR.reg, hregX86_ESP())
1924          && ! sameHReg(am->Xam.IR.reg, hregX86_EBP()) ) {
1925         *p++ = mkModRegRM(0, gregEnc, iregEnc(am->Xam.IR.reg));
1926         return p;
1927      }
1928      if (fits8bits(am->Xam.IR.imm)
1929          && ! sameHReg(am->Xam.IR.reg, hregX86_ESP())) {
1930         *p++ = mkModRegRM(1, gregEnc, iregEnc(am->Xam.IR.reg));
1931         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1932         return p;
1933      }
1934      if (! sameHReg(am->Xam.IR.reg, hregX86_ESP())) {
1935         *p++ = mkModRegRM(2, gregEnc, iregEnc(am->Xam.IR.reg));
1936         p = emit32(p, am->Xam.IR.imm);
1937         return p;
1938      }
1939      if (sameHReg(am->Xam.IR.reg, hregX86_ESP())
1940          && fits8bits(am->Xam.IR.imm)) {
1941 	 *p++ = mkModRegRM(1, gregEnc, 4);
1942         *p++ = 0x24;
1943         *p++ = toUChar(am->Xam.IR.imm & 0xFF);
1944         return p;
1945      }
1946      ppX86AMode(am);
1947      vpanic("doAMode_M: can't emit amode IR");
1948      /*NOTREACHED*/
1949   }
1950   if (am->tag == Xam_IRRS) {
1951      if (fits8bits(am->Xam.IRRS.imm)
1952          && ! sameHReg(am->Xam.IRRS.index, hregX86_ESP())) {
1953         *p++ = mkModRegRM(1, gregEnc, 4);
1954         *p++ = mkSIB(am->Xam.IRRS.shift, iregEnc(am->Xam.IRRS.index),
1955                                          iregEnc(am->Xam.IRRS.base));
1956         *p++ = toUChar(am->Xam.IRRS.imm & 0xFF);
1957         return p;
1958      }
1959      if (! sameHReg(am->Xam.IRRS.index, hregX86_ESP())) {
1960         *p++ = mkModRegRM(2, gregEnc, 4);
1961         *p++ = mkSIB(am->Xam.IRRS.shift, iregEnc(am->Xam.IRRS.index),
1962                                          iregEnc(am->Xam.IRRS.base));
1963         p = emit32(p, am->Xam.IRRS.imm);
1964         return p;
1965      }
1966      ppX86AMode(am);
1967      vpanic("doAMode_M: can't emit amode IRRS");
1968      /*NOTREACHED*/
1969   }
1970   vpanic("doAMode_M: unknown amode");
1971   /*NOTREACHED*/
1972}
1973
1974static UChar* doAMode_M ( UChar* p, HReg greg, X86AMode* am )
1975{
1976   return doAMode_M__wrk(p, iregEnc(greg), am);
1977}
1978
1979static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc, X86AMode* am )
1980{
1981   vassert(gregEnc < 8);
1982   return doAMode_M__wrk(p, gregEnc, am);
1983}
1984
1985
1986/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
1987inline static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc, UInt eregEnc )
1988{
1989   *p++ = mkModRegRM(3, gregEnc, eregEnc);
1990   return p;
1991}
1992
1993static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
1994{
1995   return doAMode_R__wrk(p, iregEnc(greg), iregEnc(ereg));
1996}
1997
1998static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc, HReg ereg )
1999{
2000   vassert(gregEnc < 8);
2001   return doAMode_R__wrk(p, gregEnc, iregEnc(ereg));
2002}
2003
2004static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc, UInt eregEnc )
2005{
2006   vassert( (gregEnc|eregEnc) < 8);
2007   return doAMode_R__wrk(p, gregEnc, eregEnc);
2008}
2009
2010
2011/* Emit ffree %st(7) */
2012static UChar* do_ffree_st7 ( UChar* p )
2013{
2014   *p++ = 0xDD;
2015   *p++ = 0xC7;
2016   return p;
2017}
2018
2019/* Emit fstp %st(i), 1 <= i <= 7 */
2020static UChar* do_fstp_st ( UChar* p, Int i )
2021{
2022   vassert(1 <= i && i <= 7);
2023   *p++ = 0xDD;
2024   *p++ = toUChar(0xD8+i);
2025   return p;
2026}
2027
2028/* Emit fld %st(i), 0 <= i <= 6 */
2029static UChar* do_fld_st ( UChar* p, Int i )
2030{
2031   vassert(0 <= i && i <= 6);
2032   *p++ = 0xD9;
2033   *p++ = toUChar(0xC0+i);
2034   return p;
2035}
2036
2037/* Emit f<op> %st(0) */
2038static UChar* do_fop1_st ( UChar* p, X86FpOp op )
2039{
2040   switch (op) {
2041      case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
2042      case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
2043      case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
2044      case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
2045      case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
2046      case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
2047      case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
2048      case Xfp_MOV:    break;
2049      case Xfp_TAN:
2050         /* fptan pushes 1.0 on the FP stack, except when the argument
2051            is out of range.  Hence we have to do the instruction,
2052            then inspect C2 to see if there is an out of range
2053            condition.  If there is, we skip the fincstp that is used
2054            by the in-range case to get rid of this extra 1.0
2055            value. */
2056         p = do_ffree_st7(p); /* since fptan sometimes pushes 1.0 */
2057         *p++ = 0xD9; *p++ = 0xF2; // fptan
2058         *p++ = 0x50;              // pushl %eax
2059         *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
2060         *p++ = 0x66; *p++ = 0xA9;
2061         *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
2062         *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
2063         *p++ = 0xD9; *p++ = 0xF7; // fincstp
2064         *p++ = 0x58;              // after_fincstp: popl %eax
2065         break;
2066      default:
2067         vpanic("do_fop1_st: unknown op");
2068   }
2069   return p;
2070}
2071
2072/* Emit f<op> %st(i), 1 <= i <= 5 */
2073static UChar* do_fop2_st ( UChar* p, X86FpOp op, Int i )
2074{
2075   Int subopc;
2076   switch (op) {
2077      case Xfp_ADD: subopc = 0; break;
2078      case Xfp_SUB: subopc = 4; break;
2079      case Xfp_MUL: subopc = 1; break;
2080      case Xfp_DIV: subopc = 6; break;
2081      default: vpanic("do_fop2_st: unknown op");
2082   }
2083   *p++ = 0xD8;
2084   p    = doAMode_R_enc_enc(p, subopc, i);
2085   return p;
2086}
2087
2088/* Push a 32-bit word on the stack.  The word depends on tags[3:0];
2089each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
2090*/
2091static UChar* push_word_from_tags ( UChar* p, UShort tags )
2092{
2093   UInt w;
2094   vassert(0 == (tags & ~0xF));
2095   if (tags == 0) {
2096      /* pushl $0x00000000 */
2097      *p++ = 0x6A;
2098      *p++ = 0x00;
2099   }
2100   else
2101   /* pushl $0xFFFFFFFF */
2102   if (tags == 0xF) {
2103      *p++ = 0x6A;
2104      *p++ = 0xFF;
2105   } else {
2106      vassert(0); /* awaiting test case */
2107      w = 0;
2108      if (tags & 1) w |= 0x000000FF;
2109      if (tags & 2) w |= 0x0000FF00;
2110      if (tags & 4) w |= 0x00FF0000;
2111      if (tags & 8) w |= 0xFF000000;
2112      *p++ = 0x68;
2113      p = emit32(p, w);
2114   }
2115   return p;
2116}
2117
2118/* Emit an instruction into buf and return the number of bytes used.
2119   Note that buf is not the insn's final place, and therefore it is
2120   imperative to emit position-independent code.  If the emitted
2121   instruction was a profiler inc, set *is_profInc to True, else
2122   leave it unchanged. */
2123
2124Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc,
2125                    UChar* buf, Int nbuf, const X86Instr* i,
2126                    Bool mode64, VexEndness endness_host,
2127                    const void* disp_cp_chain_me_to_slowEP,
2128                    const void* disp_cp_chain_me_to_fastEP,
2129                    const void* disp_cp_xindir,
2130                    const void* disp_cp_xassisted )
2131{
2132   UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2133
2134   UInt   xtra;
2135   UChar* p = &buf[0];
2136   UChar* ptmp;
2137   vassert(nbuf >= 32);
2138   vassert(mode64 == False);
2139
2140   /* vex_printf("asm  ");ppX86Instr(i, mode64); vex_printf("\n"); */
2141
2142   switch (i->tag) {
2143
2144   case Xin_Alu32R:
2145      /* Deal specially with MOV */
2146      if (i->Xin.Alu32R.op == Xalu_MOV) {
2147         switch (i->Xin.Alu32R.src->tag) {
2148            case Xrmi_Imm:
2149               *p++ = toUChar(0xB8 + iregEnc(i->Xin.Alu32R.dst));
2150               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2151               goto done;
2152            case Xrmi_Reg:
2153               *p++ = 0x89;
2154               p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2155                                i->Xin.Alu32R.dst);
2156               goto done;
2157            case Xrmi_Mem:
2158               *p++ = 0x8B;
2159               p = doAMode_M(p, i->Xin.Alu32R.dst,
2160                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2161               goto done;
2162            default:
2163               goto bad;
2164         }
2165      }
2166      /* MUL */
2167      if (i->Xin.Alu32R.op == Xalu_MUL) {
2168         switch (i->Xin.Alu32R.src->tag) {
2169            case Xrmi_Reg:
2170               *p++ = 0x0F;
2171               *p++ = 0xAF;
2172               p = doAMode_R(p, i->Xin.Alu32R.dst,
2173                                i->Xin.Alu32R.src->Xrmi.Reg.reg);
2174               goto done;
2175            case Xrmi_Mem:
2176               *p++ = 0x0F;
2177               *p++ = 0xAF;
2178               p = doAMode_M(p, i->Xin.Alu32R.dst,
2179                                i->Xin.Alu32R.src->Xrmi.Mem.am);
2180               goto done;
2181            case Xrmi_Imm:
2182               if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2183                  *p++ = 0x6B;
2184                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2185                  *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2186               } else {
2187                  *p++ = 0x69;
2188                  p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
2189                  p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2190               }
2191               goto done;
2192            default:
2193               goto bad;
2194         }
2195      }
2196      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2197      opc = opc_rr = subopc_imm = opc_imma = 0;
2198      switch (i->Xin.Alu32R.op) {
2199         case Xalu_ADC: opc = 0x13; opc_rr = 0x11;
2200                        subopc_imm = 2; opc_imma = 0x15; break;
2201         case Xalu_ADD: opc = 0x03; opc_rr = 0x01;
2202                        subopc_imm = 0; opc_imma = 0x05; break;
2203         case Xalu_SUB: opc = 0x2B; opc_rr = 0x29;
2204                        subopc_imm = 5; opc_imma = 0x2D; break;
2205         case Xalu_SBB: opc = 0x1B; opc_rr = 0x19;
2206                        subopc_imm = 3; opc_imma = 0x1D; break;
2207         case Xalu_AND: opc = 0x23; opc_rr = 0x21;
2208                        subopc_imm = 4; opc_imma = 0x25; break;
2209         case Xalu_XOR: opc = 0x33; opc_rr = 0x31;
2210                        subopc_imm = 6; opc_imma = 0x35; break;
2211         case Xalu_OR:  opc = 0x0B; opc_rr = 0x09;
2212                        subopc_imm = 1; opc_imma = 0x0D; break;
2213         case Xalu_CMP: opc = 0x3B; opc_rr = 0x39;
2214                        subopc_imm = 7; opc_imma = 0x3D; break;
2215         default: goto bad;
2216      }
2217      switch (i->Xin.Alu32R.src->tag) {
2218         case Xrmi_Imm:
2219            if (sameHReg(i->Xin.Alu32R.dst, hregX86_EAX())
2220                && !fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2221               *p++ = toUChar(opc_imma);
2222               p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2223            } else
2224            if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
2225               *p++ = 0x83;
2226               p    = doAMode_R_enc_reg(p, subopc_imm, i->Xin.Alu32R.dst);
2227               *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2228            } else {
2229               *p++ = 0x81;
2230               p    = doAMode_R_enc_reg(p, subopc_imm, i->Xin.Alu32R.dst);
2231               p    = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
2232            }
2233            goto done;
2234         case Xrmi_Reg:
2235            *p++ = toUChar(opc_rr);
2236            p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
2237                             i->Xin.Alu32R.dst);
2238            goto done;
2239         case Xrmi_Mem:
2240            *p++ = toUChar(opc);
2241            p = doAMode_M(p, i->Xin.Alu32R.dst,
2242                             i->Xin.Alu32R.src->Xrmi.Mem.am);
2243            goto done;
2244         default:
2245            goto bad;
2246      }
2247      break;
2248
2249   case Xin_Alu32M:
2250      /* Deal specially with MOV */
2251      if (i->Xin.Alu32M.op == Xalu_MOV) {
2252         switch (i->Xin.Alu32M.src->tag) {
2253            case Xri_Reg:
2254               *p++ = 0x89;
2255               p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2256                                i->Xin.Alu32M.dst);
2257               goto done;
2258            case Xri_Imm:
2259               *p++ = 0xC7;
2260               p = doAMode_M_enc(p, 0, i->Xin.Alu32M.dst);
2261               p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2262               goto done;
2263            default:
2264               goto bad;
2265         }
2266      }
2267      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
2268         allowed here. */
2269      opc = subopc_imm = opc_imma = 0;
2270      switch (i->Xin.Alu32M.op) {
2271         case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
2272         case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
2273         case Xalu_CMP: opc = 0x39; subopc_imm = 7; break;
2274         default: goto bad;
2275      }
2276      switch (i->Xin.Alu32M.src->tag) {
2277         case Xri_Reg:
2278            *p++ = toUChar(opc);
2279            p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2280                             i->Xin.Alu32M.dst);
2281            goto done;
2282         case Xri_Imm:
2283            if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
2284               *p++ = 0x83;
2285               p    = doAMode_M_enc(p, subopc_imm, i->Xin.Alu32M.dst);
2286               *p++ = toUChar(0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32);
2287               goto done;
2288            } else {
2289               *p++ = 0x81;
2290               p    = doAMode_M_enc(p, subopc_imm, i->Xin.Alu32M.dst);
2291               p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
2292               goto done;
2293            }
2294         default:
2295            goto bad;
2296      }
2297      break;
2298
2299   case Xin_Sh32:
2300      opc_cl = opc_imm = subopc = 0;
2301      switch (i->Xin.Sh32.op) {
2302         case Xsh_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2303         case Xsh_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2304         case Xsh_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2305         default: goto bad;
2306      }
2307      if (i->Xin.Sh32.src == 0) {
2308         *p++ = toUChar(opc_cl);
2309         p = doAMode_R_enc_reg(p, subopc, i->Xin.Sh32.dst);
2310      } else {
2311         *p++ = toUChar(opc_imm);
2312         p = doAMode_R_enc_reg(p, subopc, i->Xin.Sh32.dst);
2313         *p++ = (UChar)(i->Xin.Sh32.src);
2314      }
2315      goto done;
2316
2317   case Xin_Test32:
2318      if (i->Xin.Test32.dst->tag == Xrm_Reg) {
2319         /* testl $imm32, %reg */
2320         *p++ = 0xF7;
2321         p = doAMode_R_enc_reg(p, 0, i->Xin.Test32.dst->Xrm.Reg.reg);
2322         p = emit32(p, i->Xin.Test32.imm32);
2323         goto done;
2324      } else {
2325         /* testl $imm32, amode */
2326         *p++ = 0xF7;
2327         p = doAMode_M_enc(p, 0, i->Xin.Test32.dst->Xrm.Mem.am);
2328         p = emit32(p, i->Xin.Test32.imm32);
2329         goto done;
2330      }
2331
2332   case Xin_Unary32:
2333      if (i->Xin.Unary32.op == Xun_NOT) {
2334         *p++ = 0xF7;
2335         p = doAMode_R_enc_reg(p, 2, i->Xin.Unary32.dst);
2336         goto done;
2337      }
2338      if (i->Xin.Unary32.op == Xun_NEG) {
2339         *p++ = 0xF7;
2340         p = doAMode_R_enc_reg(p, 3, i->Xin.Unary32.dst);
2341         goto done;
2342      }
2343      break;
2344
2345   case Xin_Lea32:
2346      *p++ = 0x8D;
2347      p = doAMode_M(p, i->Xin.Lea32.dst, i->Xin.Lea32.am);
2348      goto done;
2349
2350   case Xin_MulL:
2351      subopc = i->Xin.MulL.syned ? 5 : 4;
2352      *p++ = 0xF7;
2353      switch (i->Xin.MulL.src->tag)  {
2354         case Xrm_Mem:
2355            p = doAMode_M_enc(p, subopc, i->Xin.MulL.src->Xrm.Mem.am);
2356            goto done;
2357         case Xrm_Reg:
2358            p = doAMode_R_enc_reg(p, subopc, i->Xin.MulL.src->Xrm.Reg.reg);
2359            goto done;
2360         default:
2361            goto bad;
2362      }
2363      break;
2364
2365   case Xin_Div:
2366      subopc = i->Xin.Div.syned ? 7 : 6;
2367      *p++ = 0xF7;
2368      switch (i->Xin.Div.src->tag)  {
2369         case Xrm_Mem:
2370            p = doAMode_M_enc(p, subopc, i->Xin.Div.src->Xrm.Mem.am);
2371            goto done;
2372         case Xrm_Reg:
2373            p = doAMode_R_enc_reg(p, subopc, i->Xin.Div.src->Xrm.Reg.reg);
2374            goto done;
2375         default:
2376            goto bad;
2377      }
2378      break;
2379
2380   case Xin_Sh3232:
2381      vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
2382      if (i->Xin.Sh3232.amt == 0) {
2383         /* shldl/shrdl by %cl */
2384         *p++ = 0x0F;
2385         if (i->Xin.Sh3232.op == Xsh_SHL) {
2386            *p++ = 0xA5;
2387         } else {
2388            *p++ = 0xAD;
2389         }
2390         p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
2391         goto done;
2392      }
2393      break;
2394
2395   case Xin_Push:
2396      switch (i->Xin.Push.src->tag) {
2397         case Xrmi_Mem:
2398            *p++ = 0xFF;
2399            p = doAMode_M_enc(p, 6, i->Xin.Push.src->Xrmi.Mem.am);
2400            goto done;
2401         case Xrmi_Imm:
2402            *p++ = 0x68;
2403            p = emit32(p, i->Xin.Push.src->Xrmi.Imm.imm32);
2404            goto done;
2405         case Xrmi_Reg:
2406            *p++ = toUChar(0x50 + iregEnc(i->Xin.Push.src->Xrmi.Reg.reg));
2407            goto done;
2408        default:
2409            goto bad;
2410      }
2411
2412   case Xin_Call:
2413      if (i->Xin.Call.cond != Xcc_ALWAYS
2414          && i->Xin.Call.rloc.pri != RLPri_None) {
2415         /* The call might not happen (it isn't unconditional) and it
2416            returns a result.  In this case we will need to generate a
2417            control flow diamond to put 0x555..555 in the return
2418            register(s) in the case where the call doesn't happen.  If
2419            this ever becomes necessary, maybe copy code from the ARM
2420            equivalent.  Until that day, just give up. */
2421         goto bad;
2422      }
2423      /* See detailed comment for Xin_Call in getRegUsage_X86Instr above
2424         for explanation of this. */
2425      switch (i->Xin.Call.regparms) {
2426         case 0: irno = iregEnc(hregX86_EAX()); break;
2427         case 1: irno = iregEnc(hregX86_EDX()); break;
2428         case 2: irno = iregEnc(hregX86_ECX()); break;
2429         case 3: irno = iregEnc(hregX86_EDI()); break;
2430         default: vpanic(" emit_X86Instr:call:regparms");
2431      }
2432      /* jump over the following two insns if the condition does not
2433         hold */
2434      if (i->Xin.Call.cond != Xcc_ALWAYS) {
2435         *p++ = toUChar(0x70 + (0xF & (i->Xin.Call.cond ^ 1)));
2436         *p++ = 0x07; /* 7 bytes in the next two insns */
2437      }
2438      /* movl $target, %tmp */
2439      *p++ = toUChar(0xB8 + irno);
2440      p = emit32(p, i->Xin.Call.target);
2441      /* call *%tmp */
2442      *p++ = 0xFF;
2443      *p++ = toUChar(0xD0 + irno);
2444      goto done;
2445
2446   case Xin_XDirect: {
2447      /* NB: what goes on here has to be very closely coordinated with the
2448         chainXDirect_X86 and unchainXDirect_X86 below. */
2449      /* We're generating chain-me requests here, so we need to be
2450         sure this is actually allowed -- no-redir translations can't
2451         use chain-me's.  Hence: */
2452      vassert(disp_cp_chain_me_to_slowEP != NULL);
2453      vassert(disp_cp_chain_me_to_fastEP != NULL);
2454
2455      /* Use ptmp for backpatching conditional jumps. */
2456      ptmp = NULL;
2457
2458      /* First off, if this is conditional, create a conditional
2459         jump over the rest of it. */
2460      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
2461         /* jmp fwds if !condition */
2462         *p++ = toUChar(0x70 + (0xF & (i->Xin.XDirect.cond ^ 1)));
2463         ptmp = p; /* fill in this bit later */
2464         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2465      }
2466
2467      /* Update the guest EIP. */
2468      /* movl $dstGA, amEIP */
2469      *p++ = 0xC7;
2470      p    = doAMode_M_enc(p, 0, i->Xin.XDirect.amEIP);
2471      p    = emit32(p, i->Xin.XDirect.dstGA);
2472
2473      /* --- FIRST PATCHABLE BYTE follows --- */
2474      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
2475         to) backs up the return address, so as to find the address of
2476         the first patchable byte.  So: don't change the length of the
2477         two instructions below. */
2478      /* movl $disp_cp_chain_me_to_{slow,fast}EP,%edx; */
2479      *p++ = 0xBA;
2480      const void* disp_cp_chain_me
2481               = i->Xin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
2482                                         : disp_cp_chain_me_to_slowEP;
2483      p = emit32(p, (UInt)(Addr)disp_cp_chain_me);
2484      /* call *%edx */
2485      *p++ = 0xFF;
2486      *p++ = 0xD2;
2487      /* --- END of PATCHABLE BYTES --- */
2488
2489      /* Fix up the conditional jump, if there was one. */
2490      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
2491         Int delta = p - ptmp;
2492         vassert(delta > 0 && delta < 40);
2493         *ptmp = toUChar(delta-1);
2494      }
2495      goto done;
2496   }
2497
2498   case Xin_XIndir: {
2499      /* We're generating transfers that could lead indirectly to a
2500         chain-me, so we need to be sure this is actually allowed --
2501         no-redir translations are not allowed to reach normal
2502         translations without going through the scheduler.  That means
2503         no XDirects or XIndirs out from no-redir translations.
2504         Hence: */
2505      vassert(disp_cp_xindir != NULL);
2506
2507      /* Use ptmp for backpatching conditional jumps. */
2508      ptmp = NULL;
2509
2510      /* First off, if this is conditional, create a conditional
2511         jump over the rest of it. */
2512      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
2513         /* jmp fwds if !condition */
2514         *p++ = toUChar(0x70 + (0xF & (i->Xin.XIndir.cond ^ 1)));
2515         ptmp = p; /* fill in this bit later */
2516         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2517      }
2518
2519      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
2520      *p++ = 0x89;
2521      p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
2522
2523      /* movl $disp_indir, %edx */
2524      *p++ = 0xBA;
2525      p = emit32(p, (UInt)(Addr)disp_cp_xindir);
2526      /* jmp *%edx */
2527      *p++ = 0xFF;
2528      *p++ = 0xE2;
2529
2530      /* Fix up the conditional jump, if there was one. */
2531      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
2532         Int delta = p - ptmp;
2533         vassert(delta > 0 && delta < 40);
2534         *ptmp = toUChar(delta-1);
2535      }
2536      goto done;
2537   }
2538
2539   case Xin_XAssisted: {
2540      /* Use ptmp for backpatching conditional jumps. */
2541      ptmp = NULL;
2542
2543      /* First off, if this is conditional, create a conditional
2544         jump over the rest of it. */
2545      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
2546         /* jmp fwds if !condition */
2547         *p++ = toUChar(0x70 + (0xF & (i->Xin.XAssisted.cond ^ 1)));
2548         ptmp = p; /* fill in this bit later */
2549         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2550      }
2551
2552      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
2553      *p++ = 0x89;
2554      p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
2555      /* movl $magic_number, %ebp. */
2556      UInt trcval = 0;
2557      switch (i->Xin.XAssisted.jk) {
2558         case Ijk_ClientReq:    trcval = VEX_TRC_JMP_CLIENTREQ;    break;
2559         case Ijk_Sys_syscall:  trcval = VEX_TRC_JMP_SYS_SYSCALL;  break;
2560         case Ijk_Sys_int128:   trcval = VEX_TRC_JMP_SYS_INT128;   break;
2561         case Ijk_Sys_int129:   trcval = VEX_TRC_JMP_SYS_INT129;   break;
2562         case Ijk_Sys_int130:   trcval = VEX_TRC_JMP_SYS_INT130;   break;
2563         case Ijk_Sys_int145:   trcval = VEX_TRC_JMP_SYS_INT145;   break;
2564         case Ijk_Sys_int210:   trcval = VEX_TRC_JMP_SYS_INT210;   break;
2565         case Ijk_Sys_sysenter: trcval = VEX_TRC_JMP_SYS_SYSENTER; break;
2566         case Ijk_Yield:        trcval = VEX_TRC_JMP_YIELD;        break;
2567         case Ijk_EmWarn:       trcval = VEX_TRC_JMP_EMWARN;       break;
2568         case Ijk_MapFail:      trcval = VEX_TRC_JMP_MAPFAIL;      break;
2569         case Ijk_NoDecode:     trcval = VEX_TRC_JMP_NODECODE;     break;
2570         case Ijk_InvalICache:  trcval = VEX_TRC_JMP_INVALICACHE;  break;
2571         case Ijk_NoRedir:      trcval = VEX_TRC_JMP_NOREDIR;      break;
2572         case Ijk_SigTRAP:      trcval = VEX_TRC_JMP_SIGTRAP;      break;
2573         case Ijk_SigSEGV:      trcval = VEX_TRC_JMP_SIGSEGV;      break;
2574         case Ijk_Boring:       trcval = VEX_TRC_JMP_BORING;       break;
2575         /* We don't expect to see the following being assisted. */
2576         case Ijk_Ret:
2577         case Ijk_Call:
2578         /* fallthrough */
2579         default:
2580            ppIRJumpKind(i->Xin.XAssisted.jk);
2581            vpanic("emit_X86Instr.Xin_XAssisted: unexpected jump kind");
2582      }
2583      vassert(trcval != 0);
2584      *p++ = 0xBD;
2585      p = emit32(p, trcval);
2586
2587      /* movl $disp_indir, %edx */
2588      *p++ = 0xBA;
2589      p = emit32(p, (UInt)(Addr)disp_cp_xassisted);
2590      /* jmp *%edx */
2591      *p++ = 0xFF;
2592      *p++ = 0xE2;
2593
2594      /* Fix up the conditional jump, if there was one. */
2595      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
2596         Int delta = p - ptmp;
2597         vassert(delta > 0 && delta < 40);
2598         *ptmp = toUChar(delta-1);
2599      }
2600      goto done;
2601   }
2602
2603   case Xin_CMov32:
2604      vassert(i->Xin.CMov32.cond != Xcc_ALWAYS);
2605
2606      /* This generates cmov, which is illegal on P54/P55. */
2607      /*
2608      *p++ = 0x0F;
2609      *p++ = toUChar(0x40 + (0xF & i->Xin.CMov32.cond));
2610      if (i->Xin.CMov32.src->tag == Xrm_Reg) {
2611         p = doAMode_R(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Reg.reg);
2612         goto done;
2613      }
2614      if (i->Xin.CMov32.src->tag == Xrm_Mem) {
2615         p = doAMode_M(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Mem.am);
2616         goto done;
2617      }
2618      */
2619
2620      /* Alternative version which works on any x86 variant. */
2621      /* jmp fwds if !condition */
2622      *p++ = toUChar(0x70 + (i->Xin.CMov32.cond ^ 1));
2623      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2624      ptmp = p;
2625
2626      switch (i->Xin.CMov32.src->tag) {
2627         case Xrm_Reg:
2628            /* Big sigh.  This is movl E -> G ... */
2629            *p++ = 0x89;
2630            p = doAMode_R(p, i->Xin.CMov32.src->Xrm.Reg.reg,
2631                             i->Xin.CMov32.dst);
2632
2633            break;
2634         case Xrm_Mem:
2635            /* ... whereas this is movl G -> E.  That's why the args
2636               to doAMode_R appear to be the wrong way round in the
2637               Xrm_Reg case. */
2638            *p++ = 0x8B;
2639            p = doAMode_M(p, i->Xin.CMov32.dst,
2640                             i->Xin.CMov32.src->Xrm.Mem.am);
2641            break;
2642         default:
2643            goto bad;
2644      }
2645      /* Fill in the jump offset. */
2646      *(ptmp-1) = toUChar(p - ptmp);
2647      goto done;
2648
2649      break;
2650
2651   case Xin_LoadEX:
2652      if (i->Xin.LoadEX.szSmall == 1 && !i->Xin.LoadEX.syned) {
2653         /* movzbl */
2654         *p++ = 0x0F;
2655         *p++ = 0xB6;
2656         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2657         goto done;
2658      }
2659      if (i->Xin.LoadEX.szSmall == 2 && !i->Xin.LoadEX.syned) {
2660         /* movzwl */
2661         *p++ = 0x0F;
2662         *p++ = 0xB7;
2663         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2664         goto done;
2665      }
2666      if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
2667         /* movsbl */
2668         *p++ = 0x0F;
2669         *p++ = 0xBE;
2670         p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
2671         goto done;
2672      }
2673      break;
2674
2675   case Xin_Set32:
2676      /* Make the destination register be 1 or 0, depending on whether
2677         the relevant condition holds.  We have to dodge and weave
2678         when the destination is %esi or %edi as we cannot directly
2679         emit the native 'setb %reg' for those.  Further complication:
2680         the top 24 bits of the destination should be forced to zero,
2681         but doing 'xor %r,%r' kills the flag(s) we are about to read.
2682         Sigh.  So start off my moving $0 into the dest. */
2683
2684      /* Do we need to swap in %eax? */
2685      if (iregEnc(i->Xin.Set32.dst) >= 4) {
2686         /* xchg %eax, %dst */
2687         *p++ = toUChar(0x90 + iregEnc(i->Xin.Set32.dst));
2688         /* movl $0, %eax */
2689         *p++ =toUChar(0xB8 + iregEnc(hregX86_EAX()));
2690         p = emit32(p, 0);
2691         /* setb lo8(%eax) */
2692         *p++ = 0x0F;
2693         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2694         p = doAMode_R_enc_reg(p, 0, hregX86_EAX());
2695         /* xchg %eax, %dst */
2696         *p++ = toUChar(0x90 + iregEnc(i->Xin.Set32.dst));
2697      } else {
2698         /* movl $0, %dst */
2699         *p++ = toUChar(0xB8 + iregEnc(i->Xin.Set32.dst));
2700         p = emit32(p, 0);
2701         /* setb lo8(%dst) */
2702         *p++ = 0x0F;
2703         *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
2704         p = doAMode_R_enc_reg(p, 0, i->Xin.Set32.dst);
2705      }
2706      goto done;
2707
2708   case Xin_Bsfr32:
2709      *p++ = 0x0F;
2710      if (i->Xin.Bsfr32.isFwds) {
2711         *p++ = 0xBC;
2712      } else {
2713         *p++ = 0xBD;
2714      }
2715      p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
2716      goto done;
2717
2718   case Xin_MFence:
2719      /* see comment in hdefs.h re this insn */
2720      if (0) vex_printf("EMIT FENCE\n");
2721      if (i->Xin.MFence.hwcaps & (VEX_HWCAPS_X86_SSE3
2722                                  |VEX_HWCAPS_X86_SSE2)) {
2723         /* mfence */
2724         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
2725         goto done;
2726      }
2727      if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_MMXEXT) {
2728         /* sfence */
2729         *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
2730         /* lock addl $0,0(%esp) */
2731         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2732         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2733         goto done;
2734      }
2735      if (i->Xin.MFence.hwcaps == 0/*baseline, no SSE*/) {
2736         /* lock addl $0,0(%esp) */
2737         *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
2738         *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
2739         goto done;
2740      }
2741      vpanic("emit_X86Instr:mfence:hwcaps");
2742      /*NOTREACHED*/
2743      break;
2744
2745   case Xin_ACAS:
2746      /* lock */
2747      *p++ = 0xF0;
2748      /* cmpxchg{b,w,l} %ebx,mem.  Expected-value in %eax, new value
2749         in %ebx.  The new-value register is hardwired to be %ebx
2750         since letting it be any integer register gives the problem
2751         that %sil and %dil are unaddressible on x86 and hence we
2752         would have to resort to the same kind of trickery as with
2753         byte-sized Xin.Store, just below.  Given that this isn't
2754         performance critical, it is simpler just to force the
2755         register operand to %ebx (could equally be %ecx or %edx).
2756         (Although %ebx is more consistent with cmpxchg8b.) */
2757      if (i->Xin.ACAS.sz == 2) *p++ = 0x66;
2758      *p++ = 0x0F;
2759      if (i->Xin.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
2760      p = doAMode_M(p, hregX86_EBX(), i->Xin.ACAS.addr);
2761      goto done;
2762
2763   case Xin_DACAS:
2764      /* lock */
2765      *p++ = 0xF0;
2766      /* cmpxchg8b m64.  Expected-value in %edx:%eax, new value
2767         in %ecx:%ebx.  All 4 regs are hardwired in the ISA, so
2768         aren't encoded in the insn. */
2769      *p++ = 0x0F;
2770      *p++ = 0xC7;
2771      p = doAMode_M_enc(p, 1, i->Xin.DACAS.addr);
2772      goto done;
2773
2774   case Xin_Store:
2775      if (i->Xin.Store.sz == 2) {
2776         /* This case, at least, is simple, given that we can
2777            reference the low 16 bits of any integer register. */
2778         *p++ = 0x66;
2779         *p++ = 0x89;
2780         p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2781         goto done;
2782      }
2783
2784      if (i->Xin.Store.sz == 1) {
2785         /* We have to do complex dodging and weaving if src is not
2786            the low 8 bits of %eax/%ebx/%ecx/%edx. */
2787         if (iregEnc(i->Xin.Store.src) < 4) {
2788            /* we're OK, can do it directly */
2789            *p++ = 0x88;
2790            p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
2791           goto done;
2792         } else {
2793            /* Bleh.  This means the source is %edi or %esi.  Since
2794               the address mode can only mention three registers, at
2795               least one of %eax/%ebx/%ecx/%edx must be available to
2796               temporarily swap the source into, so the store can
2797               happen.  So we have to look at the regs mentioned
2798               in the amode. */
2799            HReg swap = INVALID_HREG;
2800            HReg  eax = hregX86_EAX(), ebx = hregX86_EBX(),
2801                  ecx = hregX86_ECX(), edx = hregX86_EDX();
2802            HRegUsage u;
2803            initHRegUsage(&u);
2804            addRegUsage_X86AMode(&u, i->Xin.Store.dst);
2805            /**/ if (! HRegUsage__contains(&u, eax)) { swap = eax; }
2806            else if (! HRegUsage__contains(&u, ebx)) { swap = ebx; }
2807            else if (! HRegUsage__contains(&u, ecx)) { swap = ecx; }
2808            else if (! HRegUsage__contains(&u, edx)) { swap = edx; }
2809            vassert(! hregIsInvalid(swap));
2810            /* xchgl %source, %swap. Could do better if swap is %eax. */
2811            *p++ = 0x87;
2812            p = doAMode_R(p, i->Xin.Store.src, swap);
2813            /* movb lo8{%swap}, (dst) */
2814            *p++ = 0x88;
2815            p = doAMode_M(p, swap, i->Xin.Store.dst);
2816            /* xchgl %source, %swap. Could do better if swap is %eax. */
2817            *p++ = 0x87;
2818            p = doAMode_R(p, i->Xin.Store.src, swap);
2819            goto done;
2820         }
2821      } /* if (i->Xin.Store.sz == 1) */
2822      break;
2823
2824   case Xin_FpUnary:
2825      /* gop %src, %dst
2826         --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
2827      */
2828      p = do_ffree_st7(p);
2829      p = do_fld_st(p, 0+fregEnc(i->Xin.FpUnary.src));
2830      p = do_fop1_st(p, i->Xin.FpUnary.op);
2831      p = do_fstp_st(p, 1+fregEnc(i->Xin.FpUnary.dst));
2832      goto done;
2833
2834   case Xin_FpBinary:
2835      if (i->Xin.FpBinary.op == Xfp_YL2X
2836          || i->Xin.FpBinary.op == Xfp_YL2XP1) {
2837         /* Have to do this specially. */
2838         /* ffree %st7 ; fld %st(srcL) ;
2839            ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
2840         p = do_ffree_st7(p);
2841         p = do_fld_st(p, 0+fregEnc(i->Xin.FpBinary.srcL));
2842         p = do_ffree_st7(p);
2843         p = do_fld_st(p, 1+fregEnc(i->Xin.FpBinary.srcR));
2844         *p++ = 0xD9;
2845         *p++ = toUChar(i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9);
2846         p = do_fstp_st(p, 1+fregEnc(i->Xin.FpBinary.dst));
2847         goto done;
2848      }
2849      if (i->Xin.FpBinary.op == Xfp_ATAN) {
2850         /* Have to do this specially. */
2851         /* ffree %st7 ; fld %st(srcL) ;
2852            ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
2853         p = do_ffree_st7(p);
2854         p = do_fld_st(p, 0+fregEnc(i->Xin.FpBinary.srcL));
2855         p = do_ffree_st7(p);
2856         p = do_fld_st(p, 1+fregEnc(i->Xin.FpBinary.srcR));
2857         *p++ = 0xD9; *p++ = 0xF3;
2858         p = do_fstp_st(p, 1+fregEnc(i->Xin.FpBinary.dst));
2859         goto done;
2860      }
2861      if (i->Xin.FpBinary.op == Xfp_PREM
2862          || i->Xin.FpBinary.op == Xfp_PREM1
2863          || i->Xin.FpBinary.op == Xfp_SCALE) {
2864         /* Have to do this specially. */
2865         /* ffree %st7 ; fld %st(srcR) ;
2866            ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
2867            fincstp ; ffree %st7 */
2868         p = do_ffree_st7(p);
2869         p = do_fld_st(p, 0+fregEnc(i->Xin.FpBinary.srcR));
2870         p = do_ffree_st7(p);
2871         p = do_fld_st(p, 1+fregEnc(i->Xin.FpBinary.srcL));
2872         *p++ = 0xD9;
2873         switch (i->Xin.FpBinary.op) {
2874            case Xfp_PREM: *p++ = 0xF8; break;
2875            case Xfp_PREM1: *p++ = 0xF5; break;
2876            case Xfp_SCALE: *p++ =  0xFD; break;
2877            default: vpanic("emitX86Instr(FpBinary,PREM/PREM1/SCALE)");
2878         }
2879         p = do_fstp_st(p, 2+fregEnc(i->Xin.FpBinary.dst));
2880         *p++ = 0xD9; *p++ = 0xF7;
2881         p = do_ffree_st7(p);
2882         goto done;
2883      }
2884      /* General case */
2885      /* gop %srcL, %srcR, %dst
2886         --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
2887      */
2888      p = do_ffree_st7(p);
2889      p = do_fld_st(p, 0+fregEnc(i->Xin.FpBinary.srcL));
2890      p = do_fop2_st(p, i->Xin.FpBinary.op,
2891                        1+fregEnc(i->Xin.FpBinary.srcR));
2892      p = do_fstp_st(p, 1+fregEnc(i->Xin.FpBinary.dst));
2893      goto done;
2894
2895   case Xin_FpLdSt:
2896      if (i->Xin.FpLdSt.isLoad) {
2897         /* Load from memory into %fakeN.
2898            --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
2899         */
2900         p = do_ffree_st7(p);
2901         switch (i->Xin.FpLdSt.sz) {
2902            case 4:
2903               *p++ = 0xD9;
2904               p = doAMode_M_enc(p, 0/*subopcode*/, i->Xin.FpLdSt.addr);
2905               break;
2906            case 8:
2907               *p++ = 0xDD;
2908               p = doAMode_M_enc(p, 0/*subopcode*/, i->Xin.FpLdSt.addr);
2909               break;
2910            case 10:
2911               *p++ = 0xDB;
2912               p = doAMode_M_enc(p, 5/*subopcode*/, i->Xin.FpLdSt.addr);
2913               break;
2914            default:
2915               vpanic("emitX86Instr(FpLdSt,load)");
2916         }
2917         p = do_fstp_st(p, 1+fregEnc(i->Xin.FpLdSt.reg));
2918         goto done;
2919      } else {
2920         /* Store from %fakeN into memory.
2921            --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
2922	 */
2923         p = do_ffree_st7(p);
2924         p = do_fld_st(p, 0+fregEnc(i->Xin.FpLdSt.reg));
2925         switch (i->Xin.FpLdSt.sz) {
2926            case 4:
2927               *p++ = 0xD9;
2928               p = doAMode_M_enc(p, 3/*subopcode*/, i->Xin.FpLdSt.addr);
2929               break;
2930            case 8:
2931               *p++ = 0xDD;
2932               p = doAMode_M_enc(p, 3/*subopcode*/, i->Xin.FpLdSt.addr);
2933               break;
2934            case 10:
2935               *p++ = 0xDB;
2936               p = doAMode_M_enc(p, 7/*subopcode*/, i->Xin.FpLdSt.addr);
2937               break;
2938            default:
2939               vpanic("emitX86Instr(FpLdSt,store)");
2940         }
2941         goto done;
2942      }
2943      break;
2944
2945   case Xin_FpLdStI:
2946      if (i->Xin.FpLdStI.isLoad) {
2947         /* Load from memory into %fakeN, converting from an int.
2948            --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
2949         */
2950         switch (i->Xin.FpLdStI.sz) {
2951            case 8:  opc = 0xDF; subopc_imm = 5; break;
2952            case 4:  opc = 0xDB; subopc_imm = 0; break;
2953            case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
2954            default: vpanic("emitX86Instr(Xin_FpLdStI-load)");
2955         }
2956         p = do_ffree_st7(p);
2957         *p++ = toUChar(opc);
2958         p = doAMode_M_enc(p, subopc_imm/*subopcode*/, i->Xin.FpLdStI.addr);
2959         p = do_fstp_st(p, 1+fregEnc(i->Xin.FpLdStI.reg));
2960         goto done;
2961      } else {
2962         /* Store from %fakeN into memory, converting to an int.
2963            --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
2964	 */
2965         switch (i->Xin.FpLdStI.sz) {
2966            case 8:  opc = 0xDF; subopc_imm = 7; break;
2967            case 4:  opc = 0xDB; subopc_imm = 3; break;
2968            case 2:  opc = 0xDF; subopc_imm = 3; break;
2969            default: vpanic("emitX86Instr(Xin_FpLdStI-store)");
2970         }
2971         p = do_ffree_st7(p);
2972         p = do_fld_st(p, 0+fregEnc(i->Xin.FpLdStI.reg));
2973         *p++ = toUChar(opc);
2974         p = doAMode_M_enc(p, subopc_imm/*subopcode*/, i->Xin.FpLdStI.addr);
2975         goto done;
2976      }
2977      break;
2978
2979   case Xin_Fp64to32:
2980      /* ffree %st7 ; fld %st(src) */
2981      p = do_ffree_st7(p);
2982      p = do_fld_st(p, 0+fregEnc(i->Xin.Fp64to32.src));
2983      /* subl $4, %esp */
2984      *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
2985      /* fstps (%esp) */
2986      *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
2987      /* flds (%esp) */
2988      *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
2989      /* addl $4, %esp */
2990      *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
2991      /* fstp %st(1+dst) */
2992      p = do_fstp_st(p, 1+fregEnc(i->Xin.Fp64to32.dst));
2993      goto done;
2994
2995   case Xin_FpCMov:
2996      /* jmp fwds if !condition */
2997      *p++ = toUChar(0x70 + (i->Xin.FpCMov.cond ^ 1));
2998      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
2999      ptmp = p;
3000
3001      /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
3002      p = do_ffree_st7(p);
3003      p = do_fld_st(p, 0+fregEnc(i->Xin.FpCMov.src));
3004      p = do_fstp_st(p, 1+fregEnc(i->Xin.FpCMov.dst));
3005
3006      /* Fill in the jump offset. */
3007      *(ptmp-1) = toUChar(p - ptmp);
3008      goto done;
3009
3010   case Xin_FpLdCW:
3011      *p++ = 0xD9;
3012      p = doAMode_M_enc(p, 5/*subopcode*/, i->Xin.FpLdCW.addr);
3013      goto done;
3014
3015   case Xin_FpStSW_AX:
3016      /* note, this emits fnstsw %ax, not fstsw %ax */
3017      *p++ = 0xDF;
3018      *p++ = 0xE0;
3019      goto done;
3020
3021   case Xin_FpCmp:
3022      /* gcmp %fL, %fR, %dst
3023         -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
3024            fnstsw %ax ; movl %eax, %dst
3025      */
3026      /* ffree %st7 */
3027      p = do_ffree_st7(p);
3028      /* fpush %fL */
3029      p = do_fld_st(p, 0+fregEnc(i->Xin.FpCmp.srcL));
3030      /* fucomp %(fR+1) */
3031      *p++ = 0xDD;
3032      *p++ = toUChar(0xE8 + (7 & (1+fregEnc(i->Xin.FpCmp.srcR))));
3033      /* fnstsw %ax */
3034      *p++ = 0xDF;
3035      *p++ = 0xE0;
3036      /*  movl %eax, %dst */
3037      *p++ = 0x89;
3038      p = doAMode_R(p, hregX86_EAX(), i->Xin.FpCmp.dst);
3039      goto done;
3040
3041   case Xin_SseConst: {
3042      UShort con = i->Xin.SseConst.con;
3043      p = push_word_from_tags(p, toUShort((con >> 12) & 0xF));
3044      p = push_word_from_tags(p, toUShort((con >> 8) & 0xF));
3045      p = push_word_from_tags(p, toUShort((con >> 4) & 0xF));
3046      p = push_word_from_tags(p, toUShort(con & 0xF));
3047      /* movl (%esp), %xmm-dst */
3048      *p++ = 0x0F;
3049      *p++ = 0x10;
3050      *p++ = toUChar(0x04 + 8 * (7 & vregEnc(i->Xin.SseConst.dst)));
3051      *p++ = 0x24;
3052      /* addl $16, %esp */
3053      *p++ = 0x83;
3054      *p++ = 0xC4;
3055      *p++ = 0x10;
3056      goto done;
3057   }
3058
3059   case Xin_SseLdSt:
3060      *p++ = 0x0F;
3061      *p++ = toUChar(i->Xin.SseLdSt.isLoad ? 0x10 : 0x11);
3062      p = doAMode_M_enc(p, vregEnc(i->Xin.SseLdSt.reg), i->Xin.SseLdSt.addr);
3063      goto done;
3064
3065   case Xin_SseLdzLO:
3066      vassert(i->Xin.SseLdzLO.sz == 4 || i->Xin.SseLdzLO.sz == 8);
3067      /* movs[sd] amode, %xmm-dst */
3068      *p++ = toUChar(i->Xin.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3069      *p++ = 0x0F;
3070      *p++ = 0x10;
3071      p = doAMode_M_enc(p, vregEnc(i->Xin.SseLdzLO.reg), i->Xin.SseLdzLO.addr);
3072      goto done;
3073
3074   case Xin_Sse32Fx4:
3075      xtra = 0;
3076      *p++ = 0x0F;
3077      switch (i->Xin.Sse32Fx4.op) {
3078         case Xsse_ADDF:   *p++ = 0x58; break;
3079         case Xsse_DIVF:   *p++ = 0x5E; break;
3080         case Xsse_MAXF:   *p++ = 0x5F; break;
3081         case Xsse_MINF:   *p++ = 0x5D; break;
3082         case Xsse_MULF:   *p++ = 0x59; break;
3083         case Xsse_RCPF:   *p++ = 0x53; break;
3084         case Xsse_RSQRTF: *p++ = 0x52; break;
3085         case Xsse_SQRTF:  *p++ = 0x51; break;
3086         case Xsse_SUBF:   *p++ = 0x5C; break;
3087         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3088         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3089         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3090         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3091         default: goto bad;
3092      }
3093      p = doAMode_R_enc_enc(p, vregEnc(i->Xin.Sse32Fx4.dst),
3094                               vregEnc(i->Xin.Sse32Fx4.src) );
3095      if (xtra & 0x100)
3096         *p++ = toUChar(xtra & 0xFF);
3097      goto done;
3098
3099   case Xin_Sse64Fx2:
3100      xtra = 0;
3101      *p++ = 0x66;
3102      *p++ = 0x0F;
3103      switch (i->Xin.Sse64Fx2.op) {
3104         case Xsse_ADDF:   *p++ = 0x58; break;
3105         case Xsse_DIVF:   *p++ = 0x5E; break;
3106         case Xsse_MAXF:   *p++ = 0x5F; break;
3107         case Xsse_MINF:   *p++ = 0x5D; break;
3108         case Xsse_MULF:   *p++ = 0x59; break;
3109         case Xsse_RCPF:   *p++ = 0x53; break;
3110         case Xsse_RSQRTF: *p++ = 0x52; break;
3111         case Xsse_SQRTF:  *p++ = 0x51; break;
3112         case Xsse_SUBF:   *p++ = 0x5C; break;
3113         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3114         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3115         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3116         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3117         default: goto bad;
3118      }
3119      p = doAMode_R_enc_enc(p, vregEnc(i->Xin.Sse64Fx2.dst),
3120                               vregEnc(i->Xin.Sse64Fx2.src) );
3121      if (xtra & 0x100)
3122         *p++ = toUChar(xtra & 0xFF);
3123      goto done;
3124
3125   case Xin_Sse32FLo:
3126      xtra = 0;
3127      *p++ = 0xF3;
3128      *p++ = 0x0F;
3129      switch (i->Xin.Sse32FLo.op) {
3130         case Xsse_ADDF:   *p++ = 0x58; break;
3131         case Xsse_DIVF:   *p++ = 0x5E; break;
3132         case Xsse_MAXF:   *p++ = 0x5F; break;
3133         case Xsse_MINF:   *p++ = 0x5D; break;
3134         case Xsse_MULF:   *p++ = 0x59; break;
3135         case Xsse_RCPF:   *p++ = 0x53; break;
3136         case Xsse_RSQRTF: *p++ = 0x52; break;
3137         case Xsse_SQRTF:  *p++ = 0x51; break;
3138         case Xsse_SUBF:   *p++ = 0x5C; break;
3139         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3140         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3141         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3142         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3143         default: goto bad;
3144      }
3145      p = doAMode_R_enc_enc(p, vregEnc(i->Xin.Sse32FLo.dst),
3146                               vregEnc(i->Xin.Sse32FLo.src) );
3147      if (xtra & 0x100)
3148         *p++ = toUChar(xtra & 0xFF);
3149      goto done;
3150
3151   case Xin_Sse64FLo:
3152      xtra = 0;
3153      *p++ = 0xF2;
3154      *p++ = 0x0F;
3155      switch (i->Xin.Sse64FLo.op) {
3156         case Xsse_ADDF:   *p++ = 0x58; break;
3157         case Xsse_DIVF:   *p++ = 0x5E; break;
3158         case Xsse_MAXF:   *p++ = 0x5F; break;
3159         case Xsse_MINF:   *p++ = 0x5D; break;
3160         case Xsse_MULF:   *p++ = 0x59; break;
3161         case Xsse_RCPF:   *p++ = 0x53; break;
3162         case Xsse_RSQRTF: *p++ = 0x52; break;
3163         case Xsse_SQRTF:  *p++ = 0x51; break;
3164         case Xsse_SUBF:   *p++ = 0x5C; break;
3165         case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3166         case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3167         case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3168         case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3169         default: goto bad;
3170      }
3171      p = doAMode_R_enc_enc(p, vregEnc(i->Xin.Sse64FLo.dst),
3172                               vregEnc(i->Xin.Sse64FLo.src) );
3173      if (xtra & 0x100)
3174         *p++ = toUChar(xtra & 0xFF);
3175      goto done;
3176
3177   case Xin_SseReRg:
3178#     define XX(_n) *p++ = (_n)
3179      switch (i->Xin.SseReRg.op) {
3180         case Xsse_MOV:     /*movups*/ XX(0x0F); XX(0x10); break;
3181         case Xsse_OR:                 XX(0x0F); XX(0x56); break;
3182         case Xsse_XOR:                XX(0x0F); XX(0x57); break;
3183         case Xsse_AND:                XX(0x0F); XX(0x54); break;
3184         case Xsse_PACKSSD:  XX(0x66); XX(0x0F); XX(0x6B); break;
3185         case Xsse_PACKSSW:  XX(0x66); XX(0x0F); XX(0x63); break;
3186         case Xsse_PACKUSW:  XX(0x66); XX(0x0F); XX(0x67); break;
3187         case Xsse_ADD8:     XX(0x66); XX(0x0F); XX(0xFC); break;
3188         case Xsse_ADD16:    XX(0x66); XX(0x0F); XX(0xFD); break;
3189         case Xsse_ADD32:    XX(0x66); XX(0x0F); XX(0xFE); break;
3190         case Xsse_ADD64:    XX(0x66); XX(0x0F); XX(0xD4); break;
3191         case Xsse_QADD8S:   XX(0x66); XX(0x0F); XX(0xEC); break;
3192         case Xsse_QADD16S:  XX(0x66); XX(0x0F); XX(0xED); break;
3193         case Xsse_QADD8U:   XX(0x66); XX(0x0F); XX(0xDC); break;
3194         case Xsse_QADD16U:  XX(0x66); XX(0x0F); XX(0xDD); break;
3195         case Xsse_AVG8U:    XX(0x66); XX(0x0F); XX(0xE0); break;
3196         case Xsse_AVG16U:   XX(0x66); XX(0x0F); XX(0xE3); break;
3197         case Xsse_CMPEQ8:   XX(0x66); XX(0x0F); XX(0x74); break;
3198         case Xsse_CMPEQ16:  XX(0x66); XX(0x0F); XX(0x75); break;
3199         case Xsse_CMPEQ32:  XX(0x66); XX(0x0F); XX(0x76); break;
3200         case Xsse_CMPGT8S:  XX(0x66); XX(0x0F); XX(0x64); break;
3201         case Xsse_CMPGT16S: XX(0x66); XX(0x0F); XX(0x65); break;
3202         case Xsse_CMPGT32S: XX(0x66); XX(0x0F); XX(0x66); break;
3203         case Xsse_MAX16S:   XX(0x66); XX(0x0F); XX(0xEE); break;
3204         case Xsse_MAX8U:    XX(0x66); XX(0x0F); XX(0xDE); break;
3205         case Xsse_MIN16S:   XX(0x66); XX(0x0F); XX(0xEA); break;
3206         case Xsse_MIN8U:    XX(0x66); XX(0x0F); XX(0xDA); break;
3207         case Xsse_MULHI16U: XX(0x66); XX(0x0F); XX(0xE4); break;
3208         case Xsse_MULHI16S: XX(0x66); XX(0x0F); XX(0xE5); break;
3209         case Xsse_MUL16:    XX(0x66); XX(0x0F); XX(0xD5); break;
3210         case Xsse_SHL16:    XX(0x66); XX(0x0F); XX(0xF1); break;
3211         case Xsse_SHL32:    XX(0x66); XX(0x0F); XX(0xF2); break;
3212         case Xsse_SHL64:    XX(0x66); XX(0x0F); XX(0xF3); break;
3213         case Xsse_SAR16:    XX(0x66); XX(0x0F); XX(0xE1); break;
3214         case Xsse_SAR32:    XX(0x66); XX(0x0F); XX(0xE2); break;
3215         case Xsse_SHR16:    XX(0x66); XX(0x0F); XX(0xD1); break;
3216         case Xsse_SHR32:    XX(0x66); XX(0x0F); XX(0xD2); break;
3217         case Xsse_SHR64:    XX(0x66); XX(0x0F); XX(0xD3); break;
3218         case Xsse_SUB8:     XX(0x66); XX(0x0F); XX(0xF8); break;
3219         case Xsse_SUB16:    XX(0x66); XX(0x0F); XX(0xF9); break;
3220         case Xsse_SUB32:    XX(0x66); XX(0x0F); XX(0xFA); break;
3221         case Xsse_SUB64:    XX(0x66); XX(0x0F); XX(0xFB); break;
3222         case Xsse_QSUB8S:   XX(0x66); XX(0x0F); XX(0xE8); break;
3223         case Xsse_QSUB16S:  XX(0x66); XX(0x0F); XX(0xE9); break;
3224         case Xsse_QSUB8U:   XX(0x66); XX(0x0F); XX(0xD8); break;
3225         case Xsse_QSUB16U:  XX(0x66); XX(0x0F); XX(0xD9); break;
3226         case Xsse_UNPCKHB:  XX(0x66); XX(0x0F); XX(0x68); break;
3227         case Xsse_UNPCKHW:  XX(0x66); XX(0x0F); XX(0x69); break;
3228         case Xsse_UNPCKHD:  XX(0x66); XX(0x0F); XX(0x6A); break;
3229         case Xsse_UNPCKHQ:  XX(0x66); XX(0x0F); XX(0x6D); break;
3230         case Xsse_UNPCKLB:  XX(0x66); XX(0x0F); XX(0x60); break;
3231         case Xsse_UNPCKLW:  XX(0x66); XX(0x0F); XX(0x61); break;
3232         case Xsse_UNPCKLD:  XX(0x66); XX(0x0F); XX(0x62); break;
3233         case Xsse_UNPCKLQ:  XX(0x66); XX(0x0F); XX(0x6C); break;
3234         default: goto bad;
3235      }
3236      p = doAMode_R_enc_enc(p, vregEnc(i->Xin.SseReRg.dst),
3237                               vregEnc(i->Xin.SseReRg.src) );
3238#     undef XX
3239      goto done;
3240
3241   case Xin_SseCMov:
3242      /* jmp fwds if !condition */
3243      *p++ = toUChar(0x70 + (i->Xin.SseCMov.cond ^ 1));
3244      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3245      ptmp = p;
3246
3247      /* movaps %src, %dst */
3248      *p++ = 0x0F;
3249      *p++ = 0x28;
3250      p = doAMode_R_enc_enc(p, vregEnc(i->Xin.SseCMov.dst),
3251                               vregEnc(i->Xin.SseCMov.src) );
3252
3253      /* Fill in the jump offset. */
3254      *(ptmp-1) = toUChar(p - ptmp);
3255      goto done;
3256
3257   case Xin_SseShuf:
3258      *p++ = 0x66;
3259      *p++ = 0x0F;
3260      *p++ = 0x70;
3261      p = doAMode_R_enc_enc(p, vregEnc(i->Xin.SseShuf.dst),
3262                               vregEnc(i->Xin.SseShuf.src) );
3263      *p++ = (UChar)(i->Xin.SseShuf.order);
3264      goto done;
3265
3266   case Xin_EvCheck: {
3267      /* We generate:
3268            (3 bytes)  decl 4(%ebp)    4 == offsetof(host_EvC_COUNTER)
3269            (2 bytes)  jns  nofail     expected taken
3270            (3 bytes)  jmp* 0(%ebp)    0 == offsetof(host_EvC_FAILADDR)
3271            nofail:
3272      */
3273      /* This is heavily asserted re instruction lengths.  It needs to
3274         be.  If we get given unexpected forms of .amCounter or
3275         .amFailAddr -- basically, anything that's not of the form
3276         uimm7(%ebp) -- they are likely to fail. */
3277      /* Note also that after the decl we must be very careful not to
3278         read the carry flag, else we get a partial flags stall.
3279         js/jns avoids that, though. */
3280      UChar* p0 = p;
3281      /* ---  decl 8(%ebp) --- */
3282      /* "1" because + there's no register in this encoding;
3283         instead the register + field is used as a sub opcode.  The
3284         encoding for "decl r/m32" + is FF /1, hence the "1". */
3285      *p++ = 0xFF;
3286      p = doAMode_M_enc(p, 1, i->Xin.EvCheck.amCounter);
3287      vassert(p - p0 == 3);
3288      /* --- jns nofail --- */
3289      *p++ = 0x79;
3290      *p++ = 0x03; /* need to check this 0x03 after the next insn */
3291      vassert(p - p0 == 5);
3292      /* --- jmp* 0(%ebp) --- */
3293      /* The encoding is FF /4. */
3294      *p++ = 0xFF;
3295      p = doAMode_M_enc(p, 4, i->Xin.EvCheck.amFailAddr);
3296      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3297      /* And crosscheck .. */
3298      vassert(evCheckSzB_X86() == 8);
3299      goto done;
3300   }
3301
3302   case Xin_ProfInc: {
3303      /* We generate   addl $1,NotKnownYet
3304                       adcl $0,NotKnownYet+4
3305         in the expectation that a later call to LibVEX_patchProfCtr
3306         will be used to fill in the immediate fields once the right
3307         value is known.
3308           83 05  00 00 00 00  01
3309           83 15  00 00 00 00  00
3310      */
3311      *p++ = 0x83; *p++ = 0x05;
3312      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3313      *p++ = 0x01;
3314      *p++ = 0x83; *p++ = 0x15;
3315      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3316      *p++ = 0x00;
3317      /* Tell the caller .. */
3318      vassert(!(*is_profInc));
3319      *is_profInc = True;
3320      goto done;
3321   }
3322
3323   default:
3324      goto bad;
3325   }
3326
3327  bad:
3328   ppX86Instr(i, mode64);
3329   vpanic("emit_X86Instr");
3330   /*NOTREACHED*/
3331
3332  done:
3333   vassert(p - &buf[0] <= 32);
3334   return p - &buf[0];
3335}
3336
3337
3338/* How big is an event check?  See case for Xin_EvCheck in
3339   emit_X86Instr just above.  That crosschecks what this returns, so
3340   we can tell if we're inconsistent. */
3341Int evCheckSzB_X86 (void)
3342{
3343   return 8;
3344}
3345
3346
3347/* NB: what goes on here has to be very closely coordinated with the
3348   emitInstr case for XDirect, above. */
3349VexInvalRange chainXDirect_X86 ( VexEndness endness_host,
3350                                 void* place_to_chain,
3351                                 const void* disp_cp_chain_me_EXPECTED,
3352                                 const void* place_to_jump_to )
3353{
3354   vassert(endness_host == VexEndnessLE);
3355
3356   /* What we're expecting to see is:
3357        movl $disp_cp_chain_me_EXPECTED, %edx
3358        call *%edx
3359      viz
3360        BA <4 bytes value == disp_cp_chain_me_EXPECTED>
3361        FF D2
3362   */
3363   UChar* p = (UChar*)place_to_chain;
3364   vassert(p[0] == 0xBA);
3365   vassert(read_misaligned_UInt_LE(&p[1])
3366           == (UInt)(Addr)disp_cp_chain_me_EXPECTED);
3367   vassert(p[5] == 0xFF);
3368   vassert(p[6] == 0xD2);
3369   /* And what we want to change it to is:
3370          jmp disp32   where disp32 is relative to the next insn
3371          ud2;
3372        viz
3373          E9 <4 bytes == disp32>
3374          0F 0B
3375      The replacement has the same length as the original.
3376   */
3377   /* This is the delta we need to put into a JMP d32 insn.  It's
3378      relative to the start of the next insn, hence the -5.  */
3379   Long delta = (Long)((const UChar *)place_to_jump_to - p) - 5;
3380
3381   /* And make the modifications. */
3382   p[0] = 0xE9;
3383   write_misaligned_UInt_LE(&p[1], (UInt)(ULong)delta);
3384   p[5] = 0x0F; p[6] = 0x0B;
3385   /* sanity check on the delta -- top 32 are all 0 or all 1 */
3386   delta >>= 32;
3387   vassert(delta == 0LL || delta == -1LL);
3388   VexInvalRange vir = { (HWord)place_to_chain, 7 };
3389   return vir;
3390}
3391
3392
3393/* NB: what goes on here has to be very closely coordinated with the
3394   emitInstr case for XDirect, above. */
3395VexInvalRange unchainXDirect_X86 ( VexEndness endness_host,
3396                                   void* place_to_unchain,
3397                                   const void* place_to_jump_to_EXPECTED,
3398                                   const void* disp_cp_chain_me )
3399{
3400   vassert(endness_host == VexEndnessLE);
3401
3402   /* What we're expecting to see is:
3403          jmp d32
3404          ud2;
3405       viz
3406          E9 <4 bytes == disp32>
3407          0F 0B
3408   */
3409   UChar* p     = (UChar*)place_to_unchain;
3410   Bool   valid = False;
3411   if (p[0] == 0xE9
3412       && p[5] == 0x0F && p[6]  == 0x0B) {
3413      /* Check the offset is right. */
3414      Int s32 = (Int)read_misaligned_UInt_LE(&p[1]);
3415      if ((UChar*)p + 5 + s32 == place_to_jump_to_EXPECTED) {
3416         valid = True;
3417         if (0)
3418            vex_printf("QQQ unchainXDirect_X86: found valid\n");
3419      }
3420   }
3421   vassert(valid);
3422   /* And what we want to change it to is:
3423         movl $disp_cp_chain_me, %edx
3424         call *%edx
3425      viz
3426         BA <4 bytes value == disp_cp_chain_me_EXPECTED>
3427         FF D2
3428      So it's the same length (convenient, huh).
3429   */
3430   p[0] = 0xBA;
3431   write_misaligned_UInt_LE(&p[1], (UInt)(Addr)disp_cp_chain_me);
3432   p[5] = 0xFF;
3433   p[6] = 0xD2;
3434   VexInvalRange vir = { (HWord)place_to_unchain, 7 };
3435   return vir;
3436}
3437
3438
3439/* Patch the counter address into a profile inc point, as previously
3440   created by the Xin_ProfInc case for emit_X86Instr. */
3441VexInvalRange patchProfInc_X86 ( VexEndness endness_host,
3442                                 void*  place_to_patch,
3443                                 const ULong* location_of_counter )
3444{
3445   vassert(endness_host == VexEndnessLE);
3446   vassert(sizeof(ULong*) == 4);
3447   UChar* p = (UChar*)place_to_patch;
3448   vassert(p[0] == 0x83);
3449   vassert(p[1] == 0x05);
3450   vassert(p[2] == 0x00);
3451   vassert(p[3] == 0x00);
3452   vassert(p[4] == 0x00);
3453   vassert(p[5] == 0x00);
3454   vassert(p[6] == 0x01);
3455   vassert(p[7] == 0x83);
3456   vassert(p[8] == 0x15);
3457   vassert(p[9] == 0x00);
3458   vassert(p[10] == 0x00);
3459   vassert(p[11] == 0x00);
3460   vassert(p[12] == 0x00);
3461   vassert(p[13] == 0x00);
3462   UInt imm32 = (UInt)(Addr)location_of_counter;
3463   p[2] = imm32 & 0xFF; imm32 >>= 8;
3464   p[3] = imm32 & 0xFF; imm32 >>= 8;
3465   p[4] = imm32 & 0xFF; imm32 >>= 8;
3466   p[5] = imm32 & 0xFF;
3467   imm32 = 4 + (UInt)(Addr)location_of_counter;
3468   p[9]  = imm32 & 0xFF; imm32 >>= 8;
3469   p[10] = imm32 & 0xFF; imm32 >>= 8;
3470   p[11] = imm32 & 0xFF; imm32 >>= 8;
3471   p[12] = imm32 & 0xFF;
3472   VexInvalRange vir = { (HWord)place_to_patch, 14 };
3473   return vir;
3474}
3475
3476
3477/*---------------------------------------------------------------*/
3478/*--- end                                     host_x86_defs.c ---*/
3479/*---------------------------------------------------------------*/
3480