
/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emwarn.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
    Long u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
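
/* Both helpers use the usual 32x32 schoolbook decomposition: writing
   u = u1*2^32 + u0 and v = v1*2^32 + v0, the 128-bit product is
   u1*v1*2^64 + (u1*v0 + u0*v1)*2^32 + u0*v0.  The low 64 bits are
   simply u*v computed natively; the code above accumulates the cross
   terms (plus the carry out of w0) to recover the high 64 bits.  For
   example, with u = v = 2^32 we get u0 = v0 = 0 and u1 = v1 = 1, so
   *rLo = 0 and *rHi = 1, i.e. the product 2^64. */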


static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
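
/* PF reflects only the least significant byte of a result: it is set
   when that byte contains an even number of 1 bits.  The table above
   is indexed by that byte, so for instance parity_table[0x03] (two
   bits set) is AMD64G_CC_MASK_P, while parity_table[0x07] (three bits
   set) is 0. */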

/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return x << n;
   else
      return x >> (-n);
}
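
/* A negative shift count means "shift right".  This is used below to
   move the sign bit of a DATA_BITS-wide result into a fixed flag
   position: for instance lshift(res, 8 - DATA_BITS) & 0x80 extracts
   the sign bit as SF, and with DATA_BITS == 32 that amounts to
   (res >> 24) & 0x80.  Callers always mask the result, so only the
   bits they select matter. */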

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)					\
   /* const */ ULong DATA_MASK 					\
      = __data_bits==8                                          \
           ? 0xFFULL 					        \
           : (__data_bits==16                                   \
                ? 0xFFFFULL 		                        \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
   /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
   /* const */ ULong CC_NDEP = cc_ndep_formal;			\
   /* Four bogus assignments, which hopefully gcc can     */	\
   /* optimise away, and which stop it complaining about  */	\
   /* unused variables.                                   */	\
   SIGN_MASK = SIGN_MASK;					\
   DATA_MASK = DATA_MASK;					\
   CC_DEP2 = CC_DEP2;						\
   CC_NDEP = CC_NDEP;
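
/* PREAMBLE gives each ACTIONS_* body the operand width it needs:
   DATA_MASK selects the live bits of the result and SIGN_MASK is the
   top bit at that width.  For example, with __data_bits == 16,
   DATA_MASK is 0xFFFF and SIGN_MASK is 0x8000. */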


/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL + argR;					\
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
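
/* Worked example of the OF computation, for an 8-bit add of 0x7F and
   0x01: the operands have equal sign bits, so (argL ^ argR ^ -1) has
   bit 7 set, and the result 0x80 differs in sign from argL, so
   (argL ^ res) has bit 7 set too.  lshift by 12 - 8 = 4 moves that
   bit to bit 11, which is exactly AMD64G_CC_MASK_O.  CF stays clear,
   since (UChar)0x80 is not less than (UChar)0x7F. */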

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL - argR;					\
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res),	 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, oldC, res;		 		\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL + argR) + oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
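
/* For ADC (and SBB below) the thunk's DEP2 field holds the right-hand
   operand xor'd with the old carry bit, so xor'ing with oldC again
   recovers the original argR; the old carry itself arrives in NDEP.
   The carry-out test has to use <= rather than < when oldC is set:
   e.g. an 8-bit 0x05 + 0xFF + carry gives 0x05 again (res == argL),
   yet clearly carries. */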

/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, oldC, res;	       			\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL - argR) - oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
     else							\
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res), 			\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res - 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res + 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) 					\
          == ((ULong)SIGN_MASK - 1)) << 11;			\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);  					\
   { Long cf, pf, af, zf, sf, of;				\
     cf = CC_DEP2 & 1;						\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & CC_DEP1)			\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11)));			\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
     return fl;							\
   }								\
}
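
/* Note that for ROL and ROR only C and O are recomputed; the other
   flag bits are taken unchanged from the old flags supplied in NDEP,
   matching the hardware behaviour that rotates leave SF/ZF/AF/PF
   alone. */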

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_STYPE)CC_DEP1)                      \
                     * ((DATA_STYPE)CC_DEP2) );                 \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}


#if PROFILE_RFLAGS

static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   Char ch;
   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt   );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
      // case AMD64G_CC_OP_SUBL:
      //    return ((UInt)cc_dep1) < ((UInt)cc_dep2)
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_SUBW:
      //    return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_SUBB:
      //    return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_INCL:
      // case AMD64G_CC_OP_DECL:
      //    return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of, sf, zf, cf, pf;
   ULong inv = cond & 1;
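   /* The AMD64Condcode encoding pairs each condition with its negation
      in adjacent values, so bit 0 of cond selects the inverted sense:
      each case below therefore handles a condition and its N-variant
      together via 'inv'. */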

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}
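
/* After this call the thunk is in AMD64G_CC_OP_COPY form: DEP1 holds
   the literal OSZACP bits with only the carry updated, so subsequent
   flag reads simply mask DEP1 rather than recomputing anything. */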


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above run-time ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}

IRExpr* guest_amd64_spechelper ( HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- SUBQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }

      /*---------------- SUBL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           binop(Iop_Sub32,
                                 unop(Iop_64to32, cc_dep1),
                                 unop(Iop_64to32, cc_dep2)),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /*---------------- SUBW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
         /* word sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           binop(Iop_Shl64,cc_dep2,mkU8(48))));
      }

      /*---------------- SUBB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
         /* byte sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[7]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
         /* long long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
         /* long long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
         /* long long and/or/xor, then L
            LOGIC sets SF and ZF according to the
            result and makes OF be zero.  L computes SF ^ OF, but
            OF is zero, so this reduces to SF -- which will be 1 iff
            the result is < signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S,
                           cc_dep1,
                           mkU64(0)));
      }

      /*---------------- LOGICL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
         /* long and/or/xor, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
         /* long and/or/xor, then NS --> (ULong) ~ result[31] */
         return binop(Iop_Xor64,
                binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1)),
                mkU64(1));
      }

      /*---------------- LOGICB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (UInt)result[7] */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
         /* byte and/or/xor, then NS --> (UInt)!result[7] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- INCB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
         /* 8-bit inc, then LE --> sign bit of the arg */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, mkU64(1)),
                            mkU8(7)),
                      mkU64(1));
      }

      /*---------------- INCW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
         /* 16-bit inc, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- DECL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
         /* 16-bit dec, then NZ --> test dst != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- COPY ----------------*/
      /* This can happen, as a result of amd64 FP compares: "comisd ... ;
         jbe" for example. */

      if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
          (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
         /* COPY, then BE --> extract C and Z from dep1, and test (C
            or Z == 1). */
         /* COPY, then NBE --> extract C and Z from dep1, and test (C
            or Z == 0). */
         ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(
                        Iop_Or64,
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
                     ),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
         /* COPY, then B --> extract C from dep1, and test (C == 1). */
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpNE64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                     mkU64(1)
                  ),
                  mkU64(0)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY)
          && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
         UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
         /* COPY, then P --> extract P from dep1, and test (P == 1). */
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpNE64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
                     mkU64(1)
                  ),
                  mkU64(0)
               )
            );
      }

      return NULL;
   }

   /* --------- specialising "amd64g_calculate_rflags_c" --------- */

   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
      /* specialise calls to above "calculate_rflags_c" function */
      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      cc_dep2 = args[2];
      cc_ndep = args[3];

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           cc_dep1,
                           cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
                           binop(Iop_And64,cc_dep2,mkU64(0xFF))));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
         /* cflag after logic is zero */
         return mkU64(0);
      }
      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
         return cc_ndep;
      }

#     if 0
      if (cc_op->tag == Iex_Const) {
         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
      }
#     endif

      return NULL;
   }

#  undef unop
#  undef binop
#  undef mkU64
#  undef mkU32
#  undef mkU8

   return NULL;
}


/*---------------------------------------------------------------*/
/*--- Supporting functions for x87 FPU activities.            ---*/
/*---------------------------------------------------------------*/

static inline Bool host_is_little_endian ( void )
{
   UInt x = 0x76543210;
   UChar* p = (UChar*)(&x);
   return toBool(*p == 0x10);
}
1431
1432/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1433/* CALLED FROM GENERATED CODE: CLEAN HELPER */
1434ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1435{
1436   Bool   mantissaIsZero;
1437   Int    bexp;
1438   UChar  sign;
1439   UChar* f64;
1440
1441   vassert(host_is_little_endian());
1442
1443   /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1444
1445   f64  = (UChar*)(&dbl);
1446   sign = toUChar( (f64[7] >> 7) & 1 );
1447
1448   /* First off, if the tag indicates the register was empty,
1449      return 1,0,sign,1 */
1450   if (tag == 0) {
1451      /* vex_printf("Empty\n"); */
1452      return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1453                                   | AMD64G_FC_MASK_C0;
1454   }
1455
1456   bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1457   bexp &= 0x7FF;
1458
1459   mantissaIsZero
1460      = toBool(
1461           (f64[6] & 0x0F) == 0
1462           && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1463        );
1464
1465   /* If both exponent and mantissa are zero, the value is zero.
1466      Return 1,0,sign,0. */
1467   if (bexp == 0 && mantissaIsZero) {
1468      /* vex_printf("Zero\n"); */
1469      return AMD64G_FC_MASK_C3 | 0
1470                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1471   }
1472
1473   /* If exponent is zero but mantissa isn't, it's a denormal.
1474      Return 1,1,sign,0. */
1475   if (bexp == 0 && !mantissaIsZero) {
1476      /* vex_printf("Denormal\n"); */
1477      return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1478                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1479   }
1480
1481   /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1482      Return 0,1,sign,1. */
1483   if (bexp == 0x7FF && mantissaIsZero) {
1484      /* vex_printf("Inf\n"); */
1485      return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
1486                                   | AMD64G_FC_MASK_C0;
1487   }
1488
1489   /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
1490      Return 0,0,sign,1. */
1491   if (bexp == 0x7FF && !mantissaIsZero) {
1492      /* vex_printf("NaN\n"); */
1493      return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
1494   }
1495
1496   /* Uh, ok, we give up.  It must be a normal finite number.
1497      Return 0,1,sign,0.
1498   */
1499   /* vex_printf("normal\n"); */
1500   return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1501}
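
/* Summary of the C3,C2,C1,C0 encodings produced above (C1 always
   carries the sign bit):
      empty      1,0,sign,1
      zero       1,0,sign,0
      denormal   1,1,sign,0
      infinity   0,1,sign,1
      NaN        0,0,sign,1
      normal     0,1,sign,0
*/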
1502
1503
1504/* This is used to implement both 'frstor' and 'fldenv'.  The latter
1505   appears to differ from the former only in that the 8 FP registers
1506   themselves are not transferred into the guest state. */
1507static
1508VexEmWarn do_put_x87 ( Bool moveRegs,
1509                       /*IN*/UChar* x87_state,
1510                       /*OUT*/VexGuestAMD64State* vex_state )
1511{
1512   Int        stno, preg;
1513   UInt       tag;
1514   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1515   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1516   Fpu_State* x87     = (Fpu_State*)x87_state;
1517   UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
1518   UInt       tagw    = x87->env[FP_ENV_TAG];
1519   UInt       fpucw   = x87->env[FP_ENV_CTRL];
1520   UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
1521   VexEmWarn  ew;
1522   UInt       fpround;
1523   ULong      pair;
1524
1525   /* Copy registers and tags */
1526   for (stno = 0; stno < 8; stno++) {
1527      preg = (stno + ftop) & 7;
1528      tag = (tagw >> (2*preg)) & 3;
1529      if (tag == 3) {
1530         /* register is empty */
1531         /* hmm, if it's empty, does it still get written?  Probably
1532            safer to say it does.  If we don't, memcheck could get out
1533            of sync, in that it thinks all FP registers are defined by
1534            this helper, but in reality some have not been updated. */
1535         if (moveRegs)
1536            vexRegs[preg] = 0; /* IEEE754 64-bit zero */
1537         vexTags[preg] = 0;
1538      } else {
1539         /* register is non-empty */
1540         if (moveRegs)
1541            convert_f80le_to_f64le( &x87->reg[10*stno],
1542                                    (UChar*)&vexRegs[preg] );
1543         vexTags[preg] = 1;
1544      }
1545   }
1546
1547   /* stack pointer */
1548   vex_state->guest_FTOP = ftop;
1549
1550   /* status word */
1551   vex_state->guest_FC3210 = c3210;
1552
1553   /* handle the control word, setting FPROUND and detecting any
1554      emulation warnings. */
1555   pair    = amd64g_check_fldcw ( (ULong)fpucw );
1556   fpround = (UInt)pair;
1557   ew      = (VexEmWarn)(pair >> 32);
1558
1559   vex_state->guest_FPROUND = fpround & 3;
1560
1561   /* emulation warnings --> caller */
1562   return ew;
1563}
1564
1565
1566/* Create an x87 FPU state from the guest state, as close as
1567   we can approximate it. */
1568static
1569void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
1570                  /*OUT*/UChar* x87_state )
1571{
1572   Int        i, stno, preg;
1573   UInt       tagw;
1574   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1575   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1576   Fpu_State* x87     = (Fpu_State*)x87_state;
1577   UInt       ftop    = vex_state->guest_FTOP;
1578   UInt       c3210   = vex_state->guest_FC3210;
1579
1580   for (i = 0; i < 14; i++)
1581      x87->env[i] = 0;
1582
1583   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1584   x87->env[FP_ENV_STAT]
1585      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
1586   x87->env[FP_ENV_CTRL]
1587      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
1588
1589   /* Dump the register stack in ST order. */
1590   tagw = 0;
1591   for (stno = 0; stno < 8; stno++) {
1592      preg = (stno + ftop) & 7;
1593      if (vexTags[preg] == 0) {
1594         /* register is empty */
1595         tagw |= (3 << (2*preg));
1596         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1597                                 &x87->reg[10*stno] );
1598      } else {
1599         /* register is full. */
1600         tagw |= (0 << (2*preg));
1601         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1602                                 &x87->reg[10*stno] );
1603      }
1604   }
1605   x87->env[FP_ENV_TAG] = toUShort(tagw);
1606}
1607
1608
1609/* CALLED FROM GENERATED CODE */
1610/* DIRTY HELPER (reads guest state, writes guest mem) */
1611/* NOTE: only handles 32-bit format (no REX.W on the insn) */
1612void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
1613{
1614   /* Derived from values obtained from
1615      vendor_id       : AuthenticAMD
1616      cpu family      : 15
1617      model           : 12
1618      model name      : AMD Athlon(tm) 64 Processor 3200+
1619      stepping        : 0
1620      cpu MHz         : 2200.000
1621      cache size      : 512 KB
1622   */
1623   /* Somewhat roundabout, but at least it's simple. */
1624   Fpu_State tmp;
1625   UShort*   addrS = (UShort*)addr;
1626   UChar*    addrC = (UChar*)addr;
1627   U128*     xmm   = (U128*)(addr + 160);
1628   UInt      mxcsr;
1629   UShort    fp_tags;
1630   UInt      summary_tags;
1631   Int       r, stno;
1632   UShort    *srcS, *dstS;
1633
1634   do_get_x87( gst, (UChar*)&tmp );
1635   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
1636
1637   /* Now build the proper fxsave image from the x87 image we just
1638      made. */
1639
1640   addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
1641   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
1642
1643   /* set addrS[2] in an endian-independent way */
1644   summary_tags = 0;
1645   fp_tags = tmp.env[FP_ENV_TAG];
1646   for (r = 0; r < 8; r++) {
1647      if ( ((fp_tags >> (2*r)) & 3) != 3 )
1648         summary_tags |= (1 << r);
1649   }
1650   addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
1651   addrC[5]  = 0; /* pad */
1652
1653   /* FOP: faulting fpu opcode.  From experimentation, the real CPU
1654      does not write this field. (?!) */
1655   addrS[3]  = 0; /* BOGUS */
1656
1657   /* RIP (Last x87 instruction pointer).  From experimentation, the
1658      real CPU does not write this field. (?!) */
1659   addrS[4]  = 0; /* BOGUS */
1660   addrS[5]  = 0; /* BOGUS */
1661   addrS[6]  = 0; /* BOGUS */
1662   addrS[7]  = 0; /* BOGUS */
1663
1664   /* RDP (Last x87 data pointer).  From experimentation, the real CPU
1665      does not write this field. (?!) */
1666   addrS[8]  = 0; /* BOGUS */
1667   addrS[9]  = 0; /* BOGUS */
1668   addrS[10] = 0; /* BOGUS */
1669   addrS[11] = 0; /* BOGUS */
1670
1671   addrS[12] = toUShort(mxcsr);  /* MXCSR */
1672   addrS[13] = toUShort(mxcsr >> 16);
1673
1674   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
1675   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
1676
1677   /* Copy in the FP registers, in ST order. */
1678   for (stno = 0; stno < 8; stno++) {
1679      srcS = (UShort*)(&tmp.reg[10*stno]);
1680      dstS = (UShort*)(&addrS[16 + 8*stno]);
1681      dstS[0] = srcS[0];
1682      dstS[1] = srcS[1];
1683      dstS[2] = srcS[2];
1684      dstS[3] = srcS[3];
1685      dstS[4] = srcS[4];
1686      dstS[5] = 0;
1687      dstS[6] = 0;
1688      dstS[7] = 0;
1689   }
1690
1691   /* That's the first 160 bytes of the image done.  Now only %xmm0
1692      .. %xmm15 remain to be copied.  If the host is big-endian, these
1693      need to be byte-swapped. */
1694   vassert(host_is_little_endian());
1695
1696#  define COPY_U128(_dst,_src)                       \
1697      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1698           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1699      while (0)
1700
1701   COPY_U128( xmm[0],  gst->guest_XMM0 );
1702   COPY_U128( xmm[1],  gst->guest_XMM1 );
1703   COPY_U128( xmm[2],  gst->guest_XMM2 );
1704   COPY_U128( xmm[3],  gst->guest_XMM3 );
1705   COPY_U128( xmm[4],  gst->guest_XMM4 );
1706   COPY_U128( xmm[5],  gst->guest_XMM5 );
1707   COPY_U128( xmm[6],  gst->guest_XMM6 );
1708   COPY_U128( xmm[7],  gst->guest_XMM7 );
1709   COPY_U128( xmm[8],  gst->guest_XMM8 );
1710   COPY_U128( xmm[9],  gst->guest_XMM9 );
1711   COPY_U128( xmm[10], gst->guest_XMM10 );
1712   COPY_U128( xmm[11], gst->guest_XMM11 );
1713   COPY_U128( xmm[12], gst->guest_XMM12 );
1714   COPY_U128( xmm[13], gst->guest_XMM13 );
1715   COPY_U128( xmm[14], gst->guest_XMM14 );
1716   COPY_U128( xmm[15], gst->guest_XMM15 );
1717
1718#  undef COPY_U128
1719}
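
/* For reference, the layout of the image written above, derived from
   the code (byte offsets relative to 'addr'): 0..1 FCW, 2..3 FSW,
   4 FTW summary byte, 6..7 FOP, 8..15 RIP, 16..23 RDP, 24..27 MXCSR,
   28..31 MXCSR mask, 32..159 the eight x87 registers (16 bytes each,
   only the low 10 used), 160..415 %xmm0..%xmm15. */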
1720
1721
1722/* CALLED FROM GENERATED CODE */
1723/* DIRTY HELPER (writes guest state, reads guest mem) */
1724VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
1725{
1726   Fpu_State tmp;
1727   VexEmWarn warnX87 = EmWarn_NONE;
1728   VexEmWarn warnXMM = EmWarn_NONE;
1729   UShort*   addrS   = (UShort*)addr;
1730   UChar*    addrC   = (UChar*)addr;
1731   U128*     xmm     = (U128*)(addr + 160);
1732   UShort    fp_tags;
1733   Int       r, stno, i;
1734
1735   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
1736      to be byte-swapped. */
1737   vassert(host_is_little_endian());
1738
1739#  define COPY_U128(_dst,_src)                       \
1740      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1741           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1742      while (0)
1743
1744   COPY_U128( gst->guest_XMM0, xmm[0] );
1745   COPY_U128( gst->guest_XMM1, xmm[1] );
1746   COPY_U128( gst->guest_XMM2, xmm[2] );
1747   COPY_U128( gst->guest_XMM3, xmm[3] );
1748   COPY_U128( gst->guest_XMM4, xmm[4] );
1749   COPY_U128( gst->guest_XMM5, xmm[5] );
1750   COPY_U128( gst->guest_XMM6, xmm[6] );
1751   COPY_U128( gst->guest_XMM7, xmm[7] );
1752   COPY_U128( gst->guest_XMM8, xmm[8] );
1753   COPY_U128( gst->guest_XMM9, xmm[9] );
1754   COPY_U128( gst->guest_XMM10, xmm[10] );
1755   COPY_U128( gst->guest_XMM11, xmm[11] );
1756   COPY_U128( gst->guest_XMM12, xmm[12] );
1757   COPY_U128( gst->guest_XMM13, xmm[13] );
1758   COPY_U128( gst->guest_XMM14, xmm[14] );
1759   COPY_U128( gst->guest_XMM15, xmm[15] );
1760
1761#  undef COPY_U128
1762
1763   /* Copy the x87 registers out of the image, into a temporary
1764      Fpu_State struct. */
1765   for (i = 0; i < 14; i++) tmp.env[i] = 0;
1766   for (i = 0; i < 80; i++) tmp.reg[i] = 0;
1767   /* fill in tmp.reg[0..7] */
1768   for (stno = 0; stno < 8; stno++) {
1769      UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
1770      UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
1771      dstS[0] = srcS[0];
1772      dstS[1] = srcS[1];
1773      dstS[2] = srcS[2];
1774      dstS[3] = srcS[3];
1775      dstS[4] = srcS[4];
1776   }
1777   /* fill in tmp.env[0..13] */
1778   tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
1779   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
1780
1781   fp_tags = 0;
1782   for (r = 0; r < 8; r++) {
1783      if (addrC[4] & (1<<r))
1784         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
1785      else
1786         fp_tags |= (3 << (2*r)); /* EMPTY */
1787   }
1788   tmp.env[FP_ENV_TAG] = fp_tags;
1789
1790   /* Now write 'tmp' into the guest state. */
1791   warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
1792
1793   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
1794                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
1795     ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
1796
1797     warnXMM = (VexEmWarn)(w64 >> 32);
1798
1799     gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
1800   }
1801
1802   /* Prefer an X87 emwarn over an XMM one, if both exist. */
1803   if (warnX87 != EmWarn_NONE)
1804      return warnX87;
1805   else
1806      return warnXMM;
1807}
1808
1809
1810/* DIRTY HELPER (writes guest state) */
1811/* Initialise the x87 FPU state as per 'finit'. */
1812void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
1813{
1814   Int i;
1815   gst->guest_FTOP = 0;
1816   for (i = 0; i < 8; i++) {
1817      gst->guest_FPTAG[i] = 0; /* empty */
1818      gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
1819   }
1820   gst->guest_FPROUND = (ULong)Irrm_NEAREST;
1821   gst->guest_FC3210  = 0;
1822}
1823
1824
1825/* CALLED FROM GENERATED CODE */
1826/* DIRTY HELPER (reads guest memory) */
1827ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
1828{
1829   ULong f64;
1830   convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
1831   return f64;
1832}
1833
1834/* CALLED FROM GENERATED CODE */
1835/* DIRTY HELPER (writes guest memory) */
1836void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
1837{
1838   convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
1839}
1840
1841
1842/* CALLED FROM GENERATED CODE */
1843/* CLEAN HELPER */
1844/* mxcsr[15:0] contains a SSE native format MXCSR value.
1845   Extract from it the required SSEROUND value and any resulting
1846   emulation warning, and return (warn << 32) | sseround value.
1847*/
1848ULong amd64g_check_ldmxcsr ( ULong mxcsr )
1849{
1850   /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
1851   /* NOTE, encoded exactly as per enum IRRoundingMode. */
1852   ULong rmode = (mxcsr >> 13) & 3;
1853
1854   /* Detect any required emulation warnings. */
1855   VexEmWarn ew = EmWarn_NONE;
1856
1857   if ((mxcsr & 0x1F80) != 0x1F80) {
1858      /* unmasked exceptions! */
1859      ew = EmWarn_X86_sseExns;
1860   }
1861   else
1862   if (mxcsr & (1<<15)) {
1863      /* FZ is set */
1864      ew = EmWarn_X86_fz;
1865   }
1866   else
1867   if (mxcsr & (1<<6)) {
1868      /* DAZ is set */
1869      ew = EmWarn_X86_daz;
1870   }
1871
1872   return (((ULong)ew) << 32) | ((ULong)rmode);
1873}
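
/* Illustrative only -- a minimal sketch, excluded from the build, of
   how the pair returned above decodes, using the power-on MXCSR value
   0x1F80 (all exceptions masked, round-to-nearest, FZ and DAZ clear). */
#if 0
static void example_check_ldmxcsr ( void )
{
   ULong     pair  = amd64g_check_ldmxcsr( 0x1F80ULL );
   ULong     rmode = pair & 0xFFFFFFFFULL;
   VexEmWarn ew    = (VexEmWarn)(pair >> 32);
   vassert(rmode == (ULong)Irrm_NEAREST);
   vassert(ew == EmWarn_NONE);
}
#endif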
1874
1875
1876/* CALLED FROM GENERATED CODE */
1877/* CLEAN HELPER */
1878/* Given sseround as an IRRoundingMode value, create a suitable SSE
1879   native format MXCSR value. */
1880ULong amd64g_create_mxcsr ( ULong sseround )
1881{
1882   sseround &= 3;
1883   return 0x1F80 | (sseround << 13);
1884}
1885
1886
1887/* CLEAN HELPER */
1888/* fpucw[15:0] contains a x87 native format FPU control word.
1889   Extract from it the required FPROUND value and any resulting
1890   emulation warning, and return (warn << 32) | fpround value.
1891*/
1892ULong amd64g_check_fldcw ( ULong fpucw )
1893{
1894   /* Decide on a rounding mode.  fpucw[11:10] holds it. */
1895   /* NOTE, encoded exactly as per enum IRRoundingMode. */
1896   ULong rmode = (fpucw >> 10) & 3;
1897
1898   /* Detect any required emulation warnings. */
1899   VexEmWarn ew = EmWarn_NONE;
1900
1901   if ((fpucw & 0x3F) != 0x3F) {
1902      /* unmasked exceptions! */
1903      ew = EmWarn_X86_x87exns;
1904   }
1905   else
1906   if (((fpucw >> 8) & 3) != 3) {
1907      /* unsupported precision */
1908      ew = EmWarn_X86_x87precision;
1909   }
1910
1911   return (((ULong)ew) << 32) | ((ULong)rmode);
1912}
1913
1914
1915/* CLEAN HELPER */
1916/* Given fpround as an IRRoundingMode value, create a suitable x87
1917   native format FPU control word. */
1918ULong amd64g_create_fpucw ( ULong fpround )
1919{
1920   fpround &= 3;
1921   return 0x037F | (fpround << 10);
1922}
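
/* Illustrative only -- a minimal sketch, excluded from the build,
   showing that the FINIT control word 0x037F (all exceptions masked,
   extended precision, round-to-nearest) decodes without warnings and
   that re-encoding the rounding mode reproduces it. */
#if 0
static void example_fpucw_roundtrip ( void )
{
   ULong pair = amd64g_check_fldcw( 0x037FULL );
   vassert((pair & 0xFFFFFFFFULL) == (ULong)Irrm_NEAREST);
   vassert((VexEmWarn)(pair >> 32) == EmWarn_NONE);
   vassert(amd64g_create_fpucw( pair & 3 ) == 0x037FULL);
}
#endif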
1923
1924
1925/* This is used to implement 'fldenv'.
1926   Reads 28 bytes at x87_state[0 .. 27]. */
1927/* CALLED FROM GENERATED CODE */
1928/* DIRTY HELPER */
1929VexEmWarn amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
1930                                      /*IN*/HWord x87_state)
1931{
1932   Int        stno, preg;
1933   UInt       tag;
1934   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1935   Fpu_State* x87     = (Fpu_State*)x87_state;
1936   UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
1937   UInt       tagw    = x87->env[FP_ENV_TAG];
1938   UInt       fpucw   = x87->env[FP_ENV_CTRL];
1939   ULong      c3210   = x87->env[FP_ENV_STAT] & 0x4700;
1940   VexEmWarn  ew;
1941   ULong      fpround;
1942   ULong      pair;
1943
1944   /* Copy tags */
1945   for (stno = 0; stno < 8; stno++) {
1946      preg = (stno + ftop) & 7;
1947      tag = (tagw >> (2*preg)) & 3;
1948      if (tag == 3) {
1949         /* register is empty */
1950         vexTags[preg] = 0;
1951      } else {
1952         /* register is non-empty */
1953         vexTags[preg] = 1;
1954      }
1955   }
1956
1957   /* stack pointer */
1958   vex_state->guest_FTOP = ftop;
1959
1960   /* status word */
1961   vex_state->guest_FC3210 = c3210;
1962
1963   /* handle the control word, setting FPROUND and detecting any
1964      emulation warnings. */
1965   pair    = amd64g_check_fldcw ( (ULong)fpucw );
1966   fpround = pair & 0xFFFFFFFFULL;
1967   ew      = (VexEmWarn)(pair >> 32);
1968
1969   vex_state->guest_FPROUND = fpround & 3;
1970
1971   /* emulation warnings --> caller */
1972   return ew;
1973}
1974
1975
1976/* CALLED FROM GENERATED CODE */
1977/* DIRTY HELPER */
1978/* Create an x87 FPU env from the guest state, as close as we can
1979   approximate it.  Writes 28 bytes at x87_state[0..27]. */
1980void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
1981                                 /*OUT*/HWord x87_state )
1982{
1983   Int        i, stno, preg;
1984   UInt       tagw;
1985   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1986   Fpu_State* x87     = (Fpu_State*)x87_state;
1987   UInt       ftop    = vex_state->guest_FTOP;
1988   ULong      c3210   = vex_state->guest_FC3210;
1989
1990   for (i = 0; i < 14; i++)
1991      x87->env[i] = 0;
1992
1993   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1994   x87->env[FP_ENV_STAT]
1995      = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
1996   x87->env[FP_ENV_CTRL]
1997      = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
1998
1999   /* Compute the x87 tag word. */
2000   tagw = 0;
2001   for (stno = 0; stno < 8; stno++) {
2002      preg = (stno + ftop) & 7;
2003      if (vexTags[preg] == 0) {
2004         /* register is empty */
2005         tagw |= (3 << (2*preg));
2006      } else {
2007         /* register is full. */
2008         tagw |= (0 << (2*preg));
2009      }
2010   }
2011   x87->env[FP_ENV_TAG] = toUShort(tagw);
2012
2013   /* We don't dump the x87 registers, tho. */
2014}
2015
2016
2017/*---------------------------------------------------------------*/
2018/*--- Misc integer helpers, including rotates and CPUID.      ---*/
2019/*---------------------------------------------------------------*/
2020
2021/* Claim to be the following CPU, which is probably representative of
2022   the lowliest (earliest) amd64 offerings.  It can do neither sse3
2023   nor cx16.
2024
2025   vendor_id       : AuthenticAMD
2026   cpu family      : 15
2027   model           : 5
2028   model name      : AMD Opteron (tm) Processor 848
2029   stepping        : 10
2030   cpu MHz         : 1797.682
2031   cache size      : 1024 KB
2032   fpu             : yes
2033   fpu_exception   : yes
2034   cpuid level     : 1
2035   wp              : yes
2036   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2037                     mtrr pge mca cmov pat pse36 clflush mmx fxsr
2038                     sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2039   bogomips        : 3600.62
2040   TLB size        : 1088 4K pages
2041   clflush size    : 64
2042   cache_alignment : 64
2043   address sizes   : 40 bits physical, 48 bits virtual
2044   power management: ts fid vid ttp
2045*/
2046void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2047{
2048#  define SET_ABCD(_a,_b,_c,_d)                \
2049      do { st->guest_RAX = (ULong)(_a);        \
2050           st->guest_RBX = (ULong)(_b);        \
2051           st->guest_RCX = (ULong)(_c);        \
2052           st->guest_RDX = (ULong)(_d);        \
2053      } while (0)
2054
2055   switch (0xFFFFFFFF & st->guest_RAX) {
2056      case 0x00000000:
2057         SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2058         break;
2059      case 0x00000001:
2060         SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2061         break;
2062      case 0x80000000:
2063         SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2064         break;
2065      case 0x80000001:
2066         SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, 0xe1d3fbff);
2067         break;
2068      case 0x80000002:
2069         SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2070         break;
2071      case 0x80000003:
2072         SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2073         break;
2074      case 0x80000004:
2075         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2076         break;
2077      case 0x80000005:
2078         SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2079         break;
2080      case 0x80000006:
2081         SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2082         break;
2083      case 0x80000007:
2084         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2085         break;
2086      case 0x80000008:
2087         SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2088         break;
2089      default:
2090         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2091         break;
2092   }
2093#  undef SET_ABCD
2094}
2095
2096
2097/* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2098   capable.
2099
2100   vendor_id       : GenuineIntel
2101   cpu family      : 6
2102   model           : 15
2103   model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2104   stepping        : 6
2105   cpu MHz         : 2394.000
2106   cache size      : 4096 KB
2107   physical id     : 0
2108   siblings        : 2
2109   core id         : 0
2110   cpu cores       : 2
2111   fpu             : yes
2112   fpu_exception   : yes
2113   cpuid level     : 10
2114   wp              : yes
2115   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2116                     mtrr pge mca cmov pat pse36 clflush dts acpi
2117                     mmx fxsr sse sse2 ss ht tm syscall nx lm
2118                     constant_tsc pni monitor ds_cpl vmx est tm2
2119                     cx16 xtpr lahf_lm
2120   bogomips        : 4798.78
2121   clflush size    : 64
2122   cache_alignment : 64
2123   address sizes   : 36 bits physical, 48 bits virtual
2124   power management:
2125*/
2126void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2127{
2128#  define SET_ABCD(_a,_b,_c,_d)                \
2129      do { st->guest_RAX = (ULong)(_a);        \
2130           st->guest_RBX = (ULong)(_b);        \
2131           st->guest_RCX = (ULong)(_c);        \
2132           st->guest_RDX = (ULong)(_d);        \
2133      } while (0)
2134
2135   switch (0xFFFFFFFF & st->guest_RAX) {
2136      case 0x00000000:
2137         SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2138         break;
2139      case 0x00000001:
2140         SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2141         break;
2142      case 0x00000002:
2143         SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2144         break;
2145      case 0x00000003:
2146         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2147         break;
2148      case 0x00000004: {
2149         switch (0xFFFFFFFF & st->guest_RCX) {
2150            case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2151                                      0x0000003f, 0x00000001); break;
2152            case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2153                                      0x0000003f, 0x00000001); break;
2154            case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2155                                      0x00000fff, 0x00000001); break;
2156            default:         SET_ABCD(0x00000000, 0x00000000,
2157                                      0x00000000, 0x00000000); break;
2158         }
2159         break;
2160      }
2161      case 0x00000005:
2162         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2163         break;
2164      case 0x00000006:
2165         SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2166         break;
2167      case 0x00000007:
2168         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2169         break;
2170      case 0x00000008:
2171         SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2172         break;
2173      case 0x00000009:
2174         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2175         break;
2176      case 0x0000000a:
2177      unhandled_eax_value:
2178         SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2179         break;
2180      case 0x80000000:
2181         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2182         break;
2183      case 0x80000001:
2184         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2185         break;
2186      case 0x80000002:
2187         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2188         break;
2189      case 0x80000003:
2190         SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2191         break;
2192      case 0x80000004:
2193         SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2194         break;
2195      case 0x80000005:
2196         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2197         break;
2198      case 0x80000006:
2199         SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2200         break;
2201      case 0x80000007:
2202         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2203         break;
2204      case 0x80000008:
2205         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2206         break;
2207      default:
2208         goto unhandled_eax_value;
2209   }
2210#  undef SET_ABCD
2211}
2212
2213
2214/* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2215   capable.
2216
2217   vendor_id       : GenuineIntel
2218   cpu family      : 6
2219   model           : 37
2220   model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
2221   stepping        : 2
2222   cpu MHz         : 3334.000
2223   cache size      : 4096 KB
2224   physical id     : 0
2225   siblings        : 4
2226   core id         : 0
2227   cpu cores       : 2
2228   apicid          : 0
2229   initial apicid  : 0
2230   fpu             : yes
2231   fpu_exception   : yes
2232   cpuid level     : 11
2233   wp              : yes
2234   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2235                     mtrr pge mca cmov pat pse36 clflush dts acpi
2236                     mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2237                     lm constant_tsc arch_perfmon pebs bts rep_good
2238                     xtopology nonstop_tsc aperfmperf pni pclmulqdq
2239                     dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2240                     xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2241                     arat tpr_shadow vnmi flexpriority ept vpid
2242                     MINUS aes (see below)
2243   bogomips        : 6957.57
2244   clflush size    : 64
2245   cache_alignment : 64
2246   address sizes   : 36 bits physical, 48 bits virtual
2247   power management:
2248*/
2249void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2250{
2251#  define SET_ABCD(_a,_b,_c,_d)                \
2252      do { st->guest_RAX = (ULong)(_a);        \
2253           st->guest_RBX = (ULong)(_b);        \
2254           st->guest_RCX = (ULong)(_c);        \
2255           st->guest_RDX = (ULong)(_d);        \
2256      } while (0)
2257
2258   UInt old_eax = (UInt)st->guest_RAX;
2259   UInt old_ecx = (UInt)st->guest_RCX;
2260
2261   switch (old_eax) {
2262      case 0x00000000:
2263         SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2264         break;
2265      case 0x00000001:
2266         // & ~(1<<25): don't claim to support AES insns.  See
2267         // bug 249991.
2268         SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff & ~(1<<25),
2269                                          0xbfebfbff);
2270         break;
2271      case 0x00000002:
2272         SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2273         break;
2274      case 0x00000003:
2275         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2276         break;
2277      case 0x00000004:
2278         switch (old_ecx) {
2279            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2280                                      0x0000003f, 0x00000000); break;
2281            case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2282                                      0x0000007f, 0x00000000); break;
2283            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2284                                      0x000001ff, 0x00000000); break;
2285            case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2286                                      0x00000fff, 0x00000002); break;
2287            default:         SET_ABCD(0x00000000, 0x00000000,
2288                                      0x00000000, 0x00000000); break;
2289         }
2290         break;
2291      case 0x00000005:
2292         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2293         break;
2294      case 0x00000006:
2295         SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2296         break;
2297      case 0x00000007:
2298         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2299         break;
2300      case 0x00000008:
2301         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2302         break;
2303      case 0x00000009:
2304         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2305         break;
2306      case 0x0000000a:
2307         SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2308         break;
2309      case 0x0000000b:
2310         switch (old_ecx) {
2311            case 0x00000000:
2312               SET_ABCD(0x00000001, 0x00000002,
2313                        0x00000100, 0x00000000); break;
2314            case 0x00000001:
2315               SET_ABCD(0x00000004, 0x00000004,
2316                        0x00000201, 0x00000000); break;
2317            default:
2318               SET_ABCD(0x00000000, 0x00000000,
2319                        old_ecx,    0x00000000); break;
2320         }
2321         break;
2322      case 0x0000000c:
2323         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2324         break;
2325      case 0x0000000d:
2326         switch (old_ecx) {
2327            case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
2328                                      0x00000100, 0x00000000); break;
2329            case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
2330                                      0x00000201, 0x00000000); break;
2331            default:         SET_ABCD(0x00000000, 0x00000000,
2332                                      old_ecx,    0x00000000); break;
2333         }
2334         break;
2335      case 0x80000000:
2336         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2337         break;
2338      case 0x80000001:
2339         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2340         break;
2341      case 0x80000002:
2342         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2343         break;
2344      case 0x80000003:
2345         SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
2346         break;
2347      case 0x80000004:
2348         SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
2349         break;
2350      case 0x80000005:
2351         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2352         break;
2353      case 0x80000006:
2354         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2355         break;
2356      case 0x80000007:
2357         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2358         break;
2359      case 0x80000008:
2360         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2361         break;
2362      default:
2363         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2364         break;
2365   }
2366#  undef SET_ABCD
2367}
2368
2369
2370ULong amd64g_calculate_RCR ( ULong arg,
2371                             ULong rot_amt,
2372                             ULong rflags_in,
2373                             Long  szIN )
2374{
2375   Bool  wantRflags = toBool(szIN < 0);
2376   ULong sz         = wantRflags ? (-szIN) : szIN;
2377   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2378   ULong cf=0, of=0, tempcf;
2379
2380   switch (sz) {
2381      case 8:
2382         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2383         of        = ((arg >> 63) ^ cf) & 1;
2384         while (tempCOUNT > 0) {
2385            tempcf = arg & 1;
2386            arg    = (arg >> 1) | (cf << 63);
2387            cf     = tempcf;
2388            tempCOUNT--;
2389         }
2390         break;
2391      case 4:
2392         while (tempCOUNT >= 33) tempCOUNT -= 33;
2393         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2394         of        = ((arg >> 31) ^ cf) & 1;
2395         while (tempCOUNT > 0) {
2396            tempcf = arg & 1;
2397            arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
2398            cf     = tempcf;
2399            tempCOUNT--;
2400         }
2401         break;
2402      case 2:
2403         while (tempCOUNT >= 17) tempCOUNT -= 17;
2404         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2405         of        = ((arg >> 15) ^ cf) & 1;
2406         while (tempCOUNT > 0) {
2407            tempcf = arg & 1;
2408            arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
2409            cf     = tempcf;
2410            tempCOUNT--;
2411         }
2412         break;
2413      case 1:
2414         while (tempCOUNT >= 9) tempCOUNT -= 9;
2415         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2416         of        = ((arg >> 7) ^ cf) & 1;
2417         while (tempCOUNT > 0) {
2418            tempcf = arg & 1;
2419            arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
2420            cf     = tempcf;
2421            tempCOUNT--;
2422         }
2423         break;
2424      default:
2425         vpanic("calculate_RCR(amd64g): invalid size");
2426   }
2427
2428   cf &= 1;
2429   of &= 1;
2430   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2431   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2432
2433   /* caller can ask to have back either the resulting flags or
2434      resulting value, but not both */
2435   return wantRflags ? rflags_in : arg;
2436}
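
/* Illustrative only -- a minimal sketch, excluded from the build, of
   the dual-use size argument: a positive szIN asks for the rotated
   value, a negative one for the updated rflags.  Rotating 1 right by
   one bit through a clear carry gives 0, and the new carry is the old
   bit 0. */
#if 0
static void example_rcr_one_bit ( void )
{
   ULong val    = amd64g_calculate_RCR( 1ULL, 1, 0/*CF=OF=0*/,  8 );
   ULong rflags = amd64g_calculate_RCR( 1ULL, 1, 0/*CF=OF=0*/, -8 );
   vassert(val == 0);
   vassert(((rflags >> AMD64G_CC_SHIFT_C) & 1) == 1);
}
#endif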
2437
2438ULong amd64g_calculate_RCL ( ULong arg,
2439                             ULong rot_amt,
2440                             ULong rflags_in,
2441                             Long  szIN )
2442{
2443   Bool  wantRflags = toBool(szIN < 0);
2444   ULong sz         = wantRflags ? (-szIN) : szIN;
2445   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2446   ULong cf=0, of=0, tempcf;
2447
2448   switch (sz) {
2449      case 8:
2450         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2451         while (tempCOUNT > 0) {
2452            tempcf = (arg >> 63) & 1;
2453            arg    = (arg << 1) | (cf & 1);
2454            cf     = tempcf;
2455            tempCOUNT--;
2456         }
2457         of = ((arg >> 63) ^ cf) & 1;
2458         break;
2459      case 4:
2460         while (tempCOUNT >= 33) tempCOUNT -= 33;
2461         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2462         while (tempCOUNT > 0) {
2463            tempcf = (arg >> 31) & 1;
2464            arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
2465            cf     = tempcf;
2466            tempCOUNT--;
2467         }
2468         of = ((arg >> 31) ^ cf) & 1;
2469         break;
2470      case 2:
2471         while (tempCOUNT >= 17) tempCOUNT -= 17;
2472         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2473         while (tempCOUNT > 0) {
2474            tempcf = (arg >> 15) & 1;
2475            arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
2476            cf     = tempcf;
2477            tempCOUNT--;
2478         }
2479         of = ((arg >> 15) ^ cf) & 1;
2480         break;
2481      case 1:
2482         while (tempCOUNT >= 9) tempCOUNT -= 9;
2483         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2484         while (tempCOUNT > 0) {
2485            tempcf = (arg >> 7) & 1;
2486            arg    = 0xFFULL & ((arg << 1) | (cf & 1));
2487            cf     = tempcf;
2488            tempCOUNT--;
2489         }
2490         of = ((arg >> 7) ^ cf) & 1;
2491         break;
2492      default:
2493         vpanic("calculate_RCL(amd64g): invalid size");
2494   }
2495
2496   cf &= 1;
2497   of &= 1;
2498   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2499   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2500
2501   return wantRflags ? rflags_in : arg;
2502}
2503
2504/* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
2505 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
2506 */
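/* Carry-less (GF(2)[x]) 64x64->128 multiply, as used by PCLMULQDQ.
   A[i] caches the carry-less product of 'a' with each 4-bit value i,
   'b' is then consumed a nibble at a time, and the closing
   mask-and-xor loop restores the high-half contributions from the top
   seven bits of 'a', which the 64-bit table entries cannot hold.
   'which' selects the high (nonzero 'which') or low half of the
   result. */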
2507ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
2508{
2509   ULong hi, lo, tmp, A[16];
2510
2511   A[0] = 0;            A[1] = a;
2512   A[2] = A[1] << 1;    A[3] = A[2] ^ a;
2513   A[4] = A[2] << 1;    A[5] = A[4] ^ a;
2514   A[6] = A[3] << 1;    A[7] = A[6] ^ a;
2515   A[8] = A[4] << 1;    A[9] = A[8] ^ a;
2516   A[10] = A[5] << 1;   A[11] = A[10] ^ a;
2517   A[12] = A[6] << 1;   A[13] = A[12] ^ a;
2518   A[14] = A[7] << 1;   A[15] = A[14] ^ a;
2519
2520   lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
2521   hi = lo >> 56;
2522   lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
2523   hi = (hi << 8) | (lo >> 56);
2524   lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
2525   hi = (hi << 8) | (lo >> 56);
2526   lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
2527   hi = (hi << 8) | (lo >> 56);
2528   lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
2529   hi = (hi << 8) | (lo >> 56);
2530   lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
2531   hi = (hi << 8) | (lo >> 56);
2532   lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
2533   hi = (hi << 8) | (lo >> 56);
2534   lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
2535
2536   ULong m0 = -1;
2537   m0 /= 255;
2538   tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
2539   tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
2540   tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
2541   tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
2542   tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
2543   tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
2544   tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
2545
2546   return which ? hi : lo;
2547}
2548
2549
2550/* CALLED FROM GENERATED CODE */
2551/* DIRTY HELPER (non-referentially-transparent) */
2552/* Horrible hack.  On non-amd64 platforms, return 1. */
2553ULong amd64g_dirtyhelper_RDTSC ( void )
2554{
2555#  if defined(__x86_64__)
2556   UInt  eax, edx;
2557   __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
2558   return (((ULong)edx) << 32) | ((ULong)eax);
2559#  else
2560   return 1ULL;
2561#  endif
2562}
2563
2564
2565/* CALLED FROM GENERATED CODE */
2566/* DIRTY HELPER (non-referentially-transparent) */
2567/* Horrible hack.  On non-amd64 platforms, return 0. */
2568ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
2569{
2570#  if defined(__x86_64__)
2571   ULong r = 0;
2572   portno &= 0xFFFF;
2573   switch (sz) {
2574      case 4:
2575         __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
2576                              : "=a" (r) : "Nd" (portno));
2577         break;
2578      case 2:
2579         __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
2580                              : "=a" (r) : "Nd" (portno));
2581         break;
2582      case 1:
2583         __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
2584                              : "=a" (r) : "Nd" (portno));
2585         break;
2586      default:
2587         break; /* note: no 64-bit version of insn exists */
2588   }
2589   return r;
2590#  else
2591   return 0;
2592#  endif
2593}
2594
2595
2596/* CALLED FROM GENERATED CODE */
2597/* DIRTY HELPER (non-referentially-transparent) */
2598/* Horrible hack.  On non-amd64 platforms, do nothing. */
2599void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
2600{
2601#  if defined(__x86_64__)
2602   portno &= 0xFFFF;
2603   switch (sz) {
2604      case 4:
2605         __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
2606                              : : "a" (data), "Nd" (portno));
2607         break;
2608      case 2:
2609         __asm__ __volatile__("outw %w0, %w1"
2610                              : : "a" (data), "Nd" (portno));
2611         break;
2612      case 1:
2613         __asm__ __volatile__("outb %b0, %w1"
2614                              : : "a" (data), "Nd" (portno));
2615         break;
2616      default:
2617         break; /* note: no 64-bit version of insn exists */
2618   }
2619#  else
2620   /* do nothing */
2621#  endif
2622}
2623
2624/* CALLED FROM GENERATED CODE */
2625/* DIRTY HELPER (non-referentially-transparent) */
2626/* Horrible hack.  On non-amd64 platforms, do nothing. */
2627/* op = 0: call the native SGDT instruction.
2628   op = 1: call the native SIDT instruction.
2629*/
2630void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
2631#  if defined(__x86_64__)
2632   switch (op) {
2633      case 0:
2634         __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
2635         break;
2636      case 1:
2637         __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
2638         break;
2639      default:
2640         vpanic("amd64g_dirtyhelper_SxDT");
2641   }
2642#  else
2643   /* do nothing */
2644   /* Not on amd64: just zero the 10-byte descriptor area the guest
2644      expects to be written. */
2645   p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
2646   p[6] = p[7] = p[8] = p[9] = 0;
2647#  endif
2648}
2649
2650/*---------------------------------------------------------------*/
2651/*--- Helpers for MMX/SSE/SSE2.                               ---*/
2652/*---------------------------------------------------------------*/
2653
2654static inline UChar abdU8 ( UChar xx, UChar yy ) {
2655   return toUChar(xx>yy ? xx-yy : yy-xx);
2656}
2657
2658static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
2659   return (((ULong)w1) << 32) | ((ULong)w0);
2660}
2661
2662static inline UShort sel16x4_3 ( ULong w64 ) {
2663   UInt hi32 = toUInt(w64 >> 32);
2664   return toUShort(hi32 >> 16);
2665}
2666static inline UShort sel16x4_2 ( ULong w64 ) {
2667   UInt hi32 = toUInt(w64 >> 32);
2668   return toUShort(hi32);
2669}
2670static inline UShort sel16x4_1 ( ULong w64 ) {
2671   UInt lo32 = toUInt(w64);
2672   return toUShort(lo32 >> 16);
2673}
2674static inline UShort sel16x4_0 ( ULong w64 ) {
2675   UInt lo32 = toUInt(w64);
2676   return toUShort(lo32);
2677}
2678
2679static inline UChar sel8x8_7 ( ULong w64 ) {
2680   UInt hi32 = toUInt(w64 >> 32);
2681   return toUChar(hi32 >> 24);
2682}
2683static inline UChar sel8x8_6 ( ULong w64 ) {
2684   UInt hi32 = toUInt(w64 >> 32);
2685   return toUChar(hi32 >> 16);
2686}
2687static inline UChar sel8x8_5 ( ULong w64 ) {
2688   UInt hi32 = toUInt(w64 >> 32);
2689   return toUChar(hi32 >> 8);
2690}
2691static inline UChar sel8x8_4 ( ULong w64 ) {
2692   UInt hi32 = toUInt(w64 >> 32);
2693   return toUChar(hi32 >> 0);
2694}
2695static inline UChar sel8x8_3 ( ULong w64 ) {
2696   UInt lo32 = toUInt(w64);
2697   return toUChar(lo32 >> 24);
2698}
2699static inline UChar sel8x8_2 ( ULong w64 ) {
2700   UInt lo32 = toUInt(w64);
2701   return toUChar(lo32 >> 16);
2702}
2703static inline UChar sel8x8_1 ( ULong w64 ) {
2704   UInt lo32 = toUInt(w64);
2705   return toUChar(lo32 >> 8);
2706}
2707static inline UChar sel8x8_0 ( ULong w64 ) {
2708   UInt lo32 = toUInt(w64);
2709   return toUChar(lo32 >> 0);
2710}
2711
2712/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2713ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
2714{
2715   return
2716      mk32x2(
2717         (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
2718            + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
2719         (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
2720            + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
2721      );
2722}
2723
2724/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2725ULong amd64g_calculate_mmx_pmovmskb ( ULong xx )
2726{
2727   ULong r = 0;
2728   if (xx & (1ULL << (64-1))) r |= (1<<7);
2729   if (xx & (1ULL << (56-1))) r |= (1<<6);
2730   if (xx & (1ULL << (48-1))) r |= (1<<5);
2731   if (xx & (1ULL << (40-1))) r |= (1<<4);
2732   if (xx & (1ULL << (32-1))) r |= (1<<3);
2733   if (xx & (1ULL << (24-1))) r |= (1<<2);
2734   if (xx & (1ULL << (16-1))) r |= (1<<1);
2735   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
2736   return r;
2737}
2738
2739/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2740ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
2741{
2742   UInt t = 0;
2743   t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
2744   t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
2745   t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
2746   t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
2747   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
2748   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
2749   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
2750   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
2751   t &= 0xFFFF;
2752   return (ULong)t;
2753}
2754
2755/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2756ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
2757{
2758   ULong rHi8 = amd64g_calculate_mmx_pmovmskb ( w64hi );
2759   ULong rLo8 = amd64g_calculate_mmx_pmovmskb ( w64lo );
2760   return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
2761}
2762
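/* The four helpers below implement the SSE4.2 CRC32 instruction.
   0x82f63b78 is the bit-reflected form of the CRC-32C (Castagnoli)
   polynomial 0x1EDC6F41; each helper folds in 8, 16 or 32 bits one
   bit at a time, and the 64-bit case is two 32-bit steps. */
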
2763/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2764ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
2765{
2766   UInt  i;
2767   ULong crc = (b & 0xFFULL) ^ crcIn;
2768   for (i = 0; i < 8; i++)
2769      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
2770   return crc;
2771}
2772
2773/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2774ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
2775{
2776   UInt  i;
2777   ULong crc = (w & 0xFFFFULL) ^ crcIn;
2778   for (i = 0; i < 16; i++)
2779      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
2780   return crc;
2781}
2782
2783/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2784ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
2785{
2786   UInt i;
2787   ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
2788   for (i = 0; i < 32; i++)
2789      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
2790   return crc;
2791}
2792
2793/* CALLED FROM GENERATED CODE: CLEAN HELPER */
2794ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
2795{
2796   ULong crc = amd64g_calc_crc32l(crcIn, q);
2797   return amd64g_calc_crc32l(crc, q >> 32);
2798}
2799
2800
2801/*---------------------------------------------------------------*/
2802/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
2803/*---------------------------------------------------------------*/
2804
2805static UInt zmask_from_V128 ( V128* arg )
2806{
2807   UInt i, res = 0;
2808   for (i = 0; i < 16; i++) {
2809      res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
2810   }
2811   return res;
2812}
2813
2814/* Helps with PCMP{I,E}STR{I,M}.
2815
2816   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
2817   actually it could be a clean helper, but for the fact that we can't
2818   pass by value 2 x V128 to a clean helper, nor have one returned.)
2819   Reads guest state, writes to guest state for the xSTRM cases, no
2820   accesses of memory, is a pure function.
2821
2822   opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
2823   the callee knows which I/E and I/M variant it is dealing with and
2824   what the specific operation is.  4th byte of opcode is in the range
2825   0x60 to 0x63:
2826       istri  66 0F 3A 63
2827       istrm  66 0F 3A 62
2828       estri  66 0F 3A 61
2829       estrm  66 0F 3A 60
2830
2831   gstOffL and gstOffR are the guest state offsets for the two XMM
2832   register inputs.  We never have to deal with the memory case since
2833   that is handled by pre-loading the relevant value into the fake
2834   XMM16 register.
2835
2836   For ESTRx variants, edxIN and eaxIN hold the values of those two
2837   registers.
2838
2839   In all cases, the bottom 16 bits of the result contain the new
2840   OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
2841   result hold the new %ecx value.  For xSTRM variants, the helper
2842   writes the result directly to the guest XMM0.
2843
2844   Declarable side effects: in all cases, reads guest state at
2845   [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
2846   guest_XMM0.
2847
2848   Is expected to be called only with opc4_and_imm combinations which
2849   have actually been validated, and will assert otherwise.  The front
2850   end should ensure we're only called with verified values.
2851*/
2852ULong amd64g_dirtyhelper_PCMPxSTRx (
2853          VexGuestAMD64State* gst,
2854          HWord opc4_and_imm,
2855          HWord gstOffL, HWord gstOffR,
2856          HWord edxIN, HWord eaxIN
2857       )
2858{
2859   HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
2860   HWord imm8 = opc4_and_imm & 0xFF;
2861   HWord isISTRx = opc4 & 2;
2862   HWord isxSTRM = (opc4 & 1) ^ 1;
2863   vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
2864   vassert((imm8 & 1) == 0); /* we support byte-size cases only */
2865
2866   // where the args are
2867   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
2868   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
2869
2870   /* Create the arg validity masks, either from the vectors
2871      themselves or from the supplied edx/eax values. */
2872   // FIXME: this is only right for the 8-bit data cases.
2873   // At least that is asserted above.
2874   UInt zmaskL, zmaskR;
2875   if (isISTRx) {
2876      zmaskL = zmask_from_V128(argL);
2877      zmaskR = zmask_from_V128(argR);
2878   } else {
2879      Int tmp;
2880      tmp = edxIN & 0xFFFFFFFF;
2881      if (tmp < -16) tmp = -16;
2882      if (tmp > 16)  tmp = 16;
2883      if (tmp < 0)   tmp = -tmp;
2884      vassert(tmp >= 0 && tmp <= 16);
2885      zmaskL = (1 << tmp) & 0xFFFF;
2886      tmp = eaxIN & 0xFFFFFFFF;
2887      if (tmp < -16) tmp = -16;
2888      if (tmp > 16)  tmp = 16;
2889      if (tmp < 0)   tmp = -tmp;
2890      vassert(tmp >= 0 && tmp <= 16);
2891      zmaskR = (1 << tmp) & 0xFFFF;
2892   }
2893
2894   // temp spot for the resulting flags and vector.
2895   V128 resV;
2896   UInt resOSZACP;
2897
2898   // do the math
2899   Bool ok = compute_PCMPxSTRx (
2900                &resV, &resOSZACP, argL, argR,
2901                zmaskL, zmaskR, imm8, (Bool)isxSTRM
2902             );
2903
2904   // front end shouldn't pass us any imm8 variants we can't
2905   // handle.  Hence:
2906   vassert(ok);
2907
2908   // So, finally we need to get the results back to the caller.
2909   // In all cases, the new OSZACP value is the lowest 16 of
2910   // the return value.
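   // (The 0x8D5 mask used below keeps just those OSZACP bits, i.e.
   // AMD64G_CC_MASK_O, _S, _Z, _A, _P and _C OR'd together.)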
2911   if (isxSTRM) {
2912      /* gst->guest_XMM0 = resV; */ // gcc doesn't like that
2913      gst->guest_XMM0[0] = resV.w32[0];
2914      gst->guest_XMM0[1] = resV.w32[1];
2915      gst->guest_XMM0[2] = resV.w32[2];
2916      gst->guest_XMM0[3] = resV.w32[3];
2917      return resOSZACP & 0x8D5;
2918   } else {
2919      UInt newECX = resV.w32[0] & 0xFFFF;
2920      return (newECX << 16) | (resOSZACP & 0x8D5);
2921   }
2922}
2923
2924
2925/*---------------------------------------------------------------*/
2926/*--- Helpers for dealing with, and describing,               ---*/
2927/*--- guest state as a whole.                                 ---*/
2928/*---------------------------------------------------------------*/
2929
2930/* Initialise the entire amd64 guest state. */
2931/* VISIBLE TO LIBVEX CLIENT */
2932void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
2933{
2934   vex_state->guest_RAX = 0;
2935   vex_state->guest_RCX = 0;
2936   vex_state->guest_RDX = 0;
2937   vex_state->guest_RBX = 0;
2938   vex_state->guest_RSP = 0;
2939   vex_state->guest_RBP = 0;
2940   vex_state->guest_RSI = 0;
2941   vex_state->guest_RDI = 0;
2942   vex_state->guest_R8  = 0;
2943   vex_state->guest_R9  = 0;
2944   vex_state->guest_R10 = 0;
2945   vex_state->guest_R11 = 0;
2946   vex_state->guest_R12 = 0;
2947   vex_state->guest_R13 = 0;
2948   vex_state->guest_R14 = 0;
2949   vex_state->guest_R15 = 0;
2950
2951   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
2952   vex_state->guest_CC_DEP1 = 0;
2953   vex_state->guest_CC_DEP2 = 0;
2954   vex_state->guest_CC_NDEP = 0;
2955
2956   vex_state->guest_DFLAG   = 1; /* forwards */
2957   vex_state->guest_IDFLAG  = 0;
2958
2959   /* HACK: represent the offset associated with %fs==0. This
2960      assumes that %fs is only ever zero. */
2961   vex_state->guest_FS_ZERO = 0;
2962
2963   vex_state->guest_RIP = 0;
2964
2965   /* Initialise the simulated FPU */
2966   amd64g_dirtyhelper_FINIT( vex_state );
2967
2968   /* Initialise the SSE state. */
2969#  define SSEZERO(_xmm) _xmm[0]=_xmm[1]=_xmm[2]=_xmm[3] = 0;
2970
2971   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
2972   SSEZERO(vex_state->guest_XMM0);
2973   SSEZERO(vex_state->guest_XMM1);
2974   SSEZERO(vex_state->guest_XMM2);
2975   SSEZERO(vex_state->guest_XMM3);
2976   SSEZERO(vex_state->guest_XMM4);
2977   SSEZERO(vex_state->guest_XMM5);
2978   SSEZERO(vex_state->guest_XMM6);
2979   SSEZERO(vex_state->guest_XMM7);
2980   SSEZERO(vex_state->guest_XMM8);
2981   SSEZERO(vex_state->guest_XMM9);
2982   SSEZERO(vex_state->guest_XMM10);
2983   SSEZERO(vex_state->guest_XMM11);
2984   SSEZERO(vex_state->guest_XMM12);
2985   SSEZERO(vex_state->guest_XMM13);
2986   SSEZERO(vex_state->guest_XMM14);
2987   SSEZERO(vex_state->guest_XMM15);
2988   SSEZERO(vex_state->guest_XMM16);
2989
2990#  undef SSEZERO
2991
2992   vex_state->guest_EMWARN = EmWarn_NONE;
2993
2994   /* These should never be read or written, but we initialise
2995      them anyway. */
2996   vex_state->guest_TISTART = 0;
2997   vex_state->guest_TILEN   = 0;
2998
2999   vex_state->guest_NRADDR   = 0;
3000   vex_state->guest_SC_CLASS = 0;
3001   vex_state->guest_GS_0x60  = 0;
3002
3003   vex_state->guest_IP_AT_SYSCALL = 0;
3004   /* vex_state->padding = 0; */
3005}
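
/* Illustrative usage sketch (hypothetical client code, not part of this
   file): a LibVEX client would typically zero the state with the above
   function and then fill in the registers it cares about, for example

      VexGuestAMD64State gst;
      LibVEX_GuestAMD64_initialise(&gst);
      gst.guest_RSP = initial_stack_pointer;   // hypothetical values
      gst.guest_RIP = initial_entry_point;

   before asking VEX to translate code against that state. */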
3006
3007
3008/* Figure out if any part of the guest state contained in minoff
3009   .. maxoff requires precise memory exceptions.  If in doubt return
3010   True (but this generates significantly slower code).
3011
3012   By default we enforce precise exns for guest %RSP, %RBP and %RIP
3013   only.  These are the minimum needed to extract correct stack
3014   backtraces from amd64 code.
3015*/
3016Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
3017                                                   Int maxoff)
3018{
3019   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
3020   Int rbp_max = rbp_min + 8 - 1;
3021   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
3022   Int rsp_max = rsp_min + 8 - 1;
3023   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
3024   Int rip_max = rip_min + 8 - 1;
3025
3026   if (maxoff < rbp_min || minoff > rbp_max) {
3027      /* no overlap with rbp */
3028   } else {
3029      return True;
3030   }
3031
3032   if (maxoff < rsp_min || minoff > rsp_max) {
3033      /* no overlap with rsp */
3034   } else {
3035      return True;
3036   }
3037
3038   if (maxoff < rip_min || minoff > rip_max) {
3039      /* no overlap with rip */
3040   } else {
3041      return True;
3042   }
3043
3044   return False;
3045}
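
/* Illustrative examples of the above policy; neither call is made
   anywhere in this file:

      // a guest-state write that overlaps guest_RSP needs precise exns
      guest_amd64_state_requires_precise_mem_exns(
         offsetof(VexGuestAMD64State, guest_RSP),
         offsetof(VexGuestAMD64State, guest_RSP) + 7 );   // returns True

      // a write covering only guest_RAX does not
      guest_amd64_state_requires_precise_mem_exns(
         offsetof(VexGuestAMD64State, guest_RAX),
         offsetof(VexGuestAMD64State, guest_RAX) + 7 );   // returns False
*/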
3046
3047
3048#define ALWAYSDEFD(field)                             \
3049    { offsetof(VexGuestAMD64State, field),            \
3050      (sizeof ((VexGuestAMD64State*)0)->field) }
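
/* For example, ALWAYSDEFD(guest_CC_OP) expands to

      { offsetof(VexGuestAMD64State, guest_CC_OP),
        (sizeof ((VexGuestAMD64State*)0)->guest_CC_OP) }

   i.e. an (offset, size-in-bytes) pair describing that field, as used
   in the alwaysDefd table below. */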
3051
3052VexGuestLayout
3053   amd64guest_layout
3054      = {
3055          /* Total size of the guest state, in bytes. */
3056          .total_sizeB = sizeof(VexGuestAMD64State),
3057
3058          /* Describe the stack pointer. */
3059          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
3060          .sizeof_SP = 8,
3061
3062          /* Describe the frame pointer. */
3063          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
3064          .sizeof_FP = 8,
3065
3066          /* Describe the instruction pointer. */
3067          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
3068          .sizeof_IP = 8,
3069
3070          /* Describe any sections to be regarded by Memcheck as
3071             'always-defined'. */
3072          .n_alwaysDefd = 16,
3073
3074          /* flags thunk: OP and NDEP are always defd, whereas DEP1
3075             and DEP2 have to be tracked.  See detailed comment in
3076             gdefs.h on meaning of thunk fields. */
3077          .alwaysDefd
3078             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
3079                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
3080                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
3081                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
3082                 /*  4 */ ALWAYSDEFD(guest_RIP),
3083                 /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
3084                 /*  6 */ ALWAYSDEFD(guest_FTOP),
3085                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
3086                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
3087                 /*  9 */ ALWAYSDEFD(guest_FC3210),
3088                 // /* */ ALWAYSDEFD(guest_CS),
3089                 // /* */ ALWAYSDEFD(guest_DS),
3090                 // /* */ ALWAYSDEFD(guest_ES),
3091                 // /* */ ALWAYSDEFD(guest_FS),
3092                 // /* */ ALWAYSDEFD(guest_GS),
3093                 // /* */ ALWAYSDEFD(guest_SS),
3094                 // /* */ ALWAYSDEFD(guest_LDT),
3095                 // /* */ ALWAYSDEFD(guest_GDT),
3096                 /* 10 */ ALWAYSDEFD(guest_EMWARN),
3097                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
3098                 /* 12 */ ALWAYSDEFD(guest_TISTART),
3099                 /* 13 */ ALWAYSDEFD(guest_TILEN),
3100                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
3101                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
3102               }
3103        };
3104
3105
3106/*---------------------------------------------------------------*/
3107/*--- end                               guest_amd64_helpers.c ---*/
3108/*---------------------------------------------------------------*/
3109