1
2/*---------------------------------------------------------------*/
3/*--- begin                             guest_amd64_helpers.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2017 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36#include "libvex_basictypes.h"
37#include "libvex_emnote.h"
38#include "libvex_guest_amd64.h"
39#include "libvex_ir.h"
40#include "libvex.h"
41
42#include "main_util.h"
43#include "main_globals.h"
44#include "guest_generic_bb_to_IR.h"
45#include "guest_amd64_defs.h"
46#include "guest_generic_x87.h"
47
48
49/* This file contains helper functions for amd64 guest code.
50   Calls to these functions are generated by the back end.
51   These calls are of course in the host machine code and
52   this file will be compiled to host machine code, so that
53   all makes sense.
54
55   Only change the signatures of these helper functions very
56   carefully.  If you change the signature here, you'll have to change
57   the parameters passed to it in the IR calls constructed by
58   guest-amd64/toIR.c.
59
60   The convention used is that all functions called from generated
61   code are named amd64g_<something>, and any function whose name lacks
62   that prefix is not called from generated code.  Note that some
63   LibVEX_* functions can however be called by VEX's client, but that
64   is not the same as calling them from VEX-generated code.
65*/
66
67
68/* Set to 1 to get detailed profiling info about use of the flag
69   machinery. */
70#define PROFILE_RFLAGS 0
71
72
73/*---------------------------------------------------------------*/
74/*--- %rflags run-time helpers.                               ---*/
75/*---------------------------------------------------------------*/
76
77/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
78   after imulq/mulq. */
79
80static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
81{
82   const Long halfMask = 0xFFFFFFFFLL;
83   ULong u0, v0, w0;
84    Long u1, v1, w1, w2, t;
85   u0   = u & halfMask;
86   u1   = u >> 32;
87   v0   = v & halfMask;
88   v1   = v >> 32;
89   w0   = u0 * v0;
90   t    = u1 * v0 + (w0 >> 32);
91   w1   = t & halfMask;
92   w2   = t >> 32;
93   w1   = u0 * v1 + w1;
94   *rHi = u1 * v1 + w2 + (w1 >> 32);
95   *rLo = (Long)((ULong)u * (ULong)v);
96}
97
98static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
99{
100   const ULong halfMask = 0xFFFFFFFFULL;
101   ULong u0, v0, w0;
102   ULong u1, v1, w1,w2,t;
103   u0   = u & halfMask;
104   u1   = u >> 32;
105   v0   = v & halfMask;
106   v1   = v >> 32;
107   w0   = u0 * v0;
108   t    = u1 * v0 + (w0 >> 32);
109   w1   = t & halfMask;
110   w2   = t >> 32;
111   w1   = u0 * v1 + w1;
112   *rHi = u1 * v1 + w2 + (w1 >> 32);
113   *rLo = u * v;
114}
115
116
117static const UChar parity_table[256] = {
118    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
119    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
120    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
123    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
124    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
125    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
126    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
127    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
128    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
131    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
132    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
135    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
136    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
139    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
140    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
141    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
142    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
143    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
144    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
147    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
148    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
149    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
150};
151
152/* generalised left-shifter */
153static inline Long lshift ( Long x, Int n )
154{
155   if (n >= 0)
156      return (ULong)x << n;
157   else
158      return x >> (-n);
159}
160
161/* identity on ULong */
162static inline ULong idULong ( ULong x )
163{
164   return x;
165}
166
167
168#define PREAMBLE(__data_bits)					\
169   /* const */ ULong DATA_MASK 					\
170      = __data_bits==8                                          \
171           ? 0xFFULL 					        \
172           : (__data_bits==16                                   \
173                ? 0xFFFFULL 		                        \
174                : (__data_bits==32                              \
175                     ? 0xFFFFFFFFULL                            \
176                     : 0xFFFFFFFFFFFFFFFFULL));                 \
177   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
178   /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
179   /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
180   /* const */ ULong CC_NDEP = cc_ndep_formal;			\
181   /* Four bogus assignments, which hopefully gcc can     */	\
182   /* optimise away, and which stop it complaining about  */	\
183   /* unused variables.                                   */	\
184   SIGN_MASK = SIGN_MASK;					\
185   DATA_MASK = DATA_MASK;					\
186   CC_DEP2 = CC_DEP2;						\
187   CC_NDEP = CC_NDEP;
188
189
190/*-------------------------------------------------------------*/
191
192#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
193{								\
194   PREAMBLE(DATA_BITS);						\
195   { ULong cf, pf, af, zf, sf, of;				\
196     ULong argL, argR, res;					\
197     argL = CC_DEP1;						\
198     argR = CC_DEP2;						\
199     res  = argL + argR;					\
200     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
201     pf = parity_table[(UChar)res];				\
202     af = (res ^ argL ^ argR) & 0x10;				\
203     zf = ((DATA_UTYPE)res == 0) << 6;				\
204     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
205     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
206                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
207     return cf | pf | af | zf | sf | of;			\
208   }								\
209}
210
211/*-------------------------------------------------------------*/
212
213#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
214{								\
215   PREAMBLE(DATA_BITS);						\
216   { ULong cf, pf, af, zf, sf, of;				\
217     ULong argL, argR, res;					\
218     argL = CC_DEP1;						\
219     argR = CC_DEP2;						\
220     res  = argL - argR;					\
221     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
222     pf = parity_table[(UChar)res];				\
223     af = (res ^ argL ^ argR) & 0x10;				\
224     zf = ((DATA_UTYPE)res == 0) << 6;				\
225     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
226     of = lshift((argL ^ argR) & (argL ^ res),	 		\
227                 12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
228     return cf | pf | af | zf | sf | of;			\
229   }								\
230}
231
232/*-------------------------------------------------------------*/
233
234#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
235{								\
236   PREAMBLE(DATA_BITS);						\
237   { ULong cf, pf, af, zf, sf, of;				\
238     ULong argL, argR, oldC, res;		 		\
239     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
240     argL = CC_DEP1;						\
241     argR = CC_DEP2 ^ oldC;	       				\
242     res  = (argL + argR) + oldC;				\
243     if (oldC)							\
244        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
245     else							\
246        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
247     pf = parity_table[(UChar)res];				\
248     af = (res ^ argL ^ argR) & 0x10;				\
249     zf = ((DATA_UTYPE)res == 0) << 6;				\
250     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
251     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
252                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
253     return cf | pf | af | zf | sf | of;			\
254   }								\
255}
256
257/*-------------------------------------------------------------*/
258
259#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
260{								\
261   PREAMBLE(DATA_BITS);						\
262   { ULong cf, pf, af, zf, sf, of;				\
263     ULong argL, argR, oldC, res;	       			\
264     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
265     argL = CC_DEP1;						\
266     argR = CC_DEP2 ^ oldC;	       				\
267     res  = (argL - argR) - oldC;				\
268     if (oldC)							\
269        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
270     else							\
271        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
272     pf = parity_table[(UChar)res];				\
273     af = (res ^ argL ^ argR) & 0x10;				\
274     zf = ((DATA_UTYPE)res == 0) << 6;				\
275     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
276     of = lshift((argL ^ argR) & (argL ^ res), 			\
277                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
278     return cf | pf | af | zf | sf | of;			\
279   }								\
280}
281
282/*-------------------------------------------------------------*/
283
284#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
285{								\
286   PREAMBLE(DATA_BITS);						\
287   { ULong cf, pf, af, zf, sf, of;				\
288     cf = 0;							\
289     pf = parity_table[(UChar)CC_DEP1];				\
290     af = 0;							\
291     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
292     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
293     of = 0;							\
294     return cf | pf | af | zf | sf | of;			\
295   }								\
296}
297
298/*-------------------------------------------------------------*/
299
300#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
301{								\
302   PREAMBLE(DATA_BITS);						\
303   { ULong cf, pf, af, zf, sf, of;				\
304     ULong argL, argR, res;					\
305     res  = CC_DEP1;						\
306     argL = res - 1;						\
307     argR = 1;							\
308     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
309     pf = parity_table[(UChar)res];				\
310     af = (res ^ argL ^ argR) & 0x10;				\
311     zf = ((DATA_UTYPE)res == 0) << 6;				\
312     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
313     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
314     return cf | pf | af | zf | sf | of;			\
315   }								\
316}
317
318/*-------------------------------------------------------------*/
319
320#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
321{								\
322   PREAMBLE(DATA_BITS);						\
323   { ULong cf, pf, af, zf, sf, of;				\
324     ULong argL, argR, res;					\
325     res  = CC_DEP1;						\
326     argL = res + 1;						\
327     argR = 1;							\
328     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
329     pf = parity_table[(UChar)res];				\
330     af = (res ^ argL ^ argR) & 0x10;				\
331     zf = ((DATA_UTYPE)res == 0) << 6;				\
332     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
333     of = ((res & DATA_MASK) 					\
334          == ((ULong)SIGN_MASK - 1)) << 11;			\
335     return cf | pf | af | zf | sf | of;			\
336   }								\
337}
338
339/*-------------------------------------------------------------*/
340
341#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
342{								\
343   PREAMBLE(DATA_BITS);						\
344   { ULong cf, pf, af, zf, sf, of;				\
345     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
346     pf = parity_table[(UChar)CC_DEP1];				\
347     af = 0; /* undefined */					\
348     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
349     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
350     /* of is defined if shift count == 1 */			\
351     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
352          & AMD64G_CC_MASK_O;					\
353     return cf | pf | af | zf | sf | of;			\
354   }								\
355}
356
357/*-------------------------------------------------------------*/
358
359#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
360{								\
361   PREAMBLE(DATA_BITS);  					\
362   { ULong cf, pf, af, zf, sf, of;				\
363     cf = CC_DEP2 & 1;						\
364     pf = parity_table[(UChar)CC_DEP1];				\
365     af = 0; /* undefined */					\
366     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
367     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
368     /* of is defined if shift count == 1 */			\
369     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
370          & AMD64G_CC_MASK_O;					\
371     return cf | pf | af | zf | sf | of;			\
372   }								\
373}
374
375/*-------------------------------------------------------------*/
376
377/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
378/* DEP1 = result, NDEP = old flags */
379#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
380{								\
381   PREAMBLE(DATA_BITS);						\
382   { ULong fl 							\
383        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
384          | (AMD64G_CC_MASK_C & CC_DEP1)			\
385          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
386                                      11-(DATA_BITS-1)) 	\
387                     ^ lshift(CC_DEP1, 11)));			\
388     return fl;							\
389   }								\
390}
391
392/*-------------------------------------------------------------*/
393
394/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
395/* DEP1 = result, NDEP = old flags */
396#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
397{								\
398   PREAMBLE(DATA_BITS);						\
399   { ULong fl 							\
400        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
401          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
402          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
403                                      11-(DATA_BITS-1)) 	\
404                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
405     return fl;							\
406   }								\
407}
408
409/*-------------------------------------------------------------*/
410
411#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
412                                DATA_U2TYPE, NARROWto2U)        \
413{                                                               \
414   PREAMBLE(DATA_BITS);                                         \
415   { ULong cf, pf, af, zf, sf, of;                              \
416     DATA_UTYPE  hi;                                            \
417     DATA_UTYPE  lo                                             \
418        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
419                     * ((DATA_UTYPE)CC_DEP2) );                 \
420     DATA_U2TYPE rr                                             \
421        = NARROWto2U(                                           \
422             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
423             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
424     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
425     cf = (hi != 0);                                            \
426     pf = parity_table[(UChar)lo];                              \
427     af = 0; /* undefined */                                    \
428     zf = (lo == 0) << 6;                                       \
429     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
430     of = cf << 11;                                             \
431     return cf | pf | af | zf | sf | of;                        \
432   }								\
433}
434
435/*-------------------------------------------------------------*/
436
437#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
438                                DATA_S2TYPE, NARROWto2S)        \
439{                                                               \
440   PREAMBLE(DATA_BITS);                                         \
441   { ULong cf, pf, af, zf, sf, of;                              \
442     DATA_STYPE  hi;                                            \
443     DATA_STYPE  lo                                             \
444        = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
445                     * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
446     DATA_S2TYPE rr                                             \
447        = NARROWto2S(                                           \
448             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
449             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
450     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
451     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
452     pf = parity_table[(UChar)lo];                              \
453     af = 0; /* undefined */                                    \
454     zf = (lo == 0) << 6;                                       \
455     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
456     of = cf << 11;                                             \
457     return cf | pf | af | zf | sf | of;                        \
458   }								\
459}
460
461/*-------------------------------------------------------------*/
462
463#define ACTIONS_UMULQ                                           \
464{                                                               \
465   PREAMBLE(64);                                                \
466   { ULong cf, pf, af, zf, sf, of;                              \
467     ULong lo, hi;                                              \
468     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
469     cf = (hi != 0);                                            \
470     pf = parity_table[(UChar)lo];                              \
471     af = 0; /* undefined */                                    \
472     zf = (lo == 0) << 6;                                       \
473     sf = lshift(lo, 8 - 64) & 0x80;                            \
474     of = cf << 11;                                             \
475     return cf | pf | af | zf | sf | of;                        \
476   }								\
477}
478
479/*-------------------------------------------------------------*/
480
481#define ACTIONS_SMULQ                                           \
482{                                                               \
483   PREAMBLE(64);                                                \
484   { ULong cf, pf, af, zf, sf, of;                              \
485     Long lo, hi;                                               \
486     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
487     cf = (hi != (lo >>/*s*/ (64-1)));                          \
488     pf = parity_table[(UChar)lo];                              \
489     af = 0; /* undefined */                                    \
490     zf = (lo == 0) << 6;                                       \
491     sf = lshift(lo, 8 - 64) & 0x80;                            \
492     of = cf << 11;                                             \
493     return cf | pf | af | zf | sf | of;                        \
494   }								\
495}
496
497/*-------------------------------------------------------------*/
498
499#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)			\
500{								\
501   PREAMBLE(DATA_BITS);						\
502   { ULong cf, pf, af, zf, sf, of;				\
503     cf = 0;							\
504     pf = 0;							\
505     af = 0;							\
506     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
507     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
508     of = 0;							\
509     return cf | pf | af | zf | sf | of;			\
510   }								\
511}
512
513/*-------------------------------------------------------------*/
514
515#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)			\
516{								\
517   PREAMBLE(DATA_BITS);						\
518   { ULong cf, pf, af, zf, sf, of;				\
519     cf = ((DATA_UTYPE)CC_DEP2 != 0);				\
520     pf = 0;							\
521     af = 0;							\
522     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
523     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
524     of = 0;							\
525     return cf | pf | af | zf | sf | of;			\
526   }								\
527}
528
529/*-------------------------------------------------------------*/
530
531#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)			\
532{								\
533   PREAMBLE(DATA_BITS);						\
534   { Long cf, pf, af, zf, sf, of;				\
535     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
536     pf = 0;							\
537     af = 0;							\
538     zf = 0;							\
539     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
540     of = 0;							\
541     return cf | pf | af | zf | sf | of;			\
542   }								\
543}
544
545/*-------------------------------------------------------------*/
546
547#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)			\
548{								\
549   PREAMBLE(DATA_BITS);						\
550   { ULong cf, pf, af, zf, sf, of;				\
551     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
552     pf = 0;							\
553     af = 0;							\
554     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
555     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
556     of = 0;							\
557     return cf | pf | af | zf | sf | of;			\
558   }								\
559}
560
561/*-------------------------------------------------------------*/
562
563#define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME)		\
564{								\
565   PREAMBLE(DATA_BITS);						\
566   { ULong ocf;	/* o or c */					\
567     ULong argL, argR, oldOC, res;				\
568     oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1;	\
569     argL  = CC_DEP1;						\
570     argR  = CC_DEP2 ^ oldOC;					\
571     res   = (argL + argR) + oldOC;				\
572     if (oldOC)							\
573        ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
574     else							\
575        ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
576     return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME)		\
577            | (ocf << AMD64G_CC_SHIFT_##FLAGNAME);		\
578   }								\
579}
580
581/*-------------------------------------------------------------*/
582
583
584#if PROFILE_RFLAGS
585
586static Bool initted     = False;
587
588/* C flag, fast route */
589static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
590/* C flag, slow route */
591static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
592/* table for calculate_cond */
593static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
594/* total entry counts for calc_all, calc_c, calc_cond. */
595static UInt n_calc_all  = 0;
596static UInt n_calc_c    = 0;
597static UInt n_calc_cond = 0;
598
599#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
600
601
602static void showCounts ( void )
603{
604   Int op, co;
605   HChar ch;
606   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
607              n_calc_all, n_calc_cond, n_calc_c);
608
609   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
610              "    S   NS    P   NP    L   NL   LE  NLE\n");
611   vex_printf("     -----------------------------------------------------"
612              "----------------------------------------\n");
613   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
614
615      ch = ' ';
616      if (op > 0 && (op-1) % 4 == 0)
617         ch = 'B';
618      if (op > 0 && (op-1) % 4 == 1)
619         ch = 'W';
620      if (op > 0 && (op-1) % 4 == 2)
621         ch = 'L';
622      if (op > 0 && (op-1) % 4 == 3)
623         ch = 'Q';
624
625      vex_printf("%2d%c: ", op, ch);
626      vex_printf("%6u ", tabc_slow[op]);
627      vex_printf("%6u ", tabc_fast[op]);
628      for (co = 0; co < 16; co++) {
629         Int n = tab_cond[op][co];
630         if (n >= 1000) {
631            vex_printf(" %3dK", n / 1000);
632         } else
633         if (n >= 0) {
634            vex_printf(" %3d ", n );
635         } else {
636            vex_printf("     ");
637         }
638      }
639      vex_printf("\n");
640   }
641   vex_printf("\n");
642}
643
644static void initCounts ( void )
645{
646   Int op, co;
647   initted = True;
648   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
649      tabc_fast[op] = tabc_slow[op] = 0;
650      for (co = 0; co < 16; co++)
651         tab_cond[op][co] = 0;
652   }
653}
654
655#endif /* PROFILE_RFLAGS */
656
657
658/* CALLED FROM GENERATED CODE: CLEAN HELPER */
659/* Calculate all the 6 flags from the supplied thunk parameters.
660   Worker function, not directly called from generated code. */
661static
662ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
663                                        ULong cc_dep1_formal,
664                                        ULong cc_dep2_formal,
665                                        ULong cc_ndep_formal )
666{
667   switch (cc_op) {
668      case AMD64G_CC_OP_COPY:
669         return cc_dep1_formal
670                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
671                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
672
673      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
674      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
675      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
676      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );
677
678      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
679      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
680      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
681      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );
682
683      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
684      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
685      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
686      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );
687
688      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
689      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
690      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
691      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );
692
693      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
694      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
695      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
696      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );
697
698      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
699      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
700      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
701      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );
702
703      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
704      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
705      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
706      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );
707
708      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
709      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
710      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
711      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );
712
713      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
714      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
715      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
716      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );
717
718      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
719      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
720      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
721      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );
722
723      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
724      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
725      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
726      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );
727
728      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
729                                                  UShort, toUShort );
730      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
731                                                  UInt,   toUInt );
732      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
733                                                  ULong,  idULong );
734
735      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;
736
737      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
738                                                  Short,  toUShort );
739      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
740                                                  Int,    toUInt   );
741      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
742                                                  Long,   idULong );
743
744      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;
745
746      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
747      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );
748
749      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
750      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );
751
752      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
753      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );
754
755      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
756      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );
757
758      case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt,  C );
759      case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );
760
761      case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt,  O );
762      case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );
763
764      default:
765         /* shouldn't really make these calls from generated code */
766         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
767                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
768                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
769         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
770   }
771}
772
773
774/* CALLED FROM GENERATED CODE: CLEAN HELPER */
775/* Calculate all the 6 flags from the supplied thunk parameters. */
776ULong amd64g_calculate_rflags_all ( ULong cc_op,
777                                    ULong cc_dep1,
778                                    ULong cc_dep2,
779                                    ULong cc_ndep )
780{
781#  if PROFILE_RFLAGS
782   if (!initted) initCounts();
783   n_calc_all++;
784   if (SHOW_COUNTS_NOW) showCounts();
785#  endif
786   return
787      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
788}
789
790
791/* CALLED FROM GENERATED CODE: CLEAN HELPER */
792/* Calculate just the carry flag from the supplied thunk parameters. */
793ULong amd64g_calculate_rflags_c ( ULong cc_op,
794                                  ULong cc_dep1,
795                                  ULong cc_dep2,
796                                  ULong cc_ndep )
797{
798#  if PROFILE_RFLAGS
799   if (!initted) initCounts();
800   n_calc_c++;
801   tabc_fast[cc_op]++;
802   if (SHOW_COUNTS_NOW) showCounts();
803#  endif
804
805   /* Fast-case some common ones. */
806   switch (cc_op) {
807      case AMD64G_CC_OP_COPY:
808         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
809      case AMD64G_CC_OP_LOGICQ:
810      case AMD64G_CC_OP_LOGICL:
811      case AMD64G_CC_OP_LOGICW:
812      case AMD64G_CC_OP_LOGICB:
813         return 0;
814	 //      case AMD64G_CC_OP_SUBL:
815	 //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
816	 //                   ? AMD64G_CC_MASK_C : 0;
817	 //      case AMD64G_CC_OP_SUBW:
818	 //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
819	 //                   ? AMD64G_CC_MASK_C : 0;
820	 //      case AMD64G_CC_OP_SUBB:
821	 //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
822	 //                   ? AMD64G_CC_MASK_C : 0;
823	 //      case AMD64G_CC_OP_INCL:
824	 //      case AMD64G_CC_OP_DECL:
825	 //         return cc_ndep & AMD64G_CC_MASK_C;
826      default:
827         break;
828   }
829
830#  if PROFILE_RFLAGS
831   tabc_fast[cc_op]--;
832   tabc_slow[cc_op]++;
833#  endif
834
835   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
836          & AMD64G_CC_MASK_C;
837}
838
839
840/* CALLED FROM GENERATED CODE: CLEAN HELPER */
841/* returns 1 or 0 */
842ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
843                                   ULong cc_op,
844                                   ULong cc_dep1,
845                                   ULong cc_dep2,
846                                   ULong cc_ndep )
847{
848   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
849                                                  cc_dep2, cc_ndep);
850   ULong of,sf,zf,cf,pf;
851   ULong inv = cond & 1;
852
853#  if PROFILE_RFLAGS
854   if (!initted) initCounts();
855   tab_cond[cc_op][cond]++;
856   n_calc_cond++;
857   if (SHOW_COUNTS_NOW) showCounts();
858#  endif
859
860   switch (cond) {
861      case AMD64CondNO:
862      case AMD64CondO: /* OF == 1 */
863         of = rflags >> AMD64G_CC_SHIFT_O;
864         return 1 & (inv ^ of);
865
866      case AMD64CondNZ:
867      case AMD64CondZ: /* ZF == 1 */
868         zf = rflags >> AMD64G_CC_SHIFT_Z;
869         return 1 & (inv ^ zf);
870
871      case AMD64CondNB:
872      case AMD64CondB: /* CF == 1 */
873         cf = rflags >> AMD64G_CC_SHIFT_C;
874         return 1 & (inv ^ cf);
875         break;
876
877      case AMD64CondNBE:
878      case AMD64CondBE: /* (CF or ZF) == 1 */
879         cf = rflags >> AMD64G_CC_SHIFT_C;
880         zf = rflags >> AMD64G_CC_SHIFT_Z;
881         return 1 & (inv ^ (cf | zf));
882         break;
883
884      case AMD64CondNS:
885      case AMD64CondS: /* SF == 1 */
886         sf = rflags >> AMD64G_CC_SHIFT_S;
887         return 1 & (inv ^ sf);
888
889      case AMD64CondNP:
890      case AMD64CondP: /* PF == 1 */
891         pf = rflags >> AMD64G_CC_SHIFT_P;
892         return 1 & (inv ^ pf);
893
894      case AMD64CondNL:
895      case AMD64CondL: /* (SF xor OF) == 1 */
896         sf = rflags >> AMD64G_CC_SHIFT_S;
897         of = rflags >> AMD64G_CC_SHIFT_O;
898         return 1 & (inv ^ (sf ^ of));
899         break;
900
901      case AMD64CondNLE:
902      case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
903         sf = rflags >> AMD64G_CC_SHIFT_S;
904         of = rflags >> AMD64G_CC_SHIFT_O;
905         zf = rflags >> AMD64G_CC_SHIFT_Z;
906         return 1 & (inv ^ ((sf ^ of) | zf));
907         break;
908
909      default:
910         /* shouldn't really make these calls from generated code */
911         vex_printf("amd64g_calculate_condition"
912                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
913                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
914         vpanic("amd64g_calculate_condition");
915   }
916}
917
918
919/* VISIBLE TO LIBVEX CLIENT */
920ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
921{
922   ULong rflags = amd64g_calculate_rflags_all_WRK(
923                     vex_state->guest_CC_OP,
924                     vex_state->guest_CC_DEP1,
925                     vex_state->guest_CC_DEP2,
926                     vex_state->guest_CC_NDEP
927                  );
928   Long dflag = vex_state->guest_DFLAG;
929   vassert(dflag == 1 || dflag == -1);
930   if (dflag == -1)
931      rflags |= (1<<10);
932   if (vex_state->guest_IDFLAG == 1)
933      rflags |= (1<<21);
934   if (vex_state->guest_ACFLAG == 1)
935      rflags |= (1<<18);
936
937   return rflags;
938}
939
940/* VISIBLE TO LIBVEX CLIENT */
941void
942LibVEX_GuestAMD64_put_rflags ( ULong rflags,
943                               /*MOD*/VexGuestAMD64State* vex_state )
944{
945   /* D flag */
946   if (rflags & AMD64G_CC_MASK_D) {
947      vex_state->guest_DFLAG = -1;
948      rflags &= ~AMD64G_CC_MASK_D;
949   }
950   else
951      vex_state->guest_DFLAG = 1;
952
953   /* ID flag */
954   if (rflags & AMD64G_CC_MASK_ID) {
955      vex_state->guest_IDFLAG = 1;
956      rflags &= ~AMD64G_CC_MASK_ID;
957   }
958   else
959      vex_state->guest_IDFLAG = 0;
960
961   /* AC flag */
962   if (rflags & AMD64G_CC_MASK_AC) {
963      vex_state->guest_ACFLAG = 1;
964      rflags &= ~AMD64G_CC_MASK_AC;
965   }
966   else
967      vex_state->guest_ACFLAG = 0;
968
969   UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
970                  AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
971   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
972   vex_state->guest_CC_DEP1 = rflags & cc_mask;
973   vex_state->guest_CC_DEP2 = 0;
974   vex_state->guest_CC_NDEP = 0;
975}
976
977/* VISIBLE TO LIBVEX CLIENT */
978void
979LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
980                               /*MOD*/VexGuestAMD64State* vex_state )
981{
982   ULong oszacp = amd64g_calculate_rflags_all_WRK(
983                     vex_state->guest_CC_OP,
984                     vex_state->guest_CC_DEP1,
985                     vex_state->guest_CC_DEP2,
986                     vex_state->guest_CC_NDEP
987                  );
988   if (new_carry_flag & 1) {
989      oszacp |= AMD64G_CC_MASK_C;
990   } else {
991      oszacp &= ~AMD64G_CC_MASK_C;
992   }
993   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
994   vex_state->guest_CC_DEP1 = oszacp;
995   vex_state->guest_CC_DEP2 = 0;
996   vex_state->guest_CC_NDEP = 0;
997}
998
999
1000/*---------------------------------------------------------------*/
1001/*--- %rflags translation-time function specialisers.         ---*/
1002/*--- These help iropt specialise calls the above run-time    ---*/
1003/*--- %rflags functions.                                      ---*/
1004/*---------------------------------------------------------------*/
1005
1006/* Used by the optimiser to try specialisations.  Returns an
1007   equivalent expression, or NULL if none. */
1008
1009static Bool isU64 ( IRExpr* e, ULong n )
1010{
1011   return toBool( e->tag == Iex_Const
1012                  && e->Iex.Const.con->tag == Ico_U64
1013                  && e->Iex.Const.con->Ico.U64 == n );
1014}
1015
1016IRExpr* guest_amd64_spechelper ( const HChar* function_name,
1017                                 IRExpr** args,
1018                                 IRStmt** precedingStmts,
1019                                 Int      n_precedingStmts )
1020{
1021#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1022#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1023#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1024#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1025#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))
1026
1027   Int i, arity = 0;
1028   for (i = 0; args[i]; i++)
1029      arity++;
1030#  if 0
1031   vex_printf("spec request:\n");
1032   vex_printf("   %s  ", function_name);
1033   for (i = 0; i < arity; i++) {
1034      vex_printf("  ");
1035      ppIRExpr(args[i]);
1036   }
1037   vex_printf("\n");
1038#  endif
1039
1040   /* --------- specialising "amd64g_calculate_condition" --------- */
1041
1042   if (vex_streq(function_name, "amd64g_calculate_condition")) {
1043      /* specialise calls to above "calculate condition" function */
1044      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
1045      vassert(arity == 5);
1046      cond    = args[0];
1047      cc_op   = args[1];
1048      cc_dep1 = args[2];
1049      cc_dep2 = args[3];
1050
1051      /*---------------- ADDQ ----------------*/
1052
1053      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
1054         /* long long add, then Z --> test (dst+src == 0) */
1055         return unop(Iop_1Uto64,
1056                     binop(Iop_CmpEQ64,
1057                           binop(Iop_Add64, cc_dep1, cc_dep2),
1058                           mkU64(0)));
1059      }
1060
1061      /*---------------- ADDL ----------------*/
1062
1063      if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
1064         /* This is very commonly generated by Javascript JITs, for
1065            the idiom "do a 32-bit add and jump to out-of-line code if
1066            an overflow occurs". */
1067         /* long add, then O (overflow)
1068            --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
1069            --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1070            --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1071         */
1072         vassert(isIRAtom(cc_dep1));
1073         vassert(isIRAtom(cc_dep2));
1074         return
1075            binop(Iop_And64,
1076                  binop(Iop_Shr64,
1077                        binop(Iop_And64,
1078                              unop(Iop_Not64,
1079                                   binop(Iop_Xor64, cc_dep1, cc_dep2)),
1080                              binop(Iop_Xor64,
1081                                    cc_dep1,
1082                                    binop(Iop_Add64, cc_dep1, cc_dep2))),
1083                        mkU8(31)),
1084                  mkU64(1));
1085
1086      }
1087
1088      /*---------------- SUBQ ----------------*/
1089
1090      /* 0, */
1091      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
1092         /* long long sub/cmp, then O (overflow)
1093            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
1094            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
1095         */
1096         vassert(isIRAtom(cc_dep1));
1097         vassert(isIRAtom(cc_dep2));
1098         return binop(Iop_Shr64,
1099                      binop(Iop_And64,
1100                            binop(Iop_Xor64, cc_dep1, cc_dep2),
1101                            binop(Iop_Xor64,
1102                                  cc_dep1,
1103                                  binop(Iop_Sub64, cc_dep1, cc_dep2))),
1104                      mkU8(63));
1105      }
1106      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
1107         /* No action.  Never yet found a test case. */
1108      }
1109
1110      /* 2, 3 */
1111      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1112         /* long long sub/cmp, then B (unsigned less than)
1113            --> test dst <u src */
1114         return unop(Iop_1Uto64,
1115                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1116      }
1117      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1118         /* long long sub/cmp, then NB (unsigned greater than or equal)
1119            --> test src <=u dst */
1120         /* Note, args are opposite way round from the usual */
1121         return unop(Iop_1Uto64,
1122                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1123      }
1124
1125      /* 4, 5 */
1126      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
1127         /* long long sub/cmp, then Z --> test dst==src */
1128         return unop(Iop_1Uto64,
1129                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1130      }
1131      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1132         /* long long sub/cmp, then NZ --> test dst!=src */
1133         return unop(Iop_1Uto64,
1134                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1135      }
1136
1137      /* 6, 7 */
1138      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1139         /* long long sub/cmp, then BE (unsigned less than or equal)
1140            --> test dst <=u src */
1141         return unop(Iop_1Uto64,
1142                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1143      }
1144      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1145         /* long long sub/cmp, then NBE (unsigned greater than)
1146            --> test !(dst <=u src) */
1147         return binop(Iop_Xor64,
1148                      unop(Iop_1Uto64,
1149                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1150                      mkU64(1));
1151      }
1152
1153      /* 8, 9 */
1154      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
1155         /* long long sub/cmp, then S (negative)
1156            --> (dst-src)[63]
1157            --> (dst-src) >>u 63 */
1158         return binop(Iop_Shr64,
1159                      binop(Iop_Sub64, cc_dep1, cc_dep2),
1160                      mkU8(63));
1161      }
1162      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
1163         /* long long sub/cmp, then NS (not negative)
1164            --> (dst-src)[63] ^ 1
1165            --> ((dst-src) >>u 63) ^ 1 */
1166         return binop(Iop_Xor64,
1167                      binop(Iop_Shr64,
1168                            binop(Iop_Sub64, cc_dep1, cc_dep2),
1169                            mkU8(63)),
1170                      mkU64(1));
1171      }
1172
1173      /* 12, 13 */
1174      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1175         /* long long sub/cmp, then L (signed less than)
1176            --> test dst <s src */
1177         return unop(Iop_1Uto64,
1178                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1179      }
1180      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
1181         /* long long sub/cmp, then NL (signed greater than or equal)
1182            --> test dst >=s src
1183            --> test src <=s dst */
1184         return unop(Iop_1Uto64,
1185                     binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1186      }
1187
1188      /* 14, 15 */
1189      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
1190         /* long long sub/cmp, then LE (signed less than or equal)
1191            --> test dst <=s src */
1192         return unop(Iop_1Uto64,
1193                     binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1194      }
1195      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1196         /* long sub/cmp, then NLE (signed greater than)
1197            --> test !(dst <=s src)
1198            --> test (dst >s src)
1199            --> test (src <s dst) */
1200         return unop(Iop_1Uto64,
1201                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1202
1203      }
1204
1205      /*---------------- SUBL ----------------*/
1206
1207      /* 0, */
1208      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
1209         /* This is very commonly generated by Javascript JITs, for
1210            the idiom "do a 32-bit subtract and jump to out-of-line
1211            code if an overflow occurs". */
1212         /* long sub/cmp, then O (overflow)
1213            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1214            --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1215         */
1216         vassert(isIRAtom(cc_dep1));
1217         vassert(isIRAtom(cc_dep2));
1218         return
1219            binop(Iop_And64,
1220                  binop(Iop_Shr64,
1221                        binop(Iop_And64,
1222                              binop(Iop_Xor64, cc_dep1, cc_dep2),
1223                              binop(Iop_Xor64,
1224                                    cc_dep1,
1225                                    binop(Iop_Sub64, cc_dep1, cc_dep2))),
1226                        mkU8(31)),
1227                  mkU64(1));
1228      }
1229      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
1230         /* No action.  Never yet found a test case. */
1231      }
1232
1233      /* 2, 3 */
1234      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1235         /* long sub/cmp, then B (unsigned less than)
1236            --> test dst <u src */
1237         return unop(Iop_1Uto64,
1238                     binop(Iop_CmpLT32U,
1239                           unop(Iop_64to32, cc_dep1),
1240                           unop(Iop_64to32, cc_dep2)));
1241      }
1242      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
1243         /* long sub/cmp, then NB (unsigned greater than or equal)
1244            --> test src <=u dst */
1245         /* Note, args are opposite way round from the usual */
1246         return unop(Iop_1Uto64,
1247                     binop(Iop_CmpLE32U,
1248                           unop(Iop_64to32, cc_dep2),
1249                           unop(Iop_64to32, cc_dep1)));
1250      }
1251
1252      /* 4, 5 */
1253      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1254         /* long sub/cmp, then Z --> test dst==src */
1255         return unop(Iop_1Uto64,
1256                     binop(Iop_CmpEQ32,
1257                           unop(Iop_64to32, cc_dep1),
1258                           unop(Iop_64to32, cc_dep2)));
1259      }
1260      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1261         /* long sub/cmp, then NZ --> test dst!=src */
1262         return unop(Iop_1Uto64,
1263                     binop(Iop_CmpNE32,
1264                           unop(Iop_64to32, cc_dep1),
1265                           unop(Iop_64to32, cc_dep2)));
1266      }
1267
1268      /* 6, 7 */
1269      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1270         /* long sub/cmp, then BE (unsigned less than or equal)
1271            --> test dst <=u src */
1272         return unop(Iop_1Uto64,
1273                     binop(Iop_CmpLE32U,
1274                           unop(Iop_64to32, cc_dep1),
1275                           unop(Iop_64to32, cc_dep2)));
1276      }
1277      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1278         /* long sub/cmp, then NBE (unsigned greater than)
1279            --> test src <u dst */
1280         /* Note, args are opposite way round from the usual */
1281         return unop(Iop_1Uto64,
1282                     binop(Iop_CmpLT32U,
1283                           unop(Iop_64to32, cc_dep2),
1284                           unop(Iop_64to32, cc_dep1)));
1285      }
1286
1287      /* 8, 9 */
1288      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1289         /* long sub/cmp, then S (negative)
1290            --> (dst-src)[31]
1291            --> ((dst -64 src) >>u 31) & 1
1292            Pointless to narrow the args to 32 bit before the subtract. */
1293         return binop(Iop_And64,
1294                      binop(Iop_Shr64,
1295                            binop(Iop_Sub64, cc_dep1, cc_dep2),
1296                            mkU8(31)),
1297                      mkU64(1));
1298      }
1299      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
1300         /* long sub/cmp, then NS (not negative)
1301            --> (dst-src)[31] ^ 1
1302            --> (((dst -64 src) >>u 31) & 1) ^ 1
1303            Pointless to narrow the args to 32 bit before the subtract. */
1304         return binop(Iop_Xor64,
1305                      binop(Iop_And64,
1306                            binop(Iop_Shr64,
1307                                  binop(Iop_Sub64, cc_dep1, cc_dep2),
1308                                  mkU8(31)),
1309                            mkU64(1)),
1310                      mkU64(1));
1311      }
1312
1313      /* 12, 13 */
1314      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1315         /* long sub/cmp, then L (signed less than)
1316            --> test dst <s src */
1317         return unop(Iop_1Uto64,
1318                     binop(Iop_CmpLT32S,
1319                           unop(Iop_64to32, cc_dep1),
1320                           unop(Iop_64to32, cc_dep2)));
1321      }
1322      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
1323         /* long sub/cmp, then NL (signed greater than or equal)
1324            --> test dst >=s src
1325            --> test src <=s dst */
1326         return unop(Iop_1Uto64,
1327                     binop(Iop_CmpLE32S,
1328                           unop(Iop_64to32, cc_dep2),
1329                           unop(Iop_64to32, cc_dep1)));
1330      }
1331
1332      /* 14, 15 */
1333      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1334         /* long sub/cmp, then LE (signed less than or equal)
1335            --> test dst <=s src */
1336         return unop(Iop_1Uto64,
1337                     binop(Iop_CmpLE32S,
1338                           unop(Iop_64to32, cc_dep1),
1339                           unop(Iop_64to32, cc_dep2)));
1340
1341      }
1342      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1343         /* long sub/cmp, then NLE (signed greater than)
1344            --> test !(dst <=s src)
1345            --> test (dst >s src)
1346            --> test (src <s dst) */
1347         return unop(Iop_1Uto64,
1348                     binop(Iop_CmpLT32S,
1349                           unop(Iop_64to32, cc_dep2),
1350                           unop(Iop_64to32, cc_dep1)));
1351
1352      }
1353
1354      /*---------------- SUBW ----------------*/
1355
1356      /* 4, 5 */
1357      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1358         /* word sub/cmp, then Z --> test dst==src */
1359         return unop(Iop_1Uto64,
1360                     binop(Iop_CmpEQ16,
1361                           unop(Iop_64to16,cc_dep1),
1362                           unop(Iop_64to16,cc_dep2)));
1363      }
1364      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1365         /* word sub/cmp, then NZ --> test dst!=src */
1366         return unop(Iop_1Uto64,
1367                     binop(Iop_CmpNE16,
1368                           unop(Iop_64to16,cc_dep1),
1369                           unop(Iop_64to16,cc_dep2)));
1370      }
1371
1372      /* 6, */
1373      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
1374         /* word sub/cmp, then BE (unsigned less than or equal)
1375            --> test dst <=u src */
1376         return unop(Iop_1Uto64,
1377                     binop(Iop_CmpLE64U,
1378                           binop(Iop_Shl64, cc_dep1, mkU8(48)),
1379                           binop(Iop_Shl64, cc_dep2, mkU8(48))));
1380      }
1381
1382      /* 8, 9 */
1383      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
1384                                          && isU64(cc_dep2, 0)) {
1385         /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
1386                                         --> test dst <s 0
1387                                         --> (ULong)dst[15]
1388            This is yet another scheme by which clang figures out if the
1389            top bit of a word is 1 or 0.  See also LOGICB/CondS below. */
1390         /* Note: isU64(cc_dep2, 0) is correct, even though this is
1391            for an 16-bit comparison, since the args to the helper
1392            function are always U64s. */
1393         return binop(Iop_And64,
1394                      binop(Iop_Shr64,cc_dep1,mkU8(15)),
1395                      mkU64(1));
1396      }
1397      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
1398                                          && isU64(cc_dep2, 0)) {
1399         /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1400                                          --> test !(dst <s 0)
1401                                          --> (ULong) !dst[15]
1402         */
1403         return binop(Iop_Xor64,
1404                      binop(Iop_And64,
1405                            binop(Iop_Shr64,cc_dep1,mkU8(15)),
1406                            mkU64(1)),
1407                      mkU64(1));
1408      }
1409
1410      /* 14, */
1411      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1412         /* word sub/cmp, then LE (signed less than or equal)
1413            --> test dst <=s src */
1414         return unop(Iop_1Uto64,
1415                     binop(Iop_CmpLE64S,
1416                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
1417                           binop(Iop_Shl64,cc_dep2,mkU8(48))));
1418
1419      }
1420
1421      /*---------------- SUBB ----------------*/
1422
1423      /* 2, 3 */
1424      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
1425         /* byte sub/cmp, then B (unsigned less than)
1426            --> test dst <u src */
1427         return unop(Iop_1Uto64,
1428                     binop(Iop_CmpLT64U,
1429                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1430                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1431      }
1432      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
1433         /* byte sub/cmp, then NB (unsigned greater than or equal)
1434            --> test src <=u dst */
1435         /* Note, args are opposite way round from the usual */
1436         return unop(Iop_1Uto64,
1437                     binop(Iop_CmpLE64U,
1438                           binop(Iop_And64, cc_dep2, mkU64(0xFF)),
1439                           binop(Iop_And64, cc_dep1, mkU64(0xFF))));
1440      }
1441
1442      /* 4, 5 */
1443      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1444         /* byte sub/cmp, then Z --> test dst==src */
1445         return unop(Iop_1Uto64,
1446                     binop(Iop_CmpEQ8,
1447                           unop(Iop_64to8,cc_dep1),
1448                           unop(Iop_64to8,cc_dep2)));
1449      }
1450      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1451         /* byte sub/cmp, then NZ --> test dst!=src */
1452         return unop(Iop_1Uto64,
1453                     binop(Iop_CmpNE8,
1454                           unop(Iop_64to8,cc_dep1),
1455                           unop(Iop_64to8,cc_dep2)));
1456      }
1457
1458      /* 6, */
1459      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1460         /* byte sub/cmp, then BE (unsigned less than or equal)
1461            --> test dst <=u src */
1462         return unop(Iop_1Uto64,
1463                     binop(Iop_CmpLE64U,
1464                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1465                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1466      }
1467
1468      /* 8, 9 */
1469      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1470                                          && isU64(cc_dep2, 0)) {
1471         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1472                                         --> test dst <s 0
1473                                         --> (ULong)dst[7]
1474            This is yet another scheme by which gcc figures out if the
1475            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
1476         /* Note: isU64(cc_dep2, 0) is correct, even though this is
1477            for an 8-bit comparison, since the args to the helper
1478            function are always U64s. */
1479         return binop(Iop_And64,
1480                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
1481                      mkU64(1));
1482      }
1483      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1484                                          && isU64(cc_dep2, 0)) {
1485         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1486                                          --> test !(dst <s 0)
1487                                          --> (ULong) !dst[7]
1488         */
1489         return binop(Iop_Xor64,
1490                      binop(Iop_And64,
1491                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
1492                            mkU64(1)),
1493                      mkU64(1));
1494      }
1495
1496      /*---------------- LOGICQ ----------------*/
1497
1498      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1499         /* long long and/or/xor, then Z --> test dst==0 */
1500         return unop(Iop_1Uto64,
1501                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1502      }
1503      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1504         /* long long and/or/xor, then NZ --> test dst!=0 */
1505         return unop(Iop_1Uto64,
1506                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1507      }
1508
1509      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1510         /* long long and/or/xor, then L
1511            LOGIC sets SF and ZF according to the
1512            result and makes OF be zero.  L computes SF ^ OF, but
1513            OF is zero, so this reduces to SF -- which will be 1 iff
1514            the result is < signed 0.  Hence ...
1515         */
1516         return unop(Iop_1Uto64,
1517                     binop(Iop_CmpLT64S,
1518                           cc_dep1,
1519                           mkU64(0)));
1520      }
1521
1522      /*---------------- LOGICL ----------------*/
1523
1524      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1525         /* long and/or/xor, then Z --> test dst==0 */
1526         return unop(Iop_1Uto64,
1527                     binop(Iop_CmpEQ32,
1528                           unop(Iop_64to32, cc_dep1),
1529                           mkU32(0)));
1530      }
1531      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1532         /* long and/or/xor, then NZ --> test dst!=0 */
1533         return unop(Iop_1Uto64,
1534                     binop(Iop_CmpNE32,
1535                           unop(Iop_64to32, cc_dep1),
1536                           mkU32(0)));
1537      }
1538
1539      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1540         /* long and/or/xor, then LE
1541            This is pretty subtle.  LOGIC sets SF and ZF according to the
1542            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
1543            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1544            the result is <=signed 0.  Hence ...
1545         */
1546         return unop(Iop_1Uto64,
1547                     binop(Iop_CmpLE32S,
1548                           unop(Iop_64to32, cc_dep1),
1549                           mkU32(0)));
1550      }
1551
1552      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1553         /* long and/or/xor, then S --> (ULong)result[31] */
1554         return binop(Iop_And64,
1555                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
1556                      mkU64(1));
1557      }
1558      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1559         /* long and/or/xor, then S --> (ULong) ~ result[31] */
1560         return binop(Iop_Xor64,
1561                binop(Iop_And64,
1562                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
1563                      mkU64(1)),
1564                mkU64(1));
1565      }
1566
1567      /*---------------- LOGICW ----------------*/
1568
1569      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1570         /* word and/or/xor, then Z --> test dst==0 */
1571         return unop(Iop_1Uto64,
1572                     binop(Iop_CmpEQ64,
1573                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1574                           mkU64(0)));
1575      }
1576      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1577         /* word and/or/xor, then NZ --> test dst!=0 */
1578         return unop(Iop_1Uto64,
1579                     binop(Iop_CmpNE64,
1580                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1581                           mkU64(0)));
1582      }
1583
1584      /*---------------- LOGICB ----------------*/
1585
1586      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1587         /* byte and/or/xor, then Z --> test dst==0 */
1588         return unop(Iop_1Uto64,
1589                     binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
1590                                        mkU64(0)));
1591      }
1592      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1593         /* byte and/or/xor, then NZ --> test dst!=0 */
1594         return unop(Iop_1Uto64,
1595                     binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
1596                                        mkU64(0)));
1597      }
1598
1599      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1600         /* this is an idiom gcc sometimes uses to find out if the top
1601            bit of a byte register is set: eg testb %al,%al; js ..
1602            Since it just depends on the top bit of the byte, extract
1603            that bit and explicitly get rid of all the rest.  This
1604            helps memcheck avoid false positives in the case where any
1605            of the other bits in the byte are undefined. */
1606         /* byte and/or/xor, then S --> (UInt)result[7] */
1607         return binop(Iop_And64,
1608                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
1609                      mkU64(1));
1610      }
1611      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1612         /* byte and/or/xor, then NS --> (UInt)!result[7] */
1613         return binop(Iop_Xor64,
1614                      binop(Iop_And64,
1615                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
1616                            mkU64(1)),
1617                      mkU64(1));
1618      }
1619
1620      /*---------------- INCB ----------------*/
1621
1622      if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1623         /* 8-bit inc, then LE --> sign bit of the arg */
1624         return binop(Iop_And64,
1625                      binop(Iop_Shr64,
1626                            binop(Iop_Sub64, cc_dep1, mkU64(1)),
1627                            mkU8(7)),
1628                      mkU64(1));
1629      }
1630
1631      /*---------------- INCW ----------------*/
1632
1633      if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1634         /* 16-bit inc, then Z --> test dst == 0 */
1635         return unop(Iop_1Uto64,
1636                     binop(Iop_CmpEQ64,
1637                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
1638                           mkU64(0)));
1639      }
1640
1641      /*---------------- DECL ----------------*/
1642
1643      if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1644         /* dec L, then Z --> test dst == 0 */
1645         return unop(Iop_1Uto64,
1646                     binop(Iop_CmpEQ32,
1647                           unop(Iop_64to32, cc_dep1),
1648                           mkU32(0)));
1649      }
1650
1651      /*---------------- DECW ----------------*/
1652
1653      if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1654         /* 16-bit dec, then NZ --> test dst != 0 */
1655         return unop(Iop_1Uto64,
1656                     binop(Iop_CmpNE64,
1657                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
1658                           mkU64(0)));
1659      }
1660
1661      /*---------------- SHRQ ----------------*/
1662
1663      if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1664         /* SHRQ, then Z --> test dep1 == 0 */
1665         return unop(Iop_1Uto64,
1666                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1667      }
1668      if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1669         /* SHRQ, then NZ --> test dep1 != 0 */
1670         return unop(Iop_1Uto64,
1671                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1672      }
1673
1674      /*---------------- SHRL ----------------*/
1675
1676      if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1677         /* SHRL, then Z --> test dep1 == 0 */
1678         return unop(Iop_1Uto64,
1679                     binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1680                           mkU32(0)));
1681      }
1682      if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1683         /* SHRL, then NZ --> test dep1 != 0 */
1684         return unop(Iop_1Uto64,
1685                     binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1686                           mkU32(0)));
1687      }
1688
1689      /*---------------- COPY ----------------*/
1690      /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1691         jbe" for example. */
1692
1693      if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
1694          (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1695         /* COPY, then BE --> extract C and Z from dep1, and test (C
1696            or Z == 1). */
1697         /* COPY, then NBE --> extract C and Z from dep1, and test (C
1698            or Z == 0). */
1699         ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1700         return
1701            unop(
1702               Iop_1Uto64,
1703               binop(
1704                  Iop_CmpEQ64,
1705                  binop(
1706                     Iop_And64,
1707                     binop(
1708                        Iop_Or64,
1709                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1710                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1711                     ),
1712                     mkU64(1)
1713                  ),
1714                  mkU64(nnn)
1715               )
1716            );
1717      }
1718
1719      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
1720         /* COPY, then B --> extract C dep1, and test (C == 1). */
1721         return
1722            unop(
1723               Iop_1Uto64,
1724               binop(
1725                  Iop_CmpNE64,
1726                  binop(
1727                     Iop_And64,
1728                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1729                     mkU64(1)
1730                  ),
1731                  mkU64(0)
1732               )
1733            );
1734      }
1735
1736      if (isU64(cc_op, AMD64G_CC_OP_COPY)
1737          && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1738         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1739         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1740         UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1741         return
1742            unop(
1743               Iop_1Uto64,
1744               binop(
1745                  Iop_CmpEQ64,
1746                  binop(
1747                     Iop_And64,
1748                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1749                     mkU64(1)
1750                  ),
1751                  mkU64(nnn)
1752               )
1753            );
1754      }
1755
1756      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
1757         /* COPY, then P --> extract P from dep1, and test (P == 1). */
1758         return
1759            unop(
1760               Iop_1Uto64,
1761               binop(
1762                  Iop_CmpNE64,
1763                  binop(
1764                     Iop_And64,
1765                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1766                     mkU64(1)
1767                  ),
1768                  mkU64(0)
1769               )
1770            );
1771      }
1772
1773      return NULL;
1774   }
1775
1776   /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1777
1778   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1779      /* specialise calls to above "calculate_rflags_c" function */
1780      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1781      vassert(arity == 4);
1782      cc_op   = args[0];
1783      cc_dep1 = args[1];
1784      cc_dep2 = args[2];
1785      cc_ndep = args[3];
1786
1787      if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1788         /* C after sub denotes unsigned less than */
1789         return unop(Iop_1Uto64,
1790                     binop(Iop_CmpLT64U,
1791                           cc_dep1,
1792                           cc_dep2));
1793      }
1794      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1795         /* C after sub denotes unsigned less than */
1796         return unop(Iop_1Uto64,
1797                     binop(Iop_CmpLT32U,
1798                           unop(Iop_64to32, cc_dep1),
1799                           unop(Iop_64to32, cc_dep2)));
1800      }
1801      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1802         /* C after sub denotes unsigned less than */
1803         return unop(Iop_1Uto64,
1804                     binop(Iop_CmpLT64U,
1805                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1806                           binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1807      }
1808      if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
1809         /* C after add denotes sum <u either arg */
1810         return unop(Iop_1Uto64,
1811                     binop(Iop_CmpLT64U,
1812                           binop(Iop_Add64, cc_dep1, cc_dep2),
1813                           cc_dep1));
1814      }
1815      if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
1816         /* C after add denotes sum <u either arg */
1817         return unop(Iop_1Uto64,
1818                     binop(Iop_CmpLT32U,
1819                           unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
1820                           unop(Iop_64to32, cc_dep1)));
1821      }
1822      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1823          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1824          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1825          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1826         /* cflag after logic is zero */
1827         return mkU64(0);
1828      }
1829      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1830          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1831         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1832         return cc_ndep;
1833      }
1834
1835#     if 0
1836      if (cc_op->tag == Iex_Const) {
1837         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1838      }
1839#     endif
1840
1841      return NULL;
1842   }
1843
1844#  undef unop
1845#  undef binop
1846#  undef mkU64
1847#  undef mkU32
1848#  undef mkU8
1849
1850   return NULL;
1851}
1852
1853
1854/*---------------------------------------------------------------*/
1855/*--- Supporting functions for x87 FPU activities.            ---*/
1856/*---------------------------------------------------------------*/
1857
1858static inline Bool host_is_little_endian ( void )
1859{
1860   UInt x = 0x76543210;
1861   UChar* p = (UChar*)(&x);
1862   return toBool(*p == 0x10);
1863}
1864
1865/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1866/* CALLED FROM GENERATED CODE: CLEAN HELPER */
1867ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1868{
1869   Bool   mantissaIsZero;
1870   Int    bexp;
1871   UChar  sign;
1872   UChar* f64;
1873
1874   vassert(host_is_little_endian());
1875
1876   /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1877
1878   f64  = (UChar*)(&dbl);
1879   sign = toUChar( (f64[7] >> 7) & 1 );
1880
1881   /* First off, if the tag indicates the register was empty,
1882      return 1,0,sign,1 */
1883   if (tag == 0) {
1884      /* vex_printf("Empty\n"); */
1885      return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1886                                   | AMD64G_FC_MASK_C0;
1887   }
1888
1889   bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1890   bexp &= 0x7FF;
1891
1892   mantissaIsZero
1893      = toBool(
1894           (f64[6] & 0x0F) == 0
1895           && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1896        );
1897
1898   /* If both exponent and mantissa are zero, the value is zero.
1899      Return 1,0,sign,0. */
1900   if (bexp == 0 && mantissaIsZero) {
1901      /* vex_printf("Zero\n"); */
1902      return AMD64G_FC_MASK_C3 | 0
1903                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1904   }
1905
1906   /* If exponent is zero but mantissa isn't, it's a denormal.
1907      Return 1,1,sign,0. */
1908   if (bexp == 0 && !mantissaIsZero) {
1909      /* vex_printf("Denormal\n"); */
1910      return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1911                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1912   }
1913
1914   /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1915      Return 0,1,sign,1. */
1916   if (bexp == 0x7FF && mantissaIsZero) {
1917      /* vex_printf("Inf\n"); */
1918      return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
1919                                   | AMD64G_FC_MASK_C0;
1920   }
1921
1922   /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
1923      Return 0,0,sign,1. */
1924   if (bexp == 0x7FF && !mantissaIsZero) {
1925      /* vex_printf("NaN\n"); */
1926      return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
1927   }
1928
1929   /* Uh, ok, we give up.  It must be a normal finite number.
1930      Return 0,1,sign,0.
1931   */
1932   /* vex_printf("normal\n"); */
1933   return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1934}
1935
1936
1937/* This is used to implement both 'frstor' and 'fldenv'.  The latter
1938   appears to differ from the former only in that the 8 FP registers
1939   themselves are not transferred into the guest state. */
1940static
1941VexEmNote do_put_x87 ( Bool moveRegs,
1942                       /*IN*/Fpu_State* x87_state,
1943                       /*OUT*/VexGuestAMD64State* vex_state )
1944{
1945   Int        stno, preg;
1946   UInt       tag;
1947   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1948   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1949   UInt       ftop    = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
1950   UInt       tagw    = x87_state->env[FP_ENV_TAG];
1951   UInt       fpucw   = x87_state->env[FP_ENV_CTRL];
1952   UInt       c3210   = x87_state->env[FP_ENV_STAT] & 0x4700;
1953   VexEmNote  ew;
1954   UInt       fpround;
1955   ULong      pair;
1956
1957   /* Copy registers and tags */
1958   for (stno = 0; stno < 8; stno++) {
1959      preg = (stno + ftop) & 7;
1960      tag = (tagw >> (2*preg)) & 3;
1961      if (tag == 3) {
1962         /* register is empty */
1963         /* hmm, if it's empty, does it still get written?  Probably
1964            safer to say it does.  If we don't, memcheck could get out
1965            of sync, in that it thinks all FP registers are defined by
1966            this helper, but in reality some have not been updated. */
1967         if (moveRegs)
1968            vexRegs[preg] = 0; /* IEEE754 64-bit zero */
1969         vexTags[preg] = 0;
1970      } else {
1971         /* register is non-empty */
1972         if (moveRegs)
1973            convert_f80le_to_f64le( &x87_state->reg[10*stno],
1974                                    (UChar*)&vexRegs[preg] );
1975         vexTags[preg] = 1;
1976      }
1977   }
1978
1979   /* stack pointer */
1980   vex_state->guest_FTOP = ftop;
1981
1982   /* status word */
1983   vex_state->guest_FC3210 = c3210;
1984
1985   /* handle the control word, setting FPROUND and detecting any
1986      emulation warnings. */
1987   pair    = amd64g_check_fldcw ( (ULong)fpucw );
1988   fpround = (UInt)pair & 0xFFFFFFFFULL;
1989   ew      = (VexEmNote)(pair >> 32);
1990
1991   vex_state->guest_FPROUND = fpround & 3;
1992
1993   /* emulation warnings --> caller */
1994   return ew;
1995}
1996
1997
1998/* Create an x87 FPU state from the guest state, as close as
1999   we can approximate it. */
2000static
2001void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2002                  /*OUT*/Fpu_State* x87_state )
2003{
2004   Int        i, stno, preg;
2005   UInt       tagw;
2006   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2007   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2008   UInt       ftop    = vex_state->guest_FTOP;
2009   UInt       c3210   = vex_state->guest_FC3210;
2010
2011   for (i = 0; i < 14; i++)
2012      x87_state->env[i] = 0;
2013
2014   x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2015      = x87_state->env[13] = 0xFFFF;
2016   x87_state->env[FP_ENV_STAT]
2017      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2018   x87_state->env[FP_ENV_CTRL]
2019      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2020
2021   /* Dump the register stack in ST order. */
2022   tagw = 0;
2023   for (stno = 0; stno < 8; stno++) {
2024      preg = (stno + ftop) & 7;
2025      if (vexTags[preg] == 0) {
2026         /* register is empty */
2027         tagw |= (3 << (2*preg));
2028         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2029                                 &x87_state->reg[10*stno] );
2030      } else {
2031         /* register is full. */
2032         tagw |= (0 << (2*preg));
2033         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2034                                 &x87_state->reg[10*stno] );
2035      }
2036   }
2037   x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2038}
2039
2040
2041/*---------------------------------------------------------------*/
2042/*--- Supporting functions for XSAVE/FXSAVE.                  ---*/
2043/*---------------------------------------------------------------*/
2044
2045/* CALLED FROM GENERATED CODE */
2046/* DIRTY HELPER (reads guest state, writes guest mem) */
2047/* XSAVE component 0 is the x87 FPU state. */
2048void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2049        ( VexGuestAMD64State* gst, HWord addr )
2050{
2051   /* Derived from values obtained from
2052      vendor_id       : AuthenticAMD
2053      cpu family      : 15
2054      model           : 12
2055      model name      : AMD Athlon(tm) 64 Processor 3200+
2056      stepping        : 0
2057      cpu MHz         : 2200.000
2058      cache size      : 512 KB
2059   */
2060   /* Somewhat roundabout, but at least it's simple. */
2061   Fpu_State tmp;
2062   UShort*   addrS = (UShort*)addr;
2063   UChar*    addrC = (UChar*)addr;
2064   UShort    fp_tags;
2065   UInt      summary_tags;
2066   Int       r, stno;
2067   UShort    *srcS, *dstS;
2068
2069   do_get_x87( gst, &tmp );
2070
2071   /* Now build the proper fxsave x87 image from the fsave x87 image
2072      we just made. */
2073
2074   addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
2075   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FCW: fpu status word */
2076
2077   /* set addrS[2] in an endian-independent way */
2078   summary_tags = 0;
2079   fp_tags = tmp.env[FP_ENV_TAG];
2080   for (r = 0; r < 8; r++) {
2081      if ( ((fp_tags >> (2*r)) & 3) != 3 )
2082         summary_tags |= (1 << r);
2083   }
2084   addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
2085   addrC[5]  = 0; /* pad */
2086
2087   /* FOP: faulting fpu opcode.  From experimentation, the real CPU
2088      does not write this field. (?!) */
2089   addrS[3]  = 0; /* BOGUS */
2090
2091   /* RIP (Last x87 instruction pointer).  From experimentation, the
2092      real CPU does not write this field. (?!) */
2093   addrS[4]  = 0; /* BOGUS */
2094   addrS[5]  = 0; /* BOGUS */
2095   addrS[6]  = 0; /* BOGUS */
2096   addrS[7]  = 0; /* BOGUS */
2097
2098   /* RDP (Last x87 data pointer).  From experimentation, the real CPU
2099      does not write this field. (?!) */
2100   addrS[8]  = 0; /* BOGUS */
2101   addrS[9]  = 0; /* BOGUS */
2102   addrS[10] = 0; /* BOGUS */
2103   addrS[11] = 0; /* BOGUS */
2104
2105   /* addrS[13,12] are MXCSR -- not written */
2106   /* addrS[15,14] are MXCSR_MASK -- not written */
2107
2108   /* Copy in the FP registers, in ST order. */
2109   for (stno = 0; stno < 8; stno++) {
2110      srcS = (UShort*)(&tmp.reg[10*stno]);
2111      dstS = (UShort*)(&addrS[16 + 8*stno]);
2112      dstS[0] = srcS[0];
2113      dstS[1] = srcS[1];
2114      dstS[2] = srcS[2];
2115      dstS[3] = srcS[3];
2116      dstS[4] = srcS[4];
2117      dstS[5] = 0;
2118      dstS[6] = 0;
2119      dstS[7] = 0;
2120   }
2121}
2122
2123
2124/* CALLED FROM GENERATED CODE */
2125/* DIRTY HELPER (reads guest state, writes guest mem) */
2126/* XSAVE component 1 is the SSE state. */
2127void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2128        ( VexGuestAMD64State* gst, HWord addr )
2129{
2130   UShort* addrS = (UShort*)addr;
2131   UInt    mxcsr;
2132
2133   /* The only non-register parts of the SSE state are MXCSR and
2134      MXCSR_MASK. */
2135   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2136
2137   addrS[12] = toUShort(mxcsr);  /* MXCSR */
2138   addrS[13] = toUShort(mxcsr >> 16);
2139
2140   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2141   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2142}
2143
2144
2145/* VISIBLE TO LIBVEX CLIENT */
2146/* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2147   the result at the given address which represents a buffer of at
2148   least 416 bytes.
2149
2150   This function is not called from generated code.  FXSAVE is dealt
2151   with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2152   functions above plus some in-line IR.  This function is merely a
2153   convenience function for VEX's users.
2154*/
2155void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2156                                /*OUT*/HWord fp_state )
2157{
2158   /* Do the x87 part */
2159   amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2160
2161   /* And now the SSE part, except for the registers themselves. */
2162   amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2163
2164   /* That's the first 160 bytes of the image done. */
2165   /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
2166      big-endian, these need to be byte-swapped. */
2167   U128 *xmm = (U128 *)(fp_state + 160);
2168   vassert(host_is_little_endian());
2169
2170#  define COPY_U128(_dst,_src)                       \
2171      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
2172           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
2173      while (0)
2174
2175   COPY_U128( xmm[0],  gst->guest_YMM0 );
2176   COPY_U128( xmm[1],  gst->guest_YMM1 );
2177   COPY_U128( xmm[2],  gst->guest_YMM2 );
2178   COPY_U128( xmm[3],  gst->guest_YMM3 );
2179   COPY_U128( xmm[4],  gst->guest_YMM4 );
2180   COPY_U128( xmm[5],  gst->guest_YMM5 );
2181   COPY_U128( xmm[6],  gst->guest_YMM6 );
2182   COPY_U128( xmm[7],  gst->guest_YMM7 );
2183   COPY_U128( xmm[8],  gst->guest_YMM8 );
2184   COPY_U128( xmm[9],  gst->guest_YMM9 );
2185   COPY_U128( xmm[10], gst->guest_YMM10 );
2186   COPY_U128( xmm[11], gst->guest_YMM11 );
2187   COPY_U128( xmm[12], gst->guest_YMM12 );
2188   COPY_U128( xmm[13], gst->guest_YMM13 );
2189   COPY_U128( xmm[14], gst->guest_YMM14 );
2190   COPY_U128( xmm[15], gst->guest_YMM15 );
2191#  undef COPY_U128
2192}
2193
2194
2195/*---------------------------------------------------------------*/
2196/*--- Supporting functions for XRSTOR/FXRSTOR.                ---*/
2197/*---------------------------------------------------------------*/
2198
2199/* CALLED FROM GENERATED CODE */
2200/* DIRTY HELPER (writes guest state, reads guest mem) */
2201VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2202             ( VexGuestAMD64State* gst, HWord addr )
2203{
2204   Fpu_State tmp;
2205   UShort*   addrS   = (UShort*)addr;
2206   UChar*    addrC   = (UChar*)addr;
2207   UShort    fp_tags;
2208   Int       r, stno, i;
2209
2210   /* Copy the x87 registers out of the image, into a temporary
2211      Fpu_State struct. */
2212   for (i = 0; i < 14; i++) tmp.env[i] = 0;
2213   for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2214   /* fill in tmp.reg[0..7] */
2215   for (stno = 0; stno < 8; stno++) {
2216      UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2217      UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2218      dstS[0] = srcS[0];
2219      dstS[1] = srcS[1];
2220      dstS[2] = srcS[2];
2221      dstS[3] = srcS[3];
2222      dstS[4] = srcS[4];
2223   }
2224   /* fill in tmp.env[0..13] */
2225   tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
2226   tmp.env[FP_ENV_STAT] = addrS[1]; /* FCW: fpu status word */
2227
2228   fp_tags = 0;
2229   for (r = 0; r < 8; r++) {
2230      if (addrC[4] & (1<<r))
2231         fp_tags |= (0 << (2*r)); /* EMPTY */
2232      else
2233         fp_tags |= (3 << (2*r)); /* VALID -- not really precise enough. */
2234   }
2235   tmp.env[FP_ENV_TAG] = fp_tags;
2236
2237   /* Now write 'tmp' into the guest state. */
2238   VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2239
2240   return warnX87;
2241}
2242
2243
2244/* CALLED FROM GENERATED CODE */
2245/* DIRTY HELPER (writes guest state, reads guest mem) */
2246VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2247             ( VexGuestAMD64State* gst, HWord addr )
2248{
2249   UShort* addrS = (UShort*)addr;
2250   UInt    w32   = (((UInt)addrS[12]) & 0xFFFF)
2251                   | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2252   ULong   w64   = amd64g_check_ldmxcsr( (ULong)w32 );
2253
2254   VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2255
2256   gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2257   return warnXMM;
2258}
2259
2260
2261/* VISIBLE TO LIBVEX CLIENT */
2262/* Do FXRSTOR from the supplied address and store read values to the given
2263   VexGuestAMD64State structure.
2264
2265   This function is not called from generated code.  FXRSTOR is dealt
2266   with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2267   functions above plus some in-line IR.  This function is merely a
2268   convenience function for VEX's users.
2269*/
2270VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2271                                      /*MOD*/VexGuestAMD64State* gst )
2272{
2273   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
2274      to be byte-swapped. */
2275   U128 *xmm = (U128 *)(fp_state + 160);
2276
2277   vassert(host_is_little_endian());
2278
2279#  define COPY_U128(_dst,_src)                       \
2280      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
2281           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
2282      while (0)
2283
2284   COPY_U128( gst->guest_YMM0, xmm[0] );
2285   COPY_U128( gst->guest_YMM1, xmm[1] );
2286   COPY_U128( gst->guest_YMM2, xmm[2] );
2287   COPY_U128( gst->guest_YMM3, xmm[3] );
2288   COPY_U128( gst->guest_YMM4, xmm[4] );
2289   COPY_U128( gst->guest_YMM5, xmm[5] );
2290   COPY_U128( gst->guest_YMM6, xmm[6] );
2291   COPY_U128( gst->guest_YMM7, xmm[7] );
2292   COPY_U128( gst->guest_YMM8, xmm[8] );
2293   COPY_U128( gst->guest_YMM9, xmm[9] );
2294   COPY_U128( gst->guest_YMM10, xmm[10] );
2295   COPY_U128( gst->guest_YMM11, xmm[11] );
2296   COPY_U128( gst->guest_YMM12, xmm[12] );
2297   COPY_U128( gst->guest_YMM13, xmm[13] );
2298   COPY_U128( gst->guest_YMM14, xmm[14] );
2299   COPY_U128( gst->guest_YMM15, xmm[15] );
2300
2301#  undef COPY_U128
2302
2303   VexEmNote warnXMM
2304      = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2305   VexEmNote warnX87
2306      = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2307
2308   /* Prefer an X87 emwarn over an XMM one, if both exist. */
2309   if (warnX87 != EmNote_NONE)
2310      return warnX87;
2311   else
2312      return warnXMM;
2313}
2314
2315
2316/*---------------------------------------------------------------*/
2317/*--- Supporting functions for FSAVE/FRSTOR                   ---*/
2318/*---------------------------------------------------------------*/
2319
2320/* DIRTY HELPER (writes guest state) */
2321/* Initialise the x87 FPU state as per 'finit'. */
2322void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2323{
2324   Int i;
2325   gst->guest_FTOP = 0;
2326   for (i = 0; i < 8; i++) {
2327      gst->guest_FPTAG[i] = 0; /* empty */
2328      gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2329   }
2330   gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2331   gst->guest_FC3210  = 0;
2332}
2333
2334
2335/* CALLED FROM GENERATED CODE */
2336/* DIRTY HELPER (reads guest memory) */
2337ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2338{
2339   ULong f64;
2340   convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2341   return f64;
2342}
2343
2344/* CALLED FROM GENERATED CODE */
2345/* DIRTY HELPER (writes guest memory) */
2346void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2347{
2348   convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2349}
2350
2351
2352/* CALLED FROM GENERATED CODE */
2353/* CLEAN HELPER */
2354/* mxcsr[15:0] contains a SSE native format MXCSR value.
2355   Extract from it the required SSEROUND value and any resulting
2356   emulation warning, and return (warn << 32) | sseround value.
2357*/
2358ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2359{
2360   /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
2361   /* NOTE, encoded exactly as per enum IRRoundingMode. */
2362   ULong rmode = (mxcsr >> 13) & 3;
2363
2364   /* Detect any required emulation warnings. */
2365   VexEmNote ew = EmNote_NONE;
2366
2367   if ((mxcsr & 0x1F80) != 0x1F80) {
2368      /* unmasked exceptions! */
2369      ew = EmWarn_X86_sseExns;
2370   }
2371   else
2372   if (mxcsr & (1<<15)) {
2373      /* FZ is set */
2374      ew = EmWarn_X86_fz;
2375   }
2376   else
2377   if (mxcsr & (1<<6)) {
2378      /* DAZ is set */
2379      ew = EmWarn_X86_daz;
2380   }
2381
2382   return (((ULong)ew) << 32) | ((ULong)rmode);
2383}
2384
2385
2386/* CALLED FROM GENERATED CODE */
2387/* CLEAN HELPER */
2388/* Given sseround as an IRRoundingMode value, create a suitable SSE
2389   native format MXCSR value. */
2390ULong amd64g_create_mxcsr ( ULong sseround )
2391{
2392   sseround &= 3;
2393   return 0x1F80 | (sseround << 13);
2394}
2395
2396
2397/* CLEAN HELPER */
2398/* fpucw[15:0] contains a x87 native format FPU control word.
2399   Extract from it the required FPROUND value and any resulting
2400   emulation warning, and return (warn << 32) | fpround value.
2401*/
2402ULong amd64g_check_fldcw ( ULong fpucw )
2403{
2404   /* Decide on a rounding mode.  fpucw[11:10] holds it. */
2405   /* NOTE, encoded exactly as per enum IRRoundingMode. */
2406   ULong rmode = (fpucw >> 10) & 3;
2407
2408   /* Detect any required emulation warnings. */
2409   VexEmNote ew = EmNote_NONE;
2410
2411   if ((fpucw & 0x3F) != 0x3F) {
2412      /* unmasked exceptions! */
2413      ew = EmWarn_X86_x87exns;
2414   }
2415   else
2416   if (((fpucw >> 8) & 3) != 3) {
2417      /* unsupported precision */
2418      ew = EmWarn_X86_x87precision;
2419   }
2420
2421   return (((ULong)ew) << 32) | ((ULong)rmode);
2422}
2423
2424
2425/* CLEAN HELPER */
2426/* Given fpround as an IRRoundingMode value, create a suitable x87
2427   native format FPU control word. */
2428ULong amd64g_create_fpucw ( ULong fpround )
2429{
2430   fpround &= 3;
2431   return 0x037F | (fpround << 10);
2432}
2433
2434
2435/* This is used to implement 'fldenv'.
2436   Reads 28 bytes at x87_state[0 .. 27]. */
2437/* CALLED FROM GENERATED CODE */
2438/* DIRTY HELPER */
2439VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2440                                      /*IN*/HWord x87_state)
2441{
2442   return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2443}
2444
2445
2446/* CALLED FROM GENERATED CODE */
2447/* DIRTY HELPER */
2448/* Create an x87 FPU env from the guest state, as close as we can
2449   approximate it.  Writes 28 bytes at x87_state[0..27]. */
2450void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2451                                 /*OUT*/HWord x87_state )
2452{
2453   Int        i, stno, preg;
2454   UInt       tagw;
2455   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2456   Fpu_State* x87     = (Fpu_State*)x87_state;
2457   UInt       ftop    = vex_state->guest_FTOP;
2458   ULong      c3210   = vex_state->guest_FC3210;
2459
2460   for (i = 0; i < 14; i++)
2461      x87->env[i] = 0;
2462
2463   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2464   x87->env[FP_ENV_STAT]
2465      = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2466   x87->env[FP_ENV_CTRL]
2467      = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2468
2469   /* Compute the x87 tag word. */
2470   tagw = 0;
2471   for (stno = 0; stno < 8; stno++) {
2472      preg = (stno + ftop) & 7;
2473      if (vexTags[preg] == 0) {
2474         /* register is empty */
2475         tagw |= (3 << (2*preg));
2476      } else {
2477         /* register is full. */
2478         tagw |= (0 << (2*preg));
2479      }
2480   }
2481   x87->env[FP_ENV_TAG] = toUShort(tagw);
2482
2483   /* We don't dump the x87 registers, tho. */
2484}
2485
2486
2487/* This is used to implement 'fnsave'.
2488   Writes 108 bytes at x87_state[0 .. 107]. */
2489/* CALLED FROM GENERATED CODE */
2490/* DIRTY HELPER */
2491void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2492                                 /*OUT*/HWord x87_state)
2493{
2494   do_get_x87( vex_state, (Fpu_State*)x87_state );
2495}
2496
2497
2498/* This is used to implement 'fnsaves'.
2499   Writes 94 bytes at x87_state[0 .. 93]. */
2500/* CALLED FROM GENERATED CODE */
2501/* DIRTY HELPER */
2502void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2503                                  /*OUT*/HWord x87_state)
2504{
2505   Int           i, stno, preg;
2506   UInt          tagw;
2507   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2508   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2509   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2510   UInt          ftop    = vex_state->guest_FTOP;
2511   UInt          c3210   = vex_state->guest_FC3210;
2512
2513   for (i = 0; i < 7; i++)
2514      x87->env[i] = 0;
2515
2516   x87->env[FPS_ENV_STAT]
2517      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2518   x87->env[FPS_ENV_CTRL]
2519      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2520
2521   /* Dump the register stack in ST order. */
2522   tagw = 0;
2523   for (stno = 0; stno < 8; stno++) {
2524      preg = (stno + ftop) & 7;
2525      if (vexTags[preg] == 0) {
2526         /* register is empty */
2527         tagw |= (3 << (2*preg));
2528         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2529                                 &x87->reg[10*stno] );
2530      } else {
2531         /* register is full. */
2532         tagw |= (0 << (2*preg));
2533         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2534                                 &x87->reg[10*stno] );
2535      }
2536   }
2537   x87->env[FPS_ENV_TAG] = toUShort(tagw);
2538}
2539
2540
2541/* This is used to implement 'frstor'.
2542   Reads 108 bytes at x87_state[0 .. 107]. */
2543/* CALLED FROM GENERATED CODE */
2544/* DIRTY HELPER */
2545VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2546                                      /*IN*/HWord x87_state)
2547{
2548   return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2549}
2550
2551
2552/* This is used to implement 'frstors'.
2553   Reads 94 bytes at x87_state[0 .. 93]. */
2554/* CALLED FROM GENERATED CODE */
2555/* DIRTY HELPER */
2556VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2557                                       /*IN*/HWord x87_state)
2558{
2559   Int           stno, preg;
2560   UInt          tag;
2561   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2562   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2563   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2564   UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2565   UInt          tagw    = x87->env[FPS_ENV_TAG];
2566   UInt          fpucw   = x87->env[FPS_ENV_CTRL];
2567   UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
2568   VexEmNote     ew;
2569   UInt          fpround;
2570   ULong         pair;
2571
2572   /* Copy registers and tags */
2573   for (stno = 0; stno < 8; stno++) {
2574      preg = (stno + ftop) & 7;
2575      tag = (tagw >> (2*preg)) & 3;
2576      if (tag == 3) {
2577         /* register is empty */
2578         /* hmm, if it's empty, does it still get written?  Probably
2579            safer to say it does.  If we don't, memcheck could get out
2580            of sync, in that it thinks all FP registers are defined by
2581            this helper, but in reality some have not been updated. */
2582         vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2583         vexTags[preg] = 0;
2584      } else {
2585         /* register is non-empty */
2586         convert_f80le_to_f64le( &x87->reg[10*stno],
2587                                 (UChar*)&vexRegs[preg] );
2588         vexTags[preg] = 1;
2589      }
2590   }
2591
2592   /* stack pointer */
2593   vex_state->guest_FTOP = ftop;
2594
2595   /* status word */
2596   vex_state->guest_FC3210 = c3210;
2597
2598   /* handle the control word, setting FPROUND and detecting any
2599      emulation warnings. */
2600   pair    = amd64g_check_fldcw ( (ULong)fpucw );
2601   fpround = (UInt)pair & 0xFFFFFFFFULL;
2602   ew      = (VexEmNote)(pair >> 32);
2603
2604   vex_state->guest_FPROUND = fpround & 3;
2605
2606   /* emulation warnings --> caller */
2607   return ew;
2608}
2609
2610
2611/*---------------------------------------------------------------*/
2612/*--- CPUID helpers.                                          ---*/
2613/*---------------------------------------------------------------*/
2614
2615/* Claim to be the following CPU, which is probably representative of
2616   the lowliest (earliest) amd64 offerings.  It can do neither sse3
2617   nor cx16.
2618
2619   vendor_id       : AuthenticAMD
2620   cpu family      : 15
2621   model           : 5
2622   model name      : AMD Opteron (tm) Processor 848
2623   stepping        : 10
2624   cpu MHz         : 1797.682
2625   cache size      : 1024 KB
2626   fpu             : yes
2627   fpu_exception   : yes
2628   cpuid level     : 1
2629   wp              : yes
2630   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2631                     mtrr pge mca cmov pat pse36 clflush mmx fxsr
2632                     sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2633   bogomips        : 3600.62
2634   TLB size        : 1088 4K pages
2635   clflush size    : 64
2636   cache_alignment : 64
2637   address sizes   : 40 bits physical, 48 bits virtual
2638   power management: ts fid vid ttp
2639
2640   2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2641   we don't support them.  See #291568.  3dnow is 80000001.EDX.31
2642   and 3dnowext is 80000001.EDX.30.
2643*/
2644void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2645{
2646#  define SET_ABCD(_a,_b,_c,_d)                \
2647      do { st->guest_RAX = (ULong)(_a);        \
2648           st->guest_RBX = (ULong)(_b);        \
2649           st->guest_RCX = (ULong)(_c);        \
2650           st->guest_RDX = (ULong)(_d);        \
2651      } while (0)
2652
2653   switch (0xFFFFFFFF & st->guest_RAX) {
2654      case 0x00000000:
2655         SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2656         break;
2657      case 0x00000001:
2658         SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2659         break;
2660      case 0x80000000:
2661         SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2662         break;
2663      case 0x80000001:
2664         /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
2665            the original it-is-supported value that the h/w provides.
2666            See #291568. */
2667         SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2668                                                      0x21d3fbff);
2669         break;
2670      case 0x80000002:
2671         SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2672         break;
2673      case 0x80000003:
2674         SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2675         break;
2676      case 0x80000004:
2677         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2678         break;
2679      case 0x80000005:
2680         SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2681         break;
2682      case 0x80000006:
2683         SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2684         break;
2685      case 0x80000007:
2686         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2687         break;
2688      case 0x80000008:
2689         SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2690         break;
2691      default:
2692         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2693         break;
2694   }
2695#  undef SET_ABCD
2696}
2697
2698
2699/* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2700   capable.
2701
2702   vendor_id       : GenuineIntel
2703   cpu family      : 6
2704   model           : 15
2705   model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2706   stepping        : 6
2707   cpu MHz         : 2394.000
2708   cache size      : 4096 KB
2709   physical id     : 0
2710   siblings        : 2
2711   core id         : 0
2712   cpu cores       : 2
2713   fpu             : yes
2714   fpu_exception   : yes
2715   cpuid level     : 10
2716   wp              : yes
2717   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2718                     mtrr pge mca cmov pat pse36 clflush dts acpi
2719                     mmx fxsr sse sse2 ss ht tm syscall nx lm
2720                     constant_tsc pni monitor ds_cpl vmx est tm2
2721                     cx16 xtpr lahf_lm
2722   bogomips        : 4798.78
2723   clflush size    : 64
2724   cache_alignment : 64
2725   address sizes   : 36 bits physical, 48 bits virtual
2726   power management:
2727*/
2728void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2729{
2730#  define SET_ABCD(_a,_b,_c,_d)                \
2731      do { st->guest_RAX = (ULong)(_a);        \
2732           st->guest_RBX = (ULong)(_b);        \
2733           st->guest_RCX = (ULong)(_c);        \
2734           st->guest_RDX = (ULong)(_d);        \
2735      } while (0)
2736
2737   switch (0xFFFFFFFF & st->guest_RAX) {
2738      case 0x00000000:
2739         SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2740         break;
2741      case 0x00000001:
2742         SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2743         break;
2744      case 0x00000002:
2745         SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2746         break;
2747      case 0x00000003:
2748         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2749         break;
2750      case 0x00000004: {
2751         switch (0xFFFFFFFF & st->guest_RCX) {
2752            case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2753                                      0x0000003f, 0x00000001); break;
2754            case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2755                                      0x0000003f, 0x00000001); break;
2756            case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2757                                      0x00000fff, 0x00000001); break;
2758            default:         SET_ABCD(0x00000000, 0x00000000,
2759                                      0x00000000, 0x00000000); break;
2760         }
2761         break;
2762      }
2763      case 0x00000005:
2764         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2765         break;
2766      case 0x00000006:
2767         SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2768         break;
2769      case 0x00000007:
2770         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2771         break;
2772      case 0x00000008:
2773         SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2774         break;
2775      case 0x00000009:
2776         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2777         break;
2778      case 0x0000000a:
2779      unhandled_eax_value:
2780         SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2781         break;
2782      case 0x80000000:
2783         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2784         break;
2785      case 0x80000001:
2786         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2787         break;
2788      case 0x80000002:
2789         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2790         break;
2791      case 0x80000003:
2792         SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2793         break;
2794      case 0x80000004:
2795         SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2796         break;
2797      case 0x80000005:
2798         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2799         break;
2800      case 0x80000006:
2801         SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2802         break;
2803      case 0x80000007:
2804         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2805         break;
2806      case 0x80000008:
2807         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2808         break;
2809      default:
2810         goto unhandled_eax_value;
2811   }
2812#  undef SET_ABCD
2813}
2814
2815
2816/* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2817   capable.
2818
2819   vendor_id       : GenuineIntel
2820   cpu family      : 6
2821   model           : 37
2822   model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
2823   stepping        : 2
2824   cpu MHz         : 3334.000
2825   cache size      : 4096 KB
2826   physical id     : 0
2827   siblings        : 4
2828   core id         : 0
2829   cpu cores       : 2
2830   apicid          : 0
2831   initial apicid  : 0
2832   fpu             : yes
2833   fpu_exception   : yes
2834   cpuid level     : 11
2835   wp              : yes
2836   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2837                     mtrr pge mca cmov pat pse36 clflush dts acpi
2838                     mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2839                     lm constant_tsc arch_perfmon pebs bts rep_good
2840                     xtopology nonstop_tsc aperfmperf pni pclmulqdq
2841                     dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2842                     xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2843                     arat tpr_shadow vnmi flexpriority ept vpid
2844   bogomips        : 6957.57
2845   clflush size    : 64
2846   cache_alignment : 64
2847   address sizes   : 36 bits physical, 48 bits virtual
2848   power management:
2849*/
2850void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2851{
2852#  define SET_ABCD(_a,_b,_c,_d)                \
2853      do { st->guest_RAX = (ULong)(_a);        \
2854           st->guest_RBX = (ULong)(_b);        \
2855           st->guest_RCX = (ULong)(_c);        \
2856           st->guest_RDX = (ULong)(_d);        \
2857      } while (0)
2858
2859   UInt old_eax = (UInt)st->guest_RAX;
2860   UInt old_ecx = (UInt)st->guest_RCX;
2861
2862   switch (old_eax) {
2863      case 0x00000000:
2864         SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2865         break;
2866      case 0x00000001:
2867         SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
2868         break;
2869      case 0x00000002:
2870         SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2871         break;
2872      case 0x00000003:
2873         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2874         break;
2875      case 0x00000004:
2876         switch (old_ecx) {
2877            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2878                                      0x0000003f, 0x00000000); break;
2879            case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2880                                      0x0000007f, 0x00000000); break;
2881            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2882                                      0x000001ff, 0x00000000); break;
2883            case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2884                                      0x00000fff, 0x00000002); break;
2885            default:         SET_ABCD(0x00000000, 0x00000000,
2886                                      0x00000000, 0x00000000); break;
2887         }
2888         break;
2889      case 0x00000005:
2890         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2891         break;
2892      case 0x00000006:
2893         SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2894         break;
2895      case 0x00000007:
2896         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2897         break;
2898      case 0x00000008:
2899         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2900         break;
2901      case 0x00000009:
2902         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2903         break;
2904      case 0x0000000a:
2905         SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2906         break;
2907      case 0x0000000b:
2908         switch (old_ecx) {
2909            case 0x00000000:
2910               SET_ABCD(0x00000001, 0x00000002,
2911                        0x00000100, 0x00000000); break;
2912            case 0x00000001:
2913               SET_ABCD(0x00000004, 0x00000004,
2914                        0x00000201, 0x00000000); break;
2915            default:
2916               SET_ABCD(0x00000000, 0x00000000,
2917                        old_ecx,    0x00000000); break;
2918         }
2919         break;
2920      case 0x0000000c:
2921         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2922         break;
2923      case 0x0000000d:
2924         switch (old_ecx) {
2925            case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
2926                                      0x00000100, 0x00000000); break;
2927            case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
2928                                      0x00000201, 0x00000000); break;
2929            default:         SET_ABCD(0x00000000, 0x00000000,
2930                                      old_ecx,    0x00000000); break;
2931         }
2932         break;
2933      case 0x80000000:
2934         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2935         break;
2936      case 0x80000001:
2937         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2938         break;
2939      case 0x80000002:
2940         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2941         break;
2942      case 0x80000003:
2943         SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
2944         break;
2945      case 0x80000004:
2946         SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
2947         break;
2948      case 0x80000005:
2949         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2950         break;
2951      case 0x80000006:
2952         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2953         break;
2954      case 0x80000007:
2955         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2956         break;
2957      case 0x80000008:
2958         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2959         break;
2960      default:
2961         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2962         break;
2963   }
2964#  undef SET_ABCD
2965}
2966
2967
2968/* Claim to be the following CPU (4 x ...), which is AVX and cx16
2969   capable.  Plus (kludge!) it "supports" HTM.
2970
2971   Also with the following change: claim that XSaveOpt is not
2972   available, by cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1
2973   on the real CPU.  Consequently, programs that correctly observe
2974   these CPUID values should only try to use 3 of the 8 XSave-family
2975   instructions: XGETBV, XSAVE and XRSTOR.  In particular this avoids
2976   having to implement the compacted or optimised save/restore
2977   variants.
2978
2979   vendor_id       : GenuineIntel
2980   cpu family      : 6
2981   model           : 42
2982   model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
2983   stepping        : 7
2984   cpu MHz         : 1600.000
2985   cache size      : 6144 KB
2986   physical id     : 0
2987   siblings        : 4
2988   core id         : 3
2989   cpu cores       : 4
2990   apicid          : 6
2991   initial apicid  : 6
2992   fpu             : yes
2993   fpu_exception   : yes
2994   cpuid level     : 13
2995   wp              : yes
2996   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2997                     mtrr pge mca cmov pat pse36 clflush dts acpi
2998                     mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2999                     lm constant_tsc arch_perfmon pebs bts rep_good
3000                     nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3001                     dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3002                     xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3003                     lahf_lm ida arat epb xsaveopt pln pts dts
3004                     tpr_shadow vnmi flexpriority ept vpid
3005
3006   bogomips        : 5768.94
3007   clflush size    : 64
3008   cache_alignment : 64
3009   address sizes   : 36 bits physical, 48 bits virtual
3010   power management:
3011*/
3012void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
3013{
3014#  define SET_ABCD(_a,_b,_c,_d)                \
3015      do { st->guest_RAX = (ULong)(_a);        \
3016           st->guest_RBX = (ULong)(_b);        \
3017           st->guest_RCX = (ULong)(_c);        \
3018           st->guest_RDX = (ULong)(_d);        \
3019      } while (0)
3020
3021   UInt old_eax = (UInt)st->guest_RAX;
3022   UInt old_ecx = (UInt)st->guest_RCX;
3023
3024   switch (old_eax) {
3025      case 0x00000000:
3026         SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3027         break;
3028      case 0x00000001:
3029         SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
3030         break;
3031      case 0x00000002:
3032         SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3033         break;
3034      case 0x00000003:
3035         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3036         break;
3037      case 0x00000004:
3038         switch (old_ecx) {
3039            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3040                                      0x0000003f, 0x00000000); break;
3041            case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3042                                      0x0000003f, 0x00000000); break;
3043            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3044                                      0x000001ff, 0x00000000); break;
3045            case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3046                                      0x00001fff, 0x00000006); break;
3047            default:         SET_ABCD(0x00000000, 0x00000000,
3048                                      0x00000000, 0x00000000); break;
3049         }
3050         break;
3051      case 0x00000005:
3052         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3053         break;
3054      case 0x00000006:
3055         SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3056         break;
3057      case 0x00000007:
3058         SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
3059         break;
3060      case 0x00000008:
3061         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3062         break;
3063      case 0x00000009:
3064         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3065         break;
3066      case 0x0000000a:
3067         SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3068         break;
3069      case 0x0000000b:
3070         switch (old_ecx) {
3071            case 0x00000000:
3072               SET_ABCD(0x00000001, 0x00000001,
3073                        0x00000100, 0x00000000); break;
3074            case 0x00000001:
3075               SET_ABCD(0x00000004, 0x00000004,
3076                        0x00000201, 0x00000000); break;
3077            default:
3078               SET_ABCD(0x00000000, 0x00000000,
3079                        old_ecx,    0x00000000); break;
3080         }
3081         break;
3082      case 0x0000000c:
3083         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3084         break;
3085      case 0x0000000d:
3086         switch (old_ecx) {
3087            case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3088                                      0x00000340, 0x00000000); break;
3089            case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3090                                      0x00000000, 0x00000000); break;
3091            case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3092                                      0x00000000, 0x00000000); break;
3093            default:         SET_ABCD(0x00000000, 0x00000000,
3094                                      0x00000000, 0x00000000); break;
3095         }
3096         break;
3097      case 0x0000000e:
3098         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3099         break;
3100      case 0x0000000f:
3101         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3102         break;
3103      case 0x80000000:
3104         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3105         break;
3106      case 0x80000001:
3107         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3108         break;
3109      case 0x80000002:
3110         SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3111         break;
3112      case 0x80000003:
3113         SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3114         break;
3115      case 0x80000004:
3116         SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3117         break;
3118      case 0x80000005:
3119         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3120         break;
3121      case 0x80000006:
3122         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3123         break;
3124      case 0x80000007:
3125         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3126         break;
3127      case 0x80000008:
3128         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3129         break;
3130      default:
3131         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3132         break;
3133   }
3134#  undef SET_ABCD
3135}
3136
3137
3138/* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3139
3140   With the following change: claim that XSaveOpt is not available, by
3141   cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1 on the real
3142   CPU.  Consequently, programs that correctly observe these CPUID
3143   values should only try to use 3 of the 8 XSave-family instructions:
3144   XGETBV, XSAVE and XRSTOR.  In particular this avoids having to
3145   implement the compacted or optimised save/restore variants.
3146
3147   vendor_id       : GenuineIntel
3148   cpu family      : 6
3149   model           : 60
3150   model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3151   stepping        : 3
3152   microcode       : 0x1c
3153   cpu MHz         : 919.957
3154   cache size      : 8192 KB
3155   physical id     : 0
3156   siblings        : 4
3157   core id         : 3
3158   cpu cores       : 4
3159   apicid          : 6
3160   initial apicid  : 6
3161   fpu             : yes
3162   fpu_exception   : yes
3163   cpuid level     : 13
3164   wp              : yes
3165   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3166                     cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3167                     tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3168                     arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3169                     aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3170                     vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3171                     sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3172                     avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3173                     tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3174                     bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3175   bugs            :
3176   bogomips        : 5786.68
3177   clflush size    : 64
3178   cache_alignment : 64
3179   address sizes   : 39 bits physical, 48 bits virtual
3180   power management:
3181*/
3182void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
3183{
3184#  define SET_ABCD(_a,_b,_c,_d)                \
3185      do { st->guest_RAX = (ULong)(_a);        \
3186           st->guest_RBX = (ULong)(_b);        \
3187           st->guest_RCX = (ULong)(_c);        \
3188           st->guest_RDX = (ULong)(_d);        \
3189      } while (0)
3190
3191   UInt old_eax = (UInt)st->guest_RAX;
3192   UInt old_ecx = (UInt)st->guest_RCX;
3193
3194   switch (old_eax) {
3195      case 0x00000000:
3196         SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3197         break;
3198      case 0x00000001:
3199         /* Don't advertise RDRAND support, bit 30 in ECX.  */
3200         SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
3201         break;
3202      case 0x00000002:
3203         SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3204         break;
3205      case 0x00000003:
3206         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3207         break;
3208      case 0x00000004:
3209         switch (old_ecx) {
3210            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3211                                      0x0000003f, 0x00000000); break;
3212            case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3213                                      0x0000003f, 0x00000000); break;
3214            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3215                                      0x000001ff, 0x00000000); break;
3216            case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3217                                      0x00001fff, 0x00000006); break;
3218            default:         SET_ABCD(0x00000000, 0x00000000,
3219                                      0x00000000, 0x00000000); break;
3220         }
3221         break;
3222      case 0x00000005:
3223         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3224         break;
3225      case 0x00000006:
3226         SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3227         break;
3228      case 0x00000007:
3229         switch (old_ecx) {
3230            case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
3231                                      0x00000000, 0x00000000); break;
3232            default:         SET_ABCD(0x00000000, 0x00000000,
3233                                      0x00000000, 0x00000000); break;
3234         }
3235         break;
3236      case 0x00000008:
3237         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3238         break;
3239      case 0x00000009:
3240         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3241         break;
3242      case 0x0000000a:
3243         SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3244         break;
3245      case 0x0000000b:
3246         switch (old_ecx) {
3247            case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3248                                      0x00000100, 0x00000002); break;
3249            case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3250                                      0x00000201, 0x00000002); break;
3251            default:         SET_ABCD(0x00000000, 0x00000000,
3252                                      old_ecx,    0x00000002); break;
3253         }
3254         break;
3255      case 0x0000000c:
3256         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3257         break;
3258      case 0x0000000d:
3259         switch (old_ecx) {
3260            case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3261                                      0x00000340, 0x00000000); break;
3262            case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3263                                      0x00000000, 0x00000000); break;
3264            case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3265                                      0x00000000, 0x00000000); break;
3266            default:         SET_ABCD(0x00000000, 0x00000000,
3267                                      0x00000000, 0x00000000); break;
3268         }
3269         break;
3270      case 0x80000000:
3271         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3272         break;
3273      case 0x80000001:
3274         SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3275         break;
3276      case 0x80000002:
3277         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3278         break;
3279      case 0x80000003:
3280         SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3281         break;
3282      case 0x80000004:
3283         SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3284         break;
3285      case 0x80000005:
3286         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3287         break;
3288      case 0x80000006:
3289         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3290         break;
3291      case 0x80000007:
3292         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3293         break;
3294      case 0x80000008:
3295         SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3296         break;
3297      default:
3298         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3299         break;
3300   }
3301#  undef SET_ABCD
3302}
3303
3304
3305/*---------------------------------------------------------------*/
3306/*--- Misc integer helpers, including rotates and crypto.     ---*/
3307/*---------------------------------------------------------------*/
3308
3309ULong amd64g_calculate_RCR ( ULong arg,
3310                             ULong rot_amt,
3311                             ULong rflags_in,
3312                             Long  szIN )
3313{
3314   Bool  wantRflags = toBool(szIN < 0);
3315   ULong sz         = wantRflags ? (-szIN) : szIN;
3316   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3317   ULong cf=0, of=0, tempcf;
3318
3319   switch (sz) {
3320      case 8:
3321         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3322         of        = ((arg >> 63) ^ cf) & 1;
3323         while (tempCOUNT > 0) {
3324            tempcf = arg & 1;
3325            arg    = (arg >> 1) | (cf << 63);
3326            cf     = tempcf;
3327            tempCOUNT--;
3328         }
3329         break;
3330      case 4:
3331         while (tempCOUNT >= 33) tempCOUNT -= 33;
3332         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3333         of        = ((arg >> 31) ^ cf) & 1;
3334         while (tempCOUNT > 0) {
3335            tempcf = arg & 1;
3336            arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3337            cf     = tempcf;
3338            tempCOUNT--;
3339         }
3340         break;
3341      case 2:
3342         while (tempCOUNT >= 17) tempCOUNT -= 17;
3343         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3344         of        = ((arg >> 15) ^ cf) & 1;
3345         while (tempCOUNT > 0) {
3346            tempcf = arg & 1;
3347            arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3348            cf     = tempcf;
3349            tempCOUNT--;
3350         }
3351         break;
3352      case 1:
3353         while (tempCOUNT >= 9) tempCOUNT -= 9;
3354         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3355         of        = ((arg >> 7) ^ cf) & 1;
3356         while (tempCOUNT > 0) {
3357            tempcf = arg & 1;
3358            arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
3359            cf     = tempcf;
3360            tempCOUNT--;
3361         }
3362         break;
3363      default:
3364         vpanic("calculate_RCR(amd64g): invalid size");
3365   }
3366
3367   cf &= 1;
3368   of &= 1;
3369   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3370   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3371
3372   /* caller can ask to have back either the resulting flags or
3373      resulting value, but not both */
3374   return wantRflags ? rflags_in : arg;
3375}
3376
3377ULong amd64g_calculate_RCL ( ULong arg,
3378                             ULong rot_amt,
3379                             ULong rflags_in,
3380                             Long  szIN )
3381{
3382   Bool  wantRflags = toBool(szIN < 0);
3383   ULong sz         = wantRflags ? (-szIN) : szIN;
3384   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3385   ULong cf=0, of=0, tempcf;
3386
3387   switch (sz) {
3388      case 8:
3389         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3390         while (tempCOUNT > 0) {
3391            tempcf = (arg >> 63) & 1;
3392            arg    = (arg << 1) | (cf & 1);
3393            cf     = tempcf;
3394            tempCOUNT--;
3395         }
3396         of = ((arg >> 63) ^ cf) & 1;
3397         break;
3398      case 4:
3399         while (tempCOUNT >= 33) tempCOUNT -= 33;
3400         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3401         while (tempCOUNT > 0) {
3402            tempcf = (arg >> 31) & 1;
3403            arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
3404            cf     = tempcf;
3405            tempCOUNT--;
3406         }
3407         of = ((arg >> 31) ^ cf) & 1;
3408         break;
3409      case 2:
3410         while (tempCOUNT >= 17) tempCOUNT -= 17;
3411         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3412         while (tempCOUNT > 0) {
3413            tempcf = (arg >> 15) & 1;
3414            arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
3415            cf     = tempcf;
3416            tempCOUNT--;
3417         }
3418         of = ((arg >> 15) ^ cf) & 1;
3419         break;
3420      case 1:
3421         while (tempCOUNT >= 9) tempCOUNT -= 9;
3422         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3423         while (tempCOUNT > 0) {
3424            tempcf = (arg >> 7) & 1;
3425            arg    = 0xFFULL & ((arg << 1) | (cf & 1));
3426            cf     = tempcf;
3427            tempCOUNT--;
3428         }
3429         of = ((arg >> 7) ^ cf) & 1;
3430         break;
3431      default:
3432         vpanic("calculate_RCL(amd64g): invalid size");
3433   }
3434
3435   cf &= 1;
3436   of &= 1;
3437   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3438   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3439
3440   return wantRflags ? rflags_in : arg;
3441}
3442
3443/* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3444 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3445 */
3446ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
3447{
3448    ULong hi, lo, tmp, A[16];
3449
3450   A[0] = 0;            A[1] = a;
3451   A[2] = A[1] << 1;    A[3] = A[2] ^ a;
3452   A[4] = A[2] << 1;    A[5] = A[4] ^ a;
3453   A[6] = A[3] << 1;    A[7] = A[6] ^ a;
3454   A[8] = A[4] << 1;    A[9] = A[8] ^ a;
3455   A[10] = A[5] << 1;   A[11] = A[10] ^ a;
3456   A[12] = A[6] << 1;   A[13] = A[12] ^ a;
3457   A[14] = A[7] << 1;   A[15] = A[14] ^ a;
3458
3459   lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
3460   hi = lo >> 56;
3461   lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
3462   hi = (hi << 8) | (lo >> 56);
3463   lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
3464   hi = (hi << 8) | (lo >> 56);
3465   lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
3466   hi = (hi << 8) | (lo >> 56);
3467   lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
3468   hi = (hi << 8) | (lo >> 56);
3469   lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
3470   hi = (hi << 8) | (lo >> 56);
3471   lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
3472   hi = (hi << 8) | (lo >> 56);
3473   lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
3474
3475   ULong m0 = -1;
3476   m0 /= 255;
3477   tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
3478   tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
3479   tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
3480   tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
3481   tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
3482   tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
3483   tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
3484
3485   return which ? hi : lo;
3486}
3487
3488
3489/* CALLED FROM GENERATED CODE */
3490/* DIRTY HELPER (non-referentially-transparent) */
3491/* Horrible hack.  On non-amd64 platforms, return 1. */
3492ULong amd64g_dirtyhelper_RDTSC ( void )
3493{
3494#  if defined(__x86_64__)
3495   UInt  eax, edx;
3496   __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
3497   return (((ULong)edx) << 32) | ((ULong)eax);
3498#  else
3499   return 1ULL;
3500#  endif
3501}
3502
3503/* CALLED FROM GENERATED CODE */
3504/* DIRTY HELPER (non-referentially-transparent) */
3505/* Horrible hack.  On non-amd64 platforms, return 1. */
3506/* This uses a different calling convention from _RDTSC just above
3507   only because of the difficulty of returning 96 bits from a C
3508   function -- RDTSC returns 64 bits and so is simple by comparison,
3509   on amd64. */
3510void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
3511{
3512#  if defined(__x86_64__)
3513   UInt eax, ecx, edx;
3514   __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
3515   st->guest_RAX = (ULong)eax;
3516   st->guest_RCX = (ULong)ecx;
3517   st->guest_RDX = (ULong)edx;
3518#  else
3519   /* Do nothing. */
3520#  endif
3521}
3522
3523/* CALLED FROM GENERATED CODE */
3524/* DIRTY HELPER (non-referentially-transparent) */
3525/* Horrible hack.  On non-amd64 platforms, return 0. */
3526ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
3527{
3528#  if defined(__x86_64__)
3529   ULong r = 0;
3530   portno &= 0xFFFF;
3531   switch (sz) {
3532      case 4:
3533         __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3534                              : "=a" (r) : "Nd" (portno));
3535	 break;
3536      case 2:
3537         __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
3538                              : "=a" (r) : "Nd" (portno));
3539	 break;
3540      case 1:
3541         __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
3542                              : "=a" (r) : "Nd" (portno));
3543	 break;
3544      default:
3545         break; /* note: no 64-bit version of insn exists */
3546   }
3547   return r;
3548#  else
3549   return 0;
3550#  endif
3551}
3552
3553
3554/* CALLED FROM GENERATED CODE */
3555/* DIRTY HELPER (non-referentially-transparent) */
3556/* Horrible hack.  On non-amd64 platforms, do nothing. */
3557void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
3558{
3559#  if defined(__x86_64__)
3560   portno &= 0xFFFF;
3561   switch (sz) {
3562      case 4:
3563         __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
3564                              : : "a" (data), "Nd" (portno));
3565	 break;
3566      case 2:
3567         __asm__ __volatile__("outw %w0, %w1"
3568                              : : "a" (data), "Nd" (portno));
3569	 break;
3570      case 1:
3571         __asm__ __volatile__("outb %b0, %w1"
3572                              : : "a" (data), "Nd" (portno));
3573	 break;
3574      default:
3575         break; /* note: no 64-bit version of insn exists */
3576   }
3577#  else
3578   /* do nothing */
3579#  endif
3580}
3581
3582/* CALLED FROM GENERATED CODE */
3583/* DIRTY HELPER (non-referentially-transparent) */
3584/* Horrible hack.  On non-amd64 platforms, do nothing. */
3585/* op = 0: call the native SGDT instruction.
3586   op = 1: call the native SIDT instruction.
3587*/
3588void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3589#  if defined(__x86_64__)
3590   switch (op) {
3591      case 0:
3592         __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3593         break;
3594      case 1:
3595         __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3596         break;
3597      default:
3598         vpanic("amd64g_dirtyhelper_SxDT");
3599   }
3600#  else
3601   /* do nothing */
3602   UChar* p = (UChar*)address;
3603   p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3604   p[6] = p[7] = p[8] = p[9] = 0;
3605#  endif
3606}
3607
3608/*---------------------------------------------------------------*/
3609/*--- Helpers for MMX/SSE/SSE2.                               ---*/
3610/*---------------------------------------------------------------*/
3611
3612static inline UChar abdU8 ( UChar xx, UChar yy ) {
3613   return toUChar(xx>yy ? xx-yy : yy-xx);
3614}
3615
3616static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
3617   return (((ULong)w1) << 32) | ((ULong)w0);
3618}
3619
3620static inline UShort sel16x4_3 ( ULong w64 ) {
3621   UInt hi32 = toUInt(w64 >> 32);
3622   return toUShort(hi32 >> 16);
3623}
3624static inline UShort sel16x4_2 ( ULong w64 ) {
3625   UInt hi32 = toUInt(w64 >> 32);
3626   return toUShort(hi32);
3627}
3628static inline UShort sel16x4_1 ( ULong w64 ) {
3629   UInt lo32 = toUInt(w64);
3630   return toUShort(lo32 >> 16);
3631}
3632static inline UShort sel16x4_0 ( ULong w64 ) {
3633   UInt lo32 = toUInt(w64);
3634   return toUShort(lo32);
3635}
3636
3637static inline UChar sel8x8_7 ( ULong w64 ) {
3638   UInt hi32 = toUInt(w64 >> 32);
3639   return toUChar(hi32 >> 24);
3640}
3641static inline UChar sel8x8_6 ( ULong w64 ) {
3642   UInt hi32 = toUInt(w64 >> 32);
3643   return toUChar(hi32 >> 16);
3644}
3645static inline UChar sel8x8_5 ( ULong w64 ) {
3646   UInt hi32 = toUInt(w64 >> 32);
3647   return toUChar(hi32 >> 8);
3648}
3649static inline UChar sel8x8_4 ( ULong w64 ) {
3650   UInt hi32 = toUInt(w64 >> 32);
3651   return toUChar(hi32 >> 0);
3652}
3653static inline UChar sel8x8_3 ( ULong w64 ) {
3654   UInt lo32 = toUInt(w64);
3655   return toUChar(lo32 >> 24);
3656}
3657static inline UChar sel8x8_2 ( ULong w64 ) {
3658   UInt lo32 = toUInt(w64);
3659   return toUChar(lo32 >> 16);
3660}
3661static inline UChar sel8x8_1 ( ULong w64 ) {
3662   UInt lo32 = toUInt(w64);
3663   return toUChar(lo32 >> 8);
3664}
3665static inline UChar sel8x8_0 ( ULong w64 ) {
3666   UInt lo32 = toUInt(w64);
3667   return toUChar(lo32 >> 0);
3668}
3669
3670/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3671ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
3672{
3673   return
3674      mk32x2(
3675         (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
3676            + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
3677         (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
3678            + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
3679      );
3680}
3681
3682/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3683ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3684{
3685   UInt t = 0;
3686   t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3687   t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3688   t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3689   t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3690   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3691   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3692   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3693   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3694   t &= 0xFFFF;
3695   return (ULong)t;
3696}
3697
3698/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3699ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3700{
3701   UShort t, min;
3702   UInt   idx;
3703   t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
3704   t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3705   t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3706   t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3707   t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3708   t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3709   t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3710   t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3711   return ((ULong)(idx << 16)) | ((ULong)min);
3712}
3713
3714/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3715ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3716{
3717   UInt  i;
3718   ULong crc = (b & 0xFFULL) ^ crcIn;
3719   for (i = 0; i < 8; i++)
3720      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3721   return crc;
3722}
3723
3724/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3725ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3726{
3727   UInt  i;
3728   ULong crc = (w & 0xFFFFULL) ^ crcIn;
3729   for (i = 0; i < 16; i++)
3730      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3731   return crc;
3732}
3733
3734/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3735ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3736{
3737   UInt i;
3738   ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3739   for (i = 0; i < 32; i++)
3740      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3741   return crc;
3742}
3743
3744/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3745ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3746{
3747   ULong crc = amd64g_calc_crc32l(crcIn, q);
3748   return amd64g_calc_crc32l(crc, q >> 32);
3749}
3750
3751
3752/* .. helper for next fn .. */
3753static inline ULong sad_8x4 ( ULong xx, ULong yy )
3754{
3755   UInt t = 0;
3756   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3757   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3758   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3759   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3760   return (ULong)t;
3761}
3762
3763/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3764ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3765                            ULong dHi, ULong dLo,
3766                            ULong imm_and_return_control_bit )
3767{
3768   UInt imm8     = imm_and_return_control_bit & 7;
3769   Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
3770   UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3771   UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3772   /* For src we only need 32 bits, so get them into the
3773      lower half of a 64 bit word. */
3774   ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3775   /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3776      11 bytes.  If calculating the low part of the result, need bytes
3777      dstOffsL * 4 + (0 .. 6); if calculating the high part,
3778      dstOffsL * 4 + (4 .. 10). */
3779   ULong dst;
3780   /* dstOffL = 0, Lo  ->  0 .. 6
3781      dstOffL = 1, Lo  ->  4 .. 10
3782      dstOffL = 0, Hi  ->  4 .. 10
3783      dstOffL = 1, Hi  ->  8 .. 14
3784   */
3785   if (calcHi && dstOffsL) {
3786      /* 8 .. 14 */
3787      dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3788   }
3789   else if (!calcHi && !dstOffsL) {
3790      /* 0 .. 6 */
3791      dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3792   }
3793   else {
3794      /* 4 .. 10 */
3795      dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3796   }
3797   ULong r0  = sad_8x4( dst >>  0, src );
3798   ULong r1  = sad_8x4( dst >>  8, src );
3799   ULong r2  = sad_8x4( dst >> 16, src );
3800   ULong r3  = sad_8x4( dst >> 24, src );
3801   ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3802   return res;
3803}
3804
3805/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3806ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
3807{
3808   ULong dst = 0;
3809   ULong src_bit;
3810   ULong dst_bit = 1;
3811   for (src_bit = 1; src_bit; src_bit <<= 1) {
3812      if (mask & src_bit) {
3813         if (src_masked & src_bit) dst |= dst_bit;
3814         dst_bit <<= 1;
3815      }
3816   }
3817   return dst;
3818}
3819
3820/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3821ULong amd64g_calculate_pdep ( ULong src, ULong mask )
3822{
3823   ULong dst = 0;
3824   ULong dst_bit;
3825   ULong src_bit = 1;
3826   for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
3827      if (mask & dst_bit) {
3828         if (src & src_bit) dst |= dst_bit;
3829         src_bit <<= 1;
3830      }
3831   }
3832   return dst;
3833}
3834
3835/*---------------------------------------------------------------*/
3836/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
3837/*---------------------------------------------------------------*/
3838
3839static UInt zmask_from_V128 ( V128* arg )
3840{
3841   UInt i, res = 0;
3842   for (i = 0; i < 16; i++) {
3843      res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
3844   }
3845   return res;
3846}
3847
3848static UInt zmask_from_V128_wide ( V128* arg )
3849{
3850   UInt i, res = 0;
3851   for (i = 0; i < 8; i++) {
3852      res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
3853   }
3854   return res;
3855}
3856
3857/* Helps with PCMP{I,E}STR{I,M}.
3858
3859   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
3860   actually it could be a clean helper, but for the fact that we can't
3861   pass by value 2 x V128 to a clean helper, nor have one returned.)
3862   Reads guest state, writes to guest state for the xSTRM cases, no
3863   accesses of memory, is a pure function.
3864
3865   opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
3866   the callee knows which I/E and I/M variant it is dealing with and
3867   what the specific operation is.  4th byte of opcode is in the range
3868   0x60 to 0x63:
3869       istri  66 0F 3A 63
3870       istrm  66 0F 3A 62
3871       estri  66 0F 3A 61
3872       estrm  66 0F 3A 60
3873
3874   gstOffL and gstOffR are the guest state offsets for the two XMM
3875   register inputs.  We never have to deal with the memory case since
3876   that is handled by pre-loading the relevant value into the fake
3877   XMM16 register.
3878
3879   For ESTRx variants, edxIN and eaxIN hold the values of those two
3880   registers.
3881
3882   In all cases, the bottom 16 bits of the result contain the new
3883   OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
3884   result hold the new %ecx value.  For xSTRM variants, the helper
3885   writes the result directly to the guest XMM0.
3886
3887   Declarable side effects: in all cases, reads guest state at
3888   [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
3889   guest_XMM0.
3890
3891   Is expected to be called with opc_and_imm combinations which have
3892   actually been validated, and will assert if otherwise.  The front
3893   end should ensure we're only called with verified values.
3894*/
3895ULong amd64g_dirtyhelper_PCMPxSTRx (
3896          VexGuestAMD64State* gst,
3897          HWord opc4_and_imm,
3898          HWord gstOffL, HWord gstOffR,
3899          HWord edxIN, HWord eaxIN
3900       )
3901{
3902   HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
3903   HWord imm8 = opc4_and_imm & 0xFF;
3904   HWord isISTRx = opc4 & 2;
3905   HWord isxSTRM = (opc4 & 1) ^ 1;
3906   vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
3907   HWord wide = (imm8 & 1);
3908
3909   // where the args are
3910   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3911   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3912
3913   /* Create the arg validity masks, either from the vectors
3914      themselves or from the supplied edx/eax values. */
3915   // FIXME: this is only right for the 8-bit data cases.
3916   // At least that is asserted above.
3917   UInt zmaskL, zmaskR;
3918
3919   // temp spot for the resulting flags and vector.
3920   V128 resV;
3921   UInt resOSZACP;
3922
3923   // for checking whether case was handled
3924   Bool ok = False;
3925
3926   if (wide) {
3927      if (isISTRx) {
3928         zmaskL = zmask_from_V128_wide(argL);
3929         zmaskR = zmask_from_V128_wide(argR);
3930      } else {
3931         Int tmp;
3932         tmp = edxIN & 0xFFFFFFFF;
3933         if (tmp < -8) tmp = -8;
3934         if (tmp > 8)  tmp = 8;
3935         if (tmp < 0)  tmp = -tmp;
3936         vassert(tmp >= 0 && tmp <= 8);
3937         zmaskL = (1 << tmp) & 0xFF;
3938         tmp = eaxIN & 0xFFFFFFFF;
3939         if (tmp < -8) tmp = -8;
3940         if (tmp > 8)  tmp = 8;
3941         if (tmp < 0)  tmp = -tmp;
3942         vassert(tmp >= 0 && tmp <= 8);
3943         zmaskR = (1 << tmp) & 0xFF;
3944      }
3945      // do the meyaath
3946      ok = compute_PCMPxSTRx_wide (
3947              &resV, &resOSZACP, argL, argR,
3948              zmaskL, zmaskR, imm8, (Bool)isxSTRM
3949           );
3950   } else {
3951      if (isISTRx) {
3952         zmaskL = zmask_from_V128(argL);
3953         zmaskR = zmask_from_V128(argR);
3954      } else {
3955         Int tmp;
3956         tmp = edxIN & 0xFFFFFFFF;
3957         if (tmp < -16) tmp = -16;
3958         if (tmp > 16)  tmp = 16;
3959         if (tmp < 0)   tmp = -tmp;
3960         vassert(tmp >= 0 && tmp <= 16);
3961         zmaskL = (1 << tmp) & 0xFFFF;
3962         tmp = eaxIN & 0xFFFFFFFF;
3963         if (tmp < -16) tmp = -16;
3964         if (tmp > 16)  tmp = 16;
3965         if (tmp < 0)   tmp = -tmp;
3966         vassert(tmp >= 0 && tmp <= 16);
3967         zmaskR = (1 << tmp) & 0xFFFF;
3968      }
3969      // do the meyaath
3970      ok = compute_PCMPxSTRx (
3971              &resV, &resOSZACP, argL, argR,
3972              zmaskL, zmaskR, imm8, (Bool)isxSTRM
3973           );
3974   }
3975
3976   // front end shouldn't pass us any imm8 variants we can't
3977   // handle.  Hence:
3978   vassert(ok);
3979
3980   // So, finally we need to get the results back to the caller.
3981   // In all cases, the new OSZACP value is the lowest 16 of
3982   // the return value.
3983   if (isxSTRM) {
3984      gst->guest_YMM0[0] = resV.w32[0];
3985      gst->guest_YMM0[1] = resV.w32[1];
3986      gst->guest_YMM0[2] = resV.w32[2];
3987      gst->guest_YMM0[3] = resV.w32[3];
3988      return resOSZACP & 0x8D5;
3989   } else {
3990      UInt newECX = resV.w32[0] & 0xFFFF;
3991      return (newECX << 16) | (resOSZACP & 0x8D5);
3992   }
3993}
3994
3995/*---------------------------------------------------------------*/
3996/*--- AES primitives and helpers                              ---*/
3997/*---------------------------------------------------------------*/
3998/* a 16 x 16 matrix */
3999static const UChar sbox[256] = {                   // row nr
4000   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4001   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4002   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4003   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4004   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4005   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4006   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4007   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4008   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4009   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4010   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4011   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4012   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4013   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4014   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4015   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4016   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4017   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4018   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4019   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4020   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4021   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4022   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4023   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4024   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4025   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4026   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4027   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4028   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4029   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4030   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4031   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4032};
4033static void SubBytes (V128* v)
4034{
4035   V128 r;
4036   UInt i;
4037   for (i = 0; i < 16; i++)
4038      r.w8[i] = sbox[v->w8[i]];
4039   *v = r;
4040}
4041
4042/* a 16 x 16 matrix */
4043static const UChar invsbox[256] = {                // row nr
4044   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4045   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4046   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4047   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4048   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4049   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4050   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4051   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4052   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4053   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4054   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4055   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4056   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4057   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4058   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4059   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4060   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4061   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4062   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4063   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4064   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4065   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4066   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4067   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4068   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4069   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4070   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4071   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4072   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4073   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4074   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4075   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
4076};
4077static void InvSubBytes (V128* v)
4078{
4079   V128 r;
4080   UInt i;
4081   for (i = 0; i < 16; i++)
4082      r.w8[i] = invsbox[v->w8[i]];
4083   *v = r;
4084}
4085
4086static const UChar ShiftRows_op[16] =
4087   {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
4088static void ShiftRows (V128* v)
4089{
4090   V128 r;
4091   UInt i;
4092   for (i = 0; i < 16; i++)
4093      r.w8[i] = v->w8[ShiftRows_op[15-i]];
4094   *v = r;
4095}
4096
4097static const UChar InvShiftRows_op[16] =
4098   {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
4099static void InvShiftRows (V128* v)
4100{
4101   V128 r;
4102   UInt i;
4103   for (i = 0; i < 16; i++)
4104      r.w8[i] = v->w8[InvShiftRows_op[15-i]];
4105   *v = r;
4106}
4107
4108/* Multiplication of the finite fields elements of AES.
4109   See "A Specification for The AES Algorithm Rijndael
4110        (by Joan Daemen & Vincent Rijmen)"
4111        Dr. Brian Gladman, v3.1, 3rd March 2001. */
4112/* N values so that (hex) xy = 0x03^N.
4113   0x00 cannot be used. We put 0xff for this value.*/
4114/* a 16 x 16 matrix */
4115static const UChar Nxy[256] = {                    // row nr
4116   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
4117   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
4118   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
4119   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
4120   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
4121   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
4122   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
4123   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
4124   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
4125   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
4126   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
4127   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
4128   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
4129   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
4130   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
4131   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
4132   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
4133   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
4134   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
4135   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
4136   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
4137   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
4138   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
4139   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
4140   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
4141   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
4142   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
4143   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
4144   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
4145   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
4146   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
4147   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
4148};
4149
4150/* E values so that E = 0x03^xy. */
4151static const UChar Exy[256] = {                    // row nr
4152   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
4153   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
4154   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
4155   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
4156   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
4157   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
4158   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
4159   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
4160   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
4161   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
4162   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
4163   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
4164   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
4165   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
4166   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
4167   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
4168   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
4169   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
4170   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
4171   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
4172   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
4173   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
4174   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
4175   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
4176   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
4177   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
4178   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
4179   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
4180   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
4181   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
4182   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
4183   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
4184
4185static inline UChar ff_mul(UChar u1, UChar u2)
4186{
4187   if ((u1 > 0) && (u2 > 0)) {
4188      UInt ui = Nxy[u1] + Nxy[u2];
4189      if (ui >= 255)
4190         ui = ui - 255;
4191      return Exy[ui];
4192   } else {
4193      return 0;
4194   };
4195}
4196
4197static void MixColumns (V128* v)
4198{
4199   V128 r;
4200   Int j;
4201#define P(x,row,col) (x)->w8[((row)*4+(col))]
4202   for (j = 0; j < 4; j++) {
4203      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
4204         ^ P(v,j,2) ^ P(v,j,3);
4205      P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
4206         ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
4207      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
4208         ^ ff_mul(0x03, P(v,j,3) );
4209      P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
4210         ^ ff_mul( 0x02, P(v,j,3) );
4211   }
4212   *v = r;
4213#undef P
4214}
4215
4216static void InvMixColumns (V128* v)
4217{
4218   V128 r;
4219   Int j;
4220#define P(x,row,col) (x)->w8[((row)*4+(col))]
4221   for (j = 0; j < 4; j++) {
4222      P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
4223         ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
4224      P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
4225         ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
4226      P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
4227         ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
4228      P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
4229         ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
4230   }
4231   *v = r;
4232#undef P
4233
4234}
4235
4236/* For description, see definition in guest_amd64_defs.h */
4237void amd64g_dirtyhelper_AES (
4238          VexGuestAMD64State* gst,
4239          HWord opc4, HWord gstOffD,
4240          HWord gstOffL, HWord gstOffR
4241       )
4242{
4243   // where the args are
4244   V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
4245   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4246   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4247   V128  r;
4248
4249   switch (opc4) {
4250      case 0xDC: /* AESENC */
4251      case 0xDD: /* AESENCLAST */
4252         r = *argR;
4253         ShiftRows (&r);
4254         SubBytes  (&r);
4255         if (opc4 == 0xDC)
4256            MixColumns (&r);
4257         argD->w64[0] = r.w64[0] ^ argL->w64[0];
4258         argD->w64[1] = r.w64[1] ^ argL->w64[1];
4259         break;
4260
4261      case 0xDE: /* AESDEC */
4262      case 0xDF: /* AESDECLAST */
4263         r = *argR;
4264         InvShiftRows (&r);
4265         InvSubBytes (&r);
4266         if (opc4 == 0xDE)
4267            InvMixColumns (&r);
4268         argD->w64[0] = r.w64[0] ^ argL->w64[0];
4269         argD->w64[1] = r.w64[1] ^ argL->w64[1];
4270         break;
4271
4272      case 0xDB: /* AESIMC */
4273         *argD = *argL;
4274         InvMixColumns (argD);
4275         break;
4276      default: vassert(0);
4277   }
4278}
4279
4280static inline UInt RotWord (UInt   w32)
4281{
4282   return ((w32 >> 8) | (w32 << 24));
4283}
4284
4285static inline UInt SubWord (UInt   w32)
4286{
4287   UChar *w8;
4288   UChar *r8;
4289   UInt res;
4290   w8 = (UChar*) &w32;
4291   r8 = (UChar*) &res;
4292   r8[0] = sbox[w8[0]];
4293   r8[1] = sbox[w8[1]];
4294   r8[2] = sbox[w8[2]];
4295   r8[3] = sbox[w8[3]];
4296   return res;
4297}
4298
4299/* For description, see definition in guest_amd64_defs.h */
4300extern void amd64g_dirtyhelper_AESKEYGENASSIST (
4301          VexGuestAMD64State* gst,
4302          HWord imm8,
4303          HWord gstOffL, HWord gstOffR
4304       )
4305{
4306   // where the args are
4307   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4308   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4309
4310   // We have to create the result in a temporary in the
4311   // case where the src and dst regs are the same.  See #341698.
4312   V128 tmp;
4313
4314   tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
4315   tmp.w32[2] = SubWord (argL->w32[3]);
4316   tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
4317   tmp.w32[0] = SubWord (argL->w32[1]);
4318
4319   argR->w32[3] = tmp.w32[3];
4320   argR->w32[2] = tmp.w32[2];
4321   argR->w32[1] = tmp.w32[1];
4322   argR->w32[0] = tmp.w32[0];
4323}
4324
4325
4326
4327/*---------------------------------------------------------------*/
4328/*--- Helpers for dealing with, and describing,               ---*/
4329/*--- guest state as a whole.                                 ---*/
4330/*---------------------------------------------------------------*/
4331
4332/* Initialise the entire amd64 guest state. */
4333/* VISIBLE TO LIBVEX CLIENT */
4334void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
4335{
4336   vex_state->host_EvC_FAILADDR = 0;
4337   vex_state->host_EvC_COUNTER = 0;
4338   vex_state->pad0 = 0;
4339
4340   vex_state->guest_RAX = 0;
4341   vex_state->guest_RCX = 0;
4342   vex_state->guest_RDX = 0;
4343   vex_state->guest_RBX = 0;
4344   vex_state->guest_RSP = 0;
4345   vex_state->guest_RBP = 0;
4346   vex_state->guest_RSI = 0;
4347   vex_state->guest_RDI = 0;
4348   vex_state->guest_R8  = 0;
4349   vex_state->guest_R9  = 0;
4350   vex_state->guest_R10 = 0;
4351   vex_state->guest_R11 = 0;
4352   vex_state->guest_R12 = 0;
4353   vex_state->guest_R13 = 0;
4354   vex_state->guest_R14 = 0;
4355   vex_state->guest_R15 = 0;
4356
4357   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
4358   vex_state->guest_CC_DEP1 = 0;
4359   vex_state->guest_CC_DEP2 = 0;
4360   vex_state->guest_CC_NDEP = 0;
4361
4362   vex_state->guest_DFLAG   = 1; /* forwards */
4363   vex_state->guest_IDFLAG  = 0;
4364   vex_state->guest_ACFLAG  = 0;
4365
4366   /* HACK: represent the offset associated with a constant %fs.
4367      Typically, on linux, this assumes that %fs is only ever zero (main
4368      thread) or 0x63. */
4369   vex_state->guest_FS_CONST = 0;
4370
4371   vex_state->guest_RIP = 0;
4372
4373   /* Initialise the simulated FPU */
4374   amd64g_dirtyhelper_FINIT( vex_state );
4375
4376   /* Initialise the AVX state. */
4377#  define AVXZERO(_ymm) \
4378      do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
4379           _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
4380      } while (0)
4381   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
4382   AVXZERO(vex_state->guest_YMM0);
4383   AVXZERO(vex_state->guest_YMM1);
4384   AVXZERO(vex_state->guest_YMM2);
4385   AVXZERO(vex_state->guest_YMM3);
4386   AVXZERO(vex_state->guest_YMM4);
4387   AVXZERO(vex_state->guest_YMM5);
4388   AVXZERO(vex_state->guest_YMM6);
4389   AVXZERO(vex_state->guest_YMM7);
4390   AVXZERO(vex_state->guest_YMM8);
4391   AVXZERO(vex_state->guest_YMM9);
4392   AVXZERO(vex_state->guest_YMM10);
4393   AVXZERO(vex_state->guest_YMM11);
4394   AVXZERO(vex_state->guest_YMM12);
4395   AVXZERO(vex_state->guest_YMM13);
4396   AVXZERO(vex_state->guest_YMM14);
4397   AVXZERO(vex_state->guest_YMM15);
4398   AVXZERO(vex_state->guest_YMM16);
4399
4400#  undef AVXZERO
4401
4402   vex_state->guest_EMNOTE = EmNote_NONE;
4403
4404   /* These should not ever be either read or written, but we
4405      initialise them anyway. */
4406   vex_state->guest_CMSTART = 0;
4407   vex_state->guest_CMLEN   = 0;
4408
4409   vex_state->guest_NRADDR   = 0;
4410   vex_state->guest_SC_CLASS = 0;
4411   vex_state->guest_GS_CONST = 0;
4412
4413   vex_state->guest_IP_AT_SYSCALL = 0;
4414   vex_state->pad1 = 0;
4415}
4416
4417
4418/* Figure out if any part of the guest state contained in minoff
4419   .. maxoff requires precise memory exceptions.  If in doubt return
4420   True (but this generates significantly slower code).
4421
4422   By default we enforce precise exns for guest %RSP, %RBP and %RIP
4423   only.  These are the minimum needed to extract correct stack
4424   backtraces from amd64 code.
4425
4426   Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
4427*/
4428Bool guest_amd64_state_requires_precise_mem_exns (
4429        Int minoff, Int maxoff, VexRegisterUpdates pxControl
4430     )
4431{
4432   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
4433   Int rbp_max = rbp_min + 8 - 1;
4434   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
4435   Int rsp_max = rsp_min + 8 - 1;
4436   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
4437   Int rip_max = rip_min + 8 - 1;
4438
4439   if (maxoff < rsp_min || minoff > rsp_max) {
4440      /* no overlap with rsp */
4441      if (pxControl == VexRegUpdSpAtMemAccess)
4442         return False; // We only need to check stack pointer.
4443   } else {
4444      return True;
4445   }
4446
4447   if (maxoff < rbp_min || minoff > rbp_max) {
4448      /* no overlap with rbp */
4449   } else {
4450      return True;
4451   }
4452
4453   if (maxoff < rip_min || minoff > rip_max) {
4454      /* no overlap with eip */
4455   } else {
4456      return True;
4457   }
4458
4459   return False;
4460}
4461
4462
4463#define ALWAYSDEFD(field)                             \
4464    { offsetof(VexGuestAMD64State, field),            \
4465      (sizeof ((VexGuestAMD64State*)0)->field) }
4466
4467VexGuestLayout
4468   amd64guest_layout
4469      = {
4470          /* Total size of the guest state, in bytes. */
4471          .total_sizeB = sizeof(VexGuestAMD64State),
4472
4473          /* Describe the stack pointer. */
4474          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
4475          .sizeof_SP = 8,
4476
4477          /* Describe the frame pointer. */
4478          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
4479          .sizeof_FP = 8,
4480
4481          /* Describe the instruction pointer. */
4482          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
4483          .sizeof_IP = 8,
4484
4485          /* Describe any sections to be regarded by Memcheck as
4486             'always-defined'. */
4487          .n_alwaysDefd = 16,
4488
4489          /* flags thunk: OP and NDEP are always defd, whereas DEP1
4490             and DEP2 have to be tracked.  See detailed comment in
4491             gdefs.h on meaning of thunk fields. */
4492          .alwaysDefd
4493             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
4494                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
4495		 /*  2 */ ALWAYSDEFD(guest_DFLAG),
4496                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
4497                 /*  4 */ ALWAYSDEFD(guest_RIP),
4498                 /*  5 */ ALWAYSDEFD(guest_FS_CONST),
4499                 /*  6 */ ALWAYSDEFD(guest_FTOP),
4500                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
4501                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
4502                 /*  9 */ ALWAYSDEFD(guest_FC3210),
4503                 // /* */ ALWAYSDEFD(guest_CS),
4504                 // /* */ ALWAYSDEFD(guest_DS),
4505                 // /* */ ALWAYSDEFD(guest_ES),
4506                 // /* */ ALWAYSDEFD(guest_FS),
4507                 // /* */ ALWAYSDEFD(guest_GS),
4508                 // /* */ ALWAYSDEFD(guest_SS),
4509                 // /* */ ALWAYSDEFD(guest_LDT),
4510                 // /* */ ALWAYSDEFD(guest_GDT),
4511                 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
4512                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
4513                 /* 12 */ ALWAYSDEFD(guest_CMSTART),
4514                 /* 13 */ ALWAYSDEFD(guest_CMLEN),
4515                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
4516                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
4517               }
4518        };
4519
4520
4521/*---------------------------------------------------------------*/
4522/*--- end                               guest_amd64_helpers.c ---*/
4523/*---------------------------------------------------------------*/
4524