guest_amd64_helpers.c revision 663860b1408516d02ebfcb3a9999a134e6cfb223

/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2012 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emwarn.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
    Long u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
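
/* A minimal, disabled sanity check for the schoolbook 32-bit
   splitting used above.  It assumes the host compiler provides
   unsigned __int128 (true for gcc/clang on 64-bit hosts, but not
   guaranteed in general -- which is exactly why the portable
   decomposition is used instead). */
#if 0
static void selftest_mullU64 ( void )
{
   ULong u = 0xDEADBEEFCAFEBABEULL;
   ULong v = 0x0123456789ABCDEFULL;
   ULong hi, lo;
   unsigned __int128 ref = (unsigned __int128)u * (unsigned __int128)v;
   mullU64(u, v, &hi, &lo);
   vassert(hi == (ULong)(ref >> 64));  /* high 64 bits must match */
   vassert(lo == (ULong)ref);          /* low 64 bits must match */
}
#endif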


static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
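
/* The table above is just a precomputed form of the x86 parity rule:
   PF is set iff the low 8 bits of a result contain an even number of
   1 bits.  A disabled cross-check of that equivalence, in portable C: */
#if 0
static void selftest_parity_table ( void )
{
   UInt b, i, ones;
   for (b = 0; b < 256; b++) {
      for (ones = 0, i = 0; i < 8; i++)
         ones += (b >> i) & 1;   /* count the 1 bits of b */
      vassert(parity_table[b] == ((ones & 1) ? 0 : AMD64G_CC_MASK_P));
   }
}
#endif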

/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return x << n;
   else
      return x >> (-n);
}
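
/* Example: the flag macros below compute SF as
   "lshift(res, 8 - DATA_BITS) & 0x80".  With DATA_BITS == 32 that is
   a right shift by 24, which moves the 32-bit sign bit (bit 31) down
   to bit 7, exactly where SF lives in %rflags. */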

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)                                   \
   /* const */ ULong DATA_MASK                                  \
      = __data_bits==8                                          \
           ? 0xFFULL                                            \
           : (__data_bits==16                                   \
                ? 0xFFFFULL                                     \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;                  \
   /* const */ ULong CC_DEP2 = cc_dep2_formal;                  \
   /* const */ ULong CC_NDEP = cc_ndep_formal;                  \
   /* Four bogus assignments, which hopefully gcc can     */    \
   /* optimise away, and which stop it complaining about  */    \
   /* unused variables.                                   */    \
   SIGN_MASK = SIGN_MASK;                                       \
   DATA_MASK = DATA_MASK;                                       \
   CC_DEP2 = CC_DEP2;                                           \
   CC_NDEP = CC_NDEP;

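/* For example, PREAMBLE(16) gives DATA_MASK == 0xFFFF and
   SIGN_MASK == 0x8000: masking with DATA_MASK isolates the 16-bit
   result, and ANDing with SIGN_MASK picks out its sign bit. */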

/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     Long argL, argR, res;                                      \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL + argR;                                        \
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                   \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
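
/* Worked example, for ACTIONS_ADD(8, UChar): adding argL == 0x7F to
   argR == 0x01 gives res == 0x80.  CF is 0 since 0x80 >=u 0x7F (no
   unsigned wraparound); SF is 1 since bit 7 of res is set; and OF is
   1, since (argL ^ argR ^ -1) & (argL ^ res) has bit 7 set -- the
   operands agree in sign but the result does not -- and
   lshift(.., 12 - 8) moves that bit up to bit 11, AMD64G_CC_MASK_O. */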

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     Long argL, argR, res;                                      \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL - argR;                                        \
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;                  \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     Long argL, argR, oldC, res;                                \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL + argR) + oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;               \
     else                                                       \
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;           \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     Long argL, argR, oldC, res;                                \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL - argR) - oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;              \
     else                                                       \
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;               \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
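
/* Note on "argR = CC_DEP2 ^ oldC" in the two macros above: for
   ADC/SBB the DEP2 slot of the thunk holds the right-hand operand
   XORed with the old carry bit (see the corresponding thunk builders
   in guest-amd64/toIR.c), so the helpers first undo that XOR to
   recover the real argR.  Since oldC is 0 or 1, only bit 0 can be
   affected. */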

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)                     \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     cf = 0;                                                    \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     Long argL, argR, res;                                      \
     res  = CC_DEP1;                                            \
     argL = res - 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;               \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     Long argL, argR, res;                                      \
     res  = CC_DEP1;                                            \
     argL = res + 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK)                                    \
          == ((ULong)SIGN_MASK - 1)) << 11;                     \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;      \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     cf = CC_DEP2 & 1;                                          \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long fl                                                    \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & CC_DEP1)                        \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                      11-(DATA_BITS-1))         \
                     ^ lshift(CC_DEP1, 11)));                   \
     return fl;                                                 \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long fl                                                    \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))     \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                      11-(DATA_BITS-1))         \
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));   \
     return fl;                                                 \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
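
/* Example, for ACTIONS_UMUL(16, ...): 0x0100 * 0x0100 == 0x10000, so
   lo == 0x0000 and hi == 0x0001.  CF and OF are both set because the
   high half is nonzero, i.e. the full product no longer fits in 16
   bits -- the architected rule for MUL. */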

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_STYPE)CC_DEP1)                      \
                     * ((DATA_STYPE)CC_DEP2) );                 \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
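
/* For the signed case the rule is subtler: CF and OF are set iff hi
   differs from the sign-extension of lo, i.e. iff the double-length
   product is not representable in DATA_BITS bits.  Example with
   DATA_BITS == 8: 16 * 8 == 128 == 0x0080, so lo == (Char)0x80 == -128
   and hi == 0; but lo >>s 7 == -1 != 0, hence CF == OF == 1. */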

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}


#if PROFILE_RFLAGS

static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   Char ch;
   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt   );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
      //      case AMD64G_CC_OP_SUBL:
      //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBW:
      //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBB:
      //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_INCL:
      //      case AMD64G_CC_OP_DECL:
      //         return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of, sf, zf, cf, pf;
   ULong inv = cond & 1;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
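
   /* amd64 condition codes come in complementary pairs (O/NO, B/NB,
      Z/NZ, ...) whose encodings differ only in bit 0, so the "inv"
      bit extracted above makes each case below serve both members of
      its pair: "1 & (inv ^ flag)" yields the flag for the even code
      and its negation for the odd one. */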

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
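   /* Only the OSZACP bits live in the flags thunk; the remaining
      architected bits are kept separately in the guest state and are
      merged in here: bit 10 is DF, bit 18 is AC, bit 21 is ID. */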
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls the above run-time    ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}

IRExpr* guest_amd64_spechelper ( HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];
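
      /* As an illustration (register names purely for example): for
         "cmpq %rsi, %rdi ; jz ..." toIR generates a call
         amd64g_calculate_condition(AMD64CondZ, AMD64G_CC_OP_SUBQ,
         rdi, rsi, ndep), and the SUBQ/CondZ rule below folds the
         whole call down to Iop_1Uto64(Iop_CmpEQ64(rdi, rsi)). */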

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- SUBQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /*---------------- SUBL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));

      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));

      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           binop(Iop_Sub32,
                                 unop(Iop_64to32, cc_dep1),
                                 unop(Iop_64to32, cc_dep2)),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /*---------------- SUBW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
         /* word sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           binop(Iop_Shl64,cc_dep2,mkU8(48))));

      }
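
      /* The Shl64-by-48 trick above: once both 16-bit operands sit in
         the top 16 bits of a 64-bit word, with zeroes below, a 64-bit
         signed comparison gives the same answer as a 16-bit signed
         comparison of the original values would. */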

      /*---------------- SUBB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
         /* byte sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[7]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
         /* long long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
         /* long long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
         /* long long and/or/xor, then L
            LOGIC sets SF and ZF according to the
            result and makes OF be zero.  L computes SF ^ OF, but
            OF is zero, so this reduces to SF -- which will be 1 iff
            the result is < signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S,
                           cc_dep1,
                           mkU64(0)));
      }

      /*---------------- LOGICL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
         /* long and/or/xor, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
         /* long and/or/xor, then NS --> (ULong) ~ result[31] */
         return binop(Iop_Xor64,
                binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1)),
                mkU64(1));
      }

      /*---------------- LOGICW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
         /* word and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
         /* word and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }

      /*---------------- LOGICB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (ULong)result[7] */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
         /* byte and/or/xor, then NS --> (ULong) !result[7] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- INCB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
         /* 8-bit inc, then LE --> sign bit of the arg */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, mkU64(1)),
                            mkU8(7)),
                      mkU64(1));
      }

      /*---------------- INCW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
         /* 16-bit inc, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- DECL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
         /* 16-bit dec, then NZ --> test dst != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- COPY ----------------*/
      /* This can happen, as a result of amd64 FP compares: "comisd ... ;
         jbe" for example. */

      if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
          (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
         /* COPY, then BE --> extract C and Z from dep1, and test (C
            or Z == 1). */
         /* COPY, then NBE --> extract C and Z from dep1, and test (C
            or Z == 0). */
         ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(
                        Iop_Or64,
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
                     ),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
         /* COPY, then B --> extract C from dep1, and test (C == 1). */
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpNE64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                     mkU64(1)
                  ),
                  mkU64(0)
               )
            );
      }
1341
1342      if (isU64(cc_op, AMD64G_CC_OP_COPY)
1343          && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1344         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1345         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1346         UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1347         return
1348            unop(
1349               Iop_1Uto64,
1350               binop(
1351                  Iop_CmpEQ64,
1352                  binop(
1353                     Iop_And64,
1354                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1355                     mkU64(1)
1356                  ),
1357                  mkU64(nnn)
1358               )
1359            );
1360      }
1361
1362      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
1363         /* COPY, then P --> extract P from dep1, and test (P == 1). */
1364         return
1365            unop(
1366               Iop_1Uto64,
1367               binop(
1368                  Iop_CmpNE64,
1369                  binop(
1370                     Iop_And64,
1371                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1372                     mkU64(1)
1373                  ),
1374                  mkU64(0)
1375               )
1376            );
1377      }
1378
1379      return NULL;
1380   }
1381
1382   /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1383
1384   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1385      /* specialise calls to above "calculate_rflags_c" function */
1386      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1387      vassert(arity == 4);
1388      cc_op   = args[0];
1389      cc_dep1 = args[1];
1390      cc_dep2 = args[2];
1391      cc_ndep = args[3];
1392
1393      if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1394         /* C after sub denotes unsigned less than */
1395         return unop(Iop_1Uto64,
1396                     binop(Iop_CmpLT64U,
1397                           cc_dep1,
1398                           cc_dep2));
1399      }
1400      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1401         /* C after sub denotes unsigned less than */
1402         return unop(Iop_1Uto64,
1403                     binop(Iop_CmpLT32U,
1404                           unop(Iop_64to32, cc_dep1),
1405                           unop(Iop_64to32, cc_dep2)));
1406      }
1407      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1408         /* C after sub denotes unsigned less than */
1409         return unop(Iop_1Uto64,
1410                     binop(Iop_CmpLT64U,
1411                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1412                           binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1413      }
1414      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1415          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1416          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1417          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1418         /* cflag after logic is zero */
1419         return mkU64(0);
1420      }
1421      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1422          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1423         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1424         return cc_ndep;
1425      }
1426
1427#     if 0
1428      if (cc_op->tag == Iex_Const) {
1429         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1430      }
1431#     endif
1432
1433      return NULL;
1434   }
1435
1436#  undef unop
1437#  undef binop
1438#  undef mkU64
1439#  undef mkU32
1440#  undef mkU8
1441
1442   return NULL;
1443}
1444
1445
1446/*---------------------------------------------------------------*/
1447/*--- Supporting functions for x87 FPU activities.            ---*/
1448/*---------------------------------------------------------------*/
1449
1450static inline Bool host_is_little_endian ( void )
1451{
1452   UInt x = 0x76543210;
1453   UChar* p = (UChar*)(&x);
1454   return toBool(*p == 0x10);
1455}
1456
1457/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
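/* The classification is returned in the C3,C2,C1,C0 bits of the FPU
   status word, with C1 holding the value's sign.  For reference, the
   (C3,C2,C1,C0) encodings produced below are:
      empty     1,0,sign,1      zero      1,0,sign,0
      denormal  1,1,sign,0      infinity  0,1,sign,1
      NaN       0,0,sign,1      normal    0,1,sign,0 */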
1458/* CALLED FROM GENERATED CODE: CLEAN HELPER */
1459ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1460{
1461   Bool   mantissaIsZero;
1462   Int    bexp;
1463   UChar  sign;
1464   UChar* f64;
1465
1466   vassert(host_is_little_endian());
1467
1468   /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1469
1470   f64  = (UChar*)(&dbl);
1471   sign = toUChar( (f64[7] >> 7) & 1 );
1472
1473   /* First off, if the tag indicates the register was empty,
1474      return 1,0,sign,1 */
1475   if (tag == 0) {
1476      /* vex_printf("Empty\n"); */
1477      return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1478                                   | AMD64G_FC_MASK_C0;
1479   }
1480
1481   bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1482   bexp &= 0x7FF;
1483
1484   mantissaIsZero
1485      = toBool(
1486           (f64[6] & 0x0F) == 0
1487           && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1488        );
1489
1490   /* If both exponent and mantissa are zero, the value is zero.
1491      Return 1,0,sign,0. */
1492   if (bexp == 0 && mantissaIsZero) {
1493      /* vex_printf("Zero\n"); */
1494      return AMD64G_FC_MASK_C3 | 0
1495                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1496   }
1497
1498   /* If exponent is zero but mantissa isn't, it's a denormal.
1499      Return 1,1,sign,0. */
1500   if (bexp == 0 && !mantissaIsZero) {
1501      /* vex_printf("Denormal\n"); */
1502      return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1503                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1504   }
1505
1506   /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1507      Return 0,1,sign,1. */
1508   if (bexp == 0x7FF && mantissaIsZero) {
1509      /* vex_printf("Inf\n"); */
1510      return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
1511                                   | AMD64G_FC_MASK_C0;
1512   }
1513
1514   /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
1515      Return 0,0,sign,1. */
1516   if (bexp == 0x7FF && !mantissaIsZero) {
1517      /* vex_printf("NaN\n"); */
1518      return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
1519   }
1520
1521   /* Uh, ok, we give up.  It must be a normal finite number.
1522      Return 0,1,sign,0.
1523   */
1524   /* vex_printf("normal\n"); */
1525   return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1526}
1527
1528
1529/* This is used to implement both 'frstor' and 'fldenv'.  The latter
1530   appears to differ from the former only in that the 8 FP registers
1531   themselves are not transferred into the guest state. */
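/* For reference: the hardware tag word holds two bits per register
   (00 = valid, 01 = zero, 10 = special, 11 = empty), whereas the VEX
   guest state keeps one byte per register, with 0 meaning empty and
   1 meaning full.  The 4-way hardware tag is therefore collapsed to
   empty/non-empty below. */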
1532static
1533VexEmWarn do_put_x87 ( Bool moveRegs,
1534                       /*IN*/UChar* x87_state,
1535                       /*OUT*/VexGuestAMD64State* vex_state )
1536{
1537   Int        stno, preg;
1538   UInt       tag;
1539   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1540   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1541   Fpu_State* x87     = (Fpu_State*)x87_state;
1542   UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
1543   UInt       tagw    = x87->env[FP_ENV_TAG];
1544   UInt       fpucw   = x87->env[FP_ENV_CTRL];
1545   UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
1546   VexEmWarn  ew;
1547   UInt       fpround;
1548   ULong      pair;
1549
1550   /* Copy registers and tags */
1551   for (stno = 0; stno < 8; stno++) {
1552      preg = (stno + ftop) & 7;
1553      tag = (tagw >> (2*preg)) & 3;
1554      if (tag == 3) {
1555         /* register is empty */
1556         /* hmm, if it's empty, does it still get written?  Probably
1557            safer to say it does.  If we don't, memcheck could get out
1558            of sync, in that it thinks all FP registers are defined by
1559            this helper, but in reality some have not been updated. */
1560         if (moveRegs)
1561            vexRegs[preg] = 0; /* IEEE754 64-bit zero */
1562         vexTags[preg] = 0;
1563      } else {
1564         /* register is non-empty */
1565         if (moveRegs)
1566            convert_f80le_to_f64le( &x87->reg[10*stno],
1567                                    (UChar*)&vexRegs[preg] );
1568         vexTags[preg] = 1;
1569      }
1570   }
1571
1572   /* stack pointer */
1573   vex_state->guest_FTOP = ftop;
1574
1575   /* status word */
1576   vex_state->guest_FC3210 = c3210;
1577
1578   /* handle the control word, setting FPROUND and detecting any
1579      emulation warnings. */
1580   pair    = amd64g_check_fldcw ( (ULong)fpucw );
1581   fpround = (UInt)pair & 0xFFFFFFFFULL;
1582   ew      = (VexEmWarn)(pair >> 32);
1583
1584   vex_state->guest_FPROUND = fpround & 3;
1585
1586   /* emulation warnings --> caller */
1587   return ew;
1588}
1589
1590
1591/* Create an x87 FPU state from the guest state, as close as
1592   we can approximate it. */
1593static
1594void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
1595                  /*OUT*/UChar* x87_state )
1596{
1597   Int        i, stno, preg;
1598   UInt       tagw;
1599   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1600   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1601   Fpu_State* x87     = (Fpu_State*)x87_state;
1602   UInt       ftop    = vex_state->guest_FTOP;
1603   UInt       c3210   = vex_state->guest_FC3210;
1604
1605   for (i = 0; i < 14; i++)
1606      x87->env[i] = 0;
1607
1608   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1609   x87->env[FP_ENV_STAT]
1610      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
1611   x87->env[FP_ENV_CTRL]
1612      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
1613
1614   /* Dump the register stack in ST order. */
1615   tagw = 0;
1616   for (stno = 0; stno < 8; stno++) {
1617      preg = (stno + ftop) & 7;
1618      if (vexTags[preg] == 0) {
1619         /* register is empty */
1620         tagw |= (3 << (2*preg));
1621         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1622                                 &x87->reg[10*stno] );
1623      } else {
1624         /* register is full. */
1625         tagw |= (0 << (2*preg));
1626         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1627                                 &x87->reg[10*stno] );
1628      }
1629   }
1630   x87->env[FP_ENV_TAG] = toUShort(tagw);
1631}
1632
1633
1634/* CALLED FROM GENERATED CODE */
1635/* DIRTY HELPER (reads guest state, writes guest mem) */
1636/* NOTE: only handles 32-bit format (no REX.W on the insn) */
1637void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
1638{
1639   /* Derived from values obtained from
1640      vendor_id       : AuthenticAMD
1641      cpu family      : 15
1642      model           : 12
1643      model name      : AMD Athlon(tm) 64 Processor 3200+
1644      stepping        : 0
1645      cpu MHz         : 2200.000
1646      cache size      : 512 KB
1647   */
1648   /* Somewhat roundabout, but at least it's simple. */
1649   Fpu_State tmp;
1650   UShort*   addrS = (UShort*)addr;
1651   UChar*    addrC = (UChar*)addr;
1652   U128*     xmm   = (U128*)(addr + 160);
1653   UInt      mxcsr;
1654   UShort    fp_tags;
1655   UInt      summary_tags;
1656   Int       r, stno;
1657   UShort    *srcS, *dstS;
1658
1659   do_get_x87( gst, (UChar*)&tmp );
1660   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
1661
1662   /* Now build the proper fxsave image from the x87 image we just
1663      made. */
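
   /* For reference, the image layout built below (byte offsets):
        0   FCW            2   FSW            4   FTW summary + pad
        6   FOP            8   RIP (zeroed)   16  RDP (zeroed)
        24  MXCSR          28  MXCSR mask
        32  ST0..ST7, 16 bytes apart (10 data bytes + 6 zero pad)
        160 XMM0..XMM15, 16 bytes each */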
1664
1665   addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
1667
1668   /* set addrS[2] in an endian-independent way */
1669   summary_tags = 0;
1670   fp_tags = tmp.env[FP_ENV_TAG];
1671   for (r = 0; r < 8; r++) {
1672      if ( ((fp_tags >> (2*r)) & 3) != 3 )
1673         summary_tags |= (1 << r);
1674   }
1675   addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
1676   addrC[5]  = 0; /* pad */
1677
1678   /* FOP: faulting fpu opcode.  From experimentation, the real CPU
1679      does not write this field. (?!) */
1680   addrS[3]  = 0; /* BOGUS */
1681
1682   /* RIP (Last x87 instruction pointer).  From experimentation, the
1683      real CPU does not write this field. (?!) */
1684   addrS[4]  = 0; /* BOGUS */
1685   addrS[5]  = 0; /* BOGUS */
1686   addrS[6]  = 0; /* BOGUS */
1687   addrS[7]  = 0; /* BOGUS */
1688
1689   /* RDP (Last x87 data pointer).  From experimentation, the real CPU
1690      does not write this field. (?!) */
1691   addrS[8]  = 0; /* BOGUS */
1692   addrS[9]  = 0; /* BOGUS */
1693   addrS[10] = 0; /* BOGUS */
1694   addrS[11] = 0; /* BOGUS */
1695
1696   addrS[12] = toUShort(mxcsr);  /* MXCSR */
1697   addrS[13] = toUShort(mxcsr >> 16);
1698
1699   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
1700   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
1701
1702   /* Copy in the FP registers, in ST order. */
1703   for (stno = 0; stno < 8; stno++) {
1704      srcS = (UShort*)(&tmp.reg[10*stno]);
1705      dstS = (UShort*)(&addrS[16 + 8*stno]);
1706      dstS[0] = srcS[0];
1707      dstS[1] = srcS[1];
1708      dstS[2] = srcS[2];
1709      dstS[3] = srcS[3];
1710      dstS[4] = srcS[4];
1711      dstS[5] = 0;
1712      dstS[6] = 0;
1713      dstS[7] = 0;
1714   }
1715
1716   /* That's the first 160 bytes of the image done.  Now only %xmm0
1717      .. %xmm15 remain to be copied.  If the host is big-endian, these
1718      need to be byte-swapped. */
1719   vassert(host_is_little_endian());
1720
1721#  define COPY_U128(_dst,_src)                       \
1722      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1723           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1724      while (0)
1725
1726   COPY_U128( xmm[0],  gst->guest_YMM0 );
1727   COPY_U128( xmm[1],  gst->guest_YMM1 );
1728   COPY_U128( xmm[2],  gst->guest_YMM2 );
1729   COPY_U128( xmm[3],  gst->guest_YMM3 );
1730   COPY_U128( xmm[4],  gst->guest_YMM4 );
1731   COPY_U128( xmm[5],  gst->guest_YMM5 );
1732   COPY_U128( xmm[6],  gst->guest_YMM6 );
1733   COPY_U128( xmm[7],  gst->guest_YMM7 );
1734   COPY_U128( xmm[8],  gst->guest_YMM8 );
1735   COPY_U128( xmm[9],  gst->guest_YMM9 );
1736   COPY_U128( xmm[10], gst->guest_YMM10 );
1737   COPY_U128( xmm[11], gst->guest_YMM11 );
1738   COPY_U128( xmm[12], gst->guest_YMM12 );
1739   COPY_U128( xmm[13], gst->guest_YMM13 );
1740   COPY_U128( xmm[14], gst->guest_YMM14 );
1741   COPY_U128( xmm[15], gst->guest_YMM15 );
1742
1743#  undef COPY_U128
1744}
1745
1746
1747/* CALLED FROM GENERATED CODE */
1748/* DIRTY HELPER (writes guest state, reads guest mem) */
1749VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
1750{
1751   Fpu_State tmp;
1752   VexEmWarn warnX87 = EmWarn_NONE;
1753   VexEmWarn warnXMM = EmWarn_NONE;
1754   UShort*   addrS   = (UShort*)addr;
1755   UChar*    addrC   = (UChar*)addr;
1756   U128*     xmm     = (U128*)(addr + 160);
1757   UShort    fp_tags;
1758   Int       r, stno, i;
1759
1760   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
1761      to be byte-swapped. */
1762   vassert(host_is_little_endian());
1763
1764#  define COPY_U128(_dst,_src)                       \
1765      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1766           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1767      while (0)
1768
1769   COPY_U128( gst->guest_YMM0, xmm[0] );
1770   COPY_U128( gst->guest_YMM1, xmm[1] );
1771   COPY_U128( gst->guest_YMM2, xmm[2] );
1772   COPY_U128( gst->guest_YMM3, xmm[3] );
1773   COPY_U128( gst->guest_YMM4, xmm[4] );
1774   COPY_U128( gst->guest_YMM5, xmm[5] );
1775   COPY_U128( gst->guest_YMM6, xmm[6] );
1776   COPY_U128( gst->guest_YMM7, xmm[7] );
1777   COPY_U128( gst->guest_YMM8, xmm[8] );
1778   COPY_U128( gst->guest_YMM9, xmm[9] );
1779   COPY_U128( gst->guest_YMM10, xmm[10] );
1780   COPY_U128( gst->guest_YMM11, xmm[11] );
1781   COPY_U128( gst->guest_YMM12, xmm[12] );
1782   COPY_U128( gst->guest_YMM13, xmm[13] );
1783   COPY_U128( gst->guest_YMM14, xmm[14] );
1784   COPY_U128( gst->guest_YMM15, xmm[15] );
1785
1786#  undef COPY_U128
1787
1788   /* Copy the x87 registers out of the image, into a temporary
1789      Fpu_State struct. */
1790   for (i = 0; i < 14; i++) tmp.env[i] = 0;
1791   for (i = 0; i < 80; i++) tmp.reg[i] = 0;
1792   /* fill in tmp.reg[0..7] */
1793   for (stno = 0; stno < 8; stno++) {
1794      UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
1795      UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
1796      dstS[0] = srcS[0];
1797      dstS[1] = srcS[1];
1798      dstS[2] = srcS[2];
1799      dstS[3] = srcS[3];
1800      dstS[4] = srcS[4];
1801   }
1802   /* fill in tmp.env[0..13] */
1803   tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
1805
1806   fp_tags = 0;
1807   for (r = 0; r < 8; r++) {
      if (addrC[4] & (1<<r))
         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
      else
         fp_tags |= (3 << (2*r)); /* EMPTY */
1812   }
1813   tmp.env[FP_ENV_TAG] = fp_tags;
1814
1815   /* Now write 'tmp' into the guest state. */
1816   warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
1817
1818   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
1819                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
1820     ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
1821
1822     warnXMM = (VexEmWarn)(w64 >> 32);
1823
1824     gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
1825   }
1826
1827   /* Prefer an X87 emwarn over an XMM one, if both exist. */
1828   if (warnX87 != EmWarn_NONE)
1829      return warnX87;
1830   else
1831      return warnXMM;
1832}
1833
1834
1835/* DIRTY HELPER (writes guest state) */
1836/* Initialise the x87 FPU state as per 'finit'. */
1837void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
1838{
1839   Int i;
1840   gst->guest_FTOP = 0;
1841   for (i = 0; i < 8; i++) {
1842      gst->guest_FPTAG[i] = 0; /* empty */
1843      gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
1844   }
1845   gst->guest_FPROUND = (ULong)Irrm_NEAREST;
1846   gst->guest_FC3210  = 0;
1847}
1848
1849
1850/* CALLED FROM GENERATED CODE */
1851/* DIRTY HELPER (reads guest memory) */
1852ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
1853{
1854   ULong f64;
1855   convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
1856   return f64;
1857}
1858
1859/* CALLED FROM GENERATED CODE */
1860/* DIRTY HELPER (writes guest memory) */
1861void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
1862{
1863   convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
1864}
1865
1866
1867/* CALLED FROM GENERATED CODE */
1868/* CLEAN HELPER */
1869/* mxcsr[15:0] contains a SSE native format MXCSR value.
1870   Extract from it the required SSEROUND value and any resulting
1871   emulation warning, and return (warn << 32) | sseround value.
1872*/
1873ULong amd64g_check_ldmxcsr ( ULong mxcsr )
1874{
1875   /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
1876   /* NOTE, encoded exactly as per enum IRRoundingMode. */
1877   ULong rmode = (mxcsr >> 13) & 3;
1878
1879   /* Detect any required emulation warnings. */
1880   VexEmWarn ew = EmWarn_NONE;
1881
1882   if ((mxcsr & 0x1F80) != 0x1F80) {
1883      /* unmasked exceptions! */
1884      ew = EmWarn_X86_sseExns;
1885   }
1886   else
1887   if (mxcsr & (1<<15)) {
1888      /* FZ is set */
1889      ew = EmWarn_X86_fz;
1890   }
1891   else
1892   if (mxcsr & (1<<6)) {
1893      /* DAZ is set */
1894      ew = EmWarn_X86_daz;
1895   }
1896
1897   return (((ULong)ew) << 32) | ((ULong)rmode);
1898}
1899
1900
1901/* CALLED FROM GENERATED CODE */
1902/* CLEAN HELPER */
1903/* Given sseround as an IRRoundingMode value, create a suitable SSE
1904   native format MXCSR value. */
1905ULong amd64g_create_mxcsr ( ULong sseround )
1906{
1907   sseround &= 3;
1908   return 0x1F80 | (sseround << 13);
1909}
1910
1911
1912/* CLEAN HELPER */
1913/* fpucw[15:0] contains a x87 native format FPU control word.
1914   Extract from it the required FPROUND value and any resulting
1915   emulation warning, and return (warn << 32) | fpround value.
1916*/
1917ULong amd64g_check_fldcw ( ULong fpucw )
1918{
1919   /* Decide on a rounding mode.  fpucw[11:10] holds it. */
1920   /* NOTE, encoded exactly as per enum IRRoundingMode. */
1921   ULong rmode = (fpucw >> 10) & 3;
1922
1923   /* Detect any required emulation warnings. */
1924   VexEmWarn ew = EmWarn_NONE;
1925
1926   if ((fpucw & 0x3F) != 0x3F) {
1927      /* unmasked exceptions! */
1928      ew = EmWarn_X86_x87exns;
1929   }
1930   else
1931   if (((fpucw >> 8) & 3) != 3) {
1932      /* unsupported precision */
1933      ew = EmWarn_X86_x87precision;
1934   }
1935
1936   return (((ULong)ew) << 32) | ((ULong)rmode);
1937}
1938
1939
1940/* CLEAN HELPER */
1941/* Given fpround as an IRRoundingMode value, create a suitable x87
1942   native format FPU control word. */
1943ULong amd64g_create_fpucw ( ULong fpround )
1944{
1945   fpround &= 3;
1946   return 0x037F | (fpround << 10);
1947}
1948
1949
1950/* This is used to implement 'fldenv'.
1951   Reads 28 bytes at x87_state[0 .. 27]. */
1952/* CALLED FROM GENERATED CODE */
1953/* DIRTY HELPER */
1954VexEmWarn amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
1955                                      /*IN*/HWord x87_state)
1956{
1957   return do_put_x87( False, (UChar*)x87_state, vex_state );
1958}
1959
1960
1961/* CALLED FROM GENERATED CODE */
1962/* DIRTY HELPER */
1963/* Create an x87 FPU env from the guest state, as close as we can
1964   approximate it.  Writes 28 bytes at x87_state[0..27]. */
1965void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
1966                                 /*OUT*/HWord x87_state )
1967{
1968   Int        i, stno, preg;
1969   UInt       tagw;
1970   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1971   Fpu_State* x87     = (Fpu_State*)x87_state;
1972   UInt       ftop    = vex_state->guest_FTOP;
1973   ULong      c3210   = vex_state->guest_FC3210;
1974
1975   for (i = 0; i < 14; i++)
1976      x87->env[i] = 0;
1977
1978   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1979   x87->env[FP_ENV_STAT]
1980      = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
1981   x87->env[FP_ENV_CTRL]
1982      = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
1983
1984   /* Compute the x87 tag word. */
1985   tagw = 0;
1986   for (stno = 0; stno < 8; stno++) {
1987      preg = (stno + ftop) & 7;
1988      if (vexTags[preg] == 0) {
1989         /* register is empty */
1990         tagw |= (3 << (2*preg));
1991      } else {
1992         /* register is full. */
1993         tagw |= (0 << (2*preg));
1994      }
1995   }
1996   x87->env[FP_ENV_TAG] = toUShort(tagw);
1997
   /* We don't dump the x87 registers, though. */
1999}
2000
2001
2002/* This is used to implement 'fnsave'.
2003   Writes 108 bytes at x87_state[0 .. 107]. */
2004/* CALLED FROM GENERATED CODE */
2005/* DIRTY HELPER */
2006void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2007                                 /*OUT*/HWord x87_state)
2008{
2009   do_get_x87( vex_state, (UChar*)x87_state );
2010}
2011
2012
2013/* This is used to implement 'fnsaves'.
2014   Writes 94 bytes at x87_state[0 .. 93]. */
2015/* CALLED FROM GENERATED CODE */
2016/* DIRTY HELPER */
2017void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2018                                  /*OUT*/HWord x87_state)
2019{
2020   Int           i, stno, preg;
2021   UInt          tagw;
2022   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2023   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2024   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2025   UInt          ftop    = vex_state->guest_FTOP;
2026   UInt          c3210   = vex_state->guest_FC3210;
2027
2028   for (i = 0; i < 7; i++)
2029      x87->env[i] = 0;
2030
2031   x87->env[FPS_ENV_STAT]
2032      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2033   x87->env[FPS_ENV_CTRL]
2034      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2035
2036   /* Dump the register stack in ST order. */
2037   tagw = 0;
2038   for (stno = 0; stno < 8; stno++) {
2039      preg = (stno + ftop) & 7;
2040      if (vexTags[preg] == 0) {
2041         /* register is empty */
2042         tagw |= (3 << (2*preg));
2043         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2044                                 &x87->reg[10*stno] );
2045      } else {
2046         /* register is full. */
2047         tagw |= (0 << (2*preg));
2048         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2049                                 &x87->reg[10*stno] );
2050      }
2051   }
2052   x87->env[FPS_ENV_TAG] = toUShort(tagw);
2053}
2054
2055
2056/* This is used to implement 'frstor'.
2057   Reads 108 bytes at x87_state[0 .. 107]. */
2058/* CALLED FROM GENERATED CODE */
2059/* DIRTY HELPER */
2060VexEmWarn amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2061                                      /*IN*/HWord x87_state)
2062{
2063   return do_put_x87( True, (UChar*)x87_state, vex_state );
2064}
2065
2066
2067/* This is used to implement 'frstors'.
2068   Reads 94 bytes at x87_state[0 .. 93]. */
2069/* CALLED FROM GENERATED CODE */
2070/* DIRTY HELPER */
2071VexEmWarn amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2072                                       /*IN*/HWord x87_state)
2073{
2074   Int           stno, preg;
2075   UInt          tag;
2076   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2077   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2078   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2079   UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2080   UInt          tagw    = x87->env[FPS_ENV_TAG];
2081   UInt          fpucw   = x87->env[FPS_ENV_CTRL];
2082   UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
2083   VexEmWarn     ew;
2084   UInt          fpround;
2085   ULong         pair;
2086
2087   /* Copy registers and tags */
2088   for (stno = 0; stno < 8; stno++) {
2089      preg = (stno + ftop) & 7;
2090      tag = (tagw >> (2*preg)) & 3;
2091      if (tag == 3) {
2092         /* register is empty */
2093         /* hmm, if it's empty, does it still get written?  Probably
2094            safer to say it does.  If we don't, memcheck could get out
2095            of sync, in that it thinks all FP registers are defined by
2096            this helper, but in reality some have not been updated. */
2097         vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2098         vexTags[preg] = 0;
2099      } else {
2100         /* register is non-empty */
2101         convert_f80le_to_f64le( &x87->reg[10*stno],
2102                                 (UChar*)&vexRegs[preg] );
2103         vexTags[preg] = 1;
2104      }
2105   }
2106
2107   /* stack pointer */
2108   vex_state->guest_FTOP = ftop;
2109
2110   /* status word */
2111   vex_state->guest_FC3210 = c3210;
2112
2113   /* handle the control word, setting FPROUND and detecting any
2114      emulation warnings. */
2115   pair    = amd64g_check_fldcw ( (ULong)fpucw );
2116   fpround = (UInt)pair & 0xFFFFFFFFULL;
2117   ew      = (VexEmWarn)(pair >> 32);
2118
2119   vex_state->guest_FPROUND = fpround & 3;
2120
2121   /* emulation warnings --> caller */
2122   return ew;
2123}
2124
2125
2126/*---------------------------------------------------------------*/
2127/*--- Misc integer helpers, including rotates and CPUID.      ---*/
2128/*---------------------------------------------------------------*/
2129
2130/* Claim to be the following CPU, which is probably representative of
2131   the lowliest (earliest) amd64 offerings.  It can do neither sse3
2132   nor cx16.
2133
2134   vendor_id       : AuthenticAMD
2135   cpu family      : 15
2136   model           : 5
2137   model name      : AMD Opteron (tm) Processor 848
2138   stepping        : 10
2139   cpu MHz         : 1797.682
2140   cache size      : 1024 KB
2141   fpu             : yes
2142   fpu_exception   : yes
2143   cpuid level     : 1
2144   wp              : yes
2145   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2146                     mtrr pge mca cmov pat pse36 clflush mmx fxsr
2147                     sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2148   bogomips        : 3600.62
2149   TLB size        : 1088 4K pages
2150   clflush size    : 64
2151   cache_alignment : 64
2152   address sizes   : 40 bits physical, 48 bits virtual
2153   power management: ts fid vid ttp
2154
2155   2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2156   we don't support them.  See #291568.  3dnow is 80000001.EDX.31
2157   and 3dnowext is 80000001.EDX.30.
2158*/
2159void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2160{
2161#  define SET_ABCD(_a,_b,_c,_d)                \
2162      do { st->guest_RAX = (ULong)(_a);        \
2163           st->guest_RBX = (ULong)(_b);        \
2164           st->guest_RCX = (ULong)(_c);        \
2165           st->guest_RDX = (ULong)(_d);        \
2166      } while (0)
2167
2168   switch (0xFFFFFFFF & st->guest_RAX) {
2169      case 0x00000000:
2170         SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2171         break;
2172      case 0x00000001:
2173         SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2174         break;
2175      case 0x80000000:
2176         SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2177         break;
2178      case 0x80000001:
2179         /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
2180            the original it-is-supported value that the h/w provides.
2181            See #291568. */
2182         SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2183                                                      0x21d3fbff);
2184         break;
2185      case 0x80000002:
2186         SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2187         break;
2188      case 0x80000003:
2189         SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2190         break;
2191      case 0x80000004:
2192         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2193         break;
2194      case 0x80000005:
2195         SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2196         break;
2197      case 0x80000006:
2198         SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2199         break;
2200      case 0x80000007:
2201         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2202         break;
2203      case 0x80000008:
2204         SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2205         break;
2206      default:
2207         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2208         break;
2209   }
2210#  undef SET_ABCD
2211}
2212
2213
2214/* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2215   capable.
2216
2217   vendor_id       : GenuineIntel
2218   cpu family      : 6
2219   model           : 15
2220   model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2221   stepping        : 6
2222   cpu MHz         : 2394.000
2223   cache size      : 4096 KB
2224   physical id     : 0
2225   siblings        : 2
2226   core id         : 0
2227   cpu cores       : 2
2228   fpu             : yes
2229   fpu_exception   : yes
2230   cpuid level     : 10
2231   wp              : yes
2232   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2233                     mtrr pge mca cmov pat pse36 clflush dts acpi
2234                     mmx fxsr sse sse2 ss ht tm syscall nx lm
2235                     constant_tsc pni monitor ds_cpl vmx est tm2
2236                     cx16 xtpr lahf_lm
2237   bogomips        : 4798.78
2238   clflush size    : 64
2239   cache_alignment : 64
2240   address sizes   : 36 bits physical, 48 bits virtual
2241   power management:
2242*/
2243void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2244{
2245#  define SET_ABCD(_a,_b,_c,_d)                \
2246      do { st->guest_RAX = (ULong)(_a);        \
2247           st->guest_RBX = (ULong)(_b);        \
2248           st->guest_RCX = (ULong)(_c);        \
2249           st->guest_RDX = (ULong)(_d);        \
2250      } while (0)
2251
2252   switch (0xFFFFFFFF & st->guest_RAX) {
2253      case 0x00000000:
2254         SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2255         break;
2256      case 0x00000001:
2257         SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2258         break;
2259      case 0x00000002:
2260         SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2261         break;
2262      case 0x00000003:
2263         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2264         break;
2265      case 0x00000004: {
2266         switch (0xFFFFFFFF & st->guest_RCX) {
2267            case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2268                                      0x0000003f, 0x00000001); break;
2269            case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2270                                      0x0000003f, 0x00000001); break;
2271            case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2272                                      0x00000fff, 0x00000001); break;
2273            default:         SET_ABCD(0x00000000, 0x00000000,
2274                                      0x00000000, 0x00000000); break;
2275         }
2276         break;
2277      }
2278      case 0x00000005:
2279         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2280         break;
2281      case 0x00000006:
2282         SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2283         break;
2284      case 0x00000007:
2285         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2286         break;
2287      case 0x00000008:
2288         SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2289         break;
2290      case 0x00000009:
2291         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2292         break;
2293      case 0x0000000a:
2294      unhandled_eax_value:
2295         SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2296         break;
2297      case 0x80000000:
2298         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2299         break;
2300      case 0x80000001:
2301         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2302         break;
2303      case 0x80000002:
2304         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2305         break;
2306      case 0x80000003:
2307         SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2308         break;
2309      case 0x80000004:
2310         SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2311         break;
2312      case 0x80000005:
2313         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2314         break;
2315      case 0x80000006:
2316         SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2317         break;
2318      case 0x80000007:
2319         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2320         break;
2321      case 0x80000008:
2322         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2323         break;
2324      default:
2325         goto unhandled_eax_value;
2326   }
2327#  undef SET_ABCD
2328}
2329
2330
2331/* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2332   capable.
2333
2334   vendor_id       : GenuineIntel
2335   cpu family      : 6
2336   model           : 37
2337   model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
2338   stepping        : 2
2339   cpu MHz         : 3334.000
2340   cache size      : 4096 KB
2341   physical id     : 0
2342   siblings        : 4
2343   core id         : 0
2344   cpu cores       : 2
2345   apicid          : 0
2346   initial apicid  : 0
2347   fpu             : yes
2348   fpu_exception   : yes
2349   cpuid level     : 11
2350   wp              : yes
2351   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2352                     mtrr pge mca cmov pat pse36 clflush dts acpi
2353                     mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2354                     lm constant_tsc arch_perfmon pebs bts rep_good
2355                     xtopology nonstop_tsc aperfmperf pni pclmulqdq
2356                     dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2357                     xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2358                     arat tpr_shadow vnmi flexpriority ept vpid
2359   bogomips        : 6957.57
2360   clflush size    : 64
2361   cache_alignment : 64
2362   address sizes   : 36 bits physical, 48 bits virtual
2363   power management:
2364*/
2365void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2366{
2367#  define SET_ABCD(_a,_b,_c,_d)                \
2368      do { st->guest_RAX = (ULong)(_a);        \
2369           st->guest_RBX = (ULong)(_b);        \
2370           st->guest_RCX = (ULong)(_c);        \
2371           st->guest_RDX = (ULong)(_d);        \
2372      } while (0)
2373
2374   UInt old_eax = (UInt)st->guest_RAX;
2375   UInt old_ecx = (UInt)st->guest_RCX;
2376
2377   switch (old_eax) {
2378      case 0x00000000:
2379         SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2380         break;
2381      case 0x00000001:
2382         SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
2383         break;
2384      case 0x00000002:
2385         SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2386         break;
2387      case 0x00000003:
2388         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2389         break;
2390      case 0x00000004:
2391         switch (old_ecx) {
2392            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2393                                      0x0000003f, 0x00000000); break;
2394            case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2395                                      0x0000007f, 0x00000000); break;
2396            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2397                                      0x000001ff, 0x00000000); break;
2398            case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2399                                      0x00000fff, 0x00000002); break;
2400            default:         SET_ABCD(0x00000000, 0x00000000,
2401                                      0x00000000, 0x00000000); break;
2402         }
2403         break;
2404      case 0x00000005:
2405         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2406         break;
2407      case 0x00000006:
2408         SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2409         break;
2410      case 0x00000007:
2411         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2412         break;
2413      case 0x00000008:
2414         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2415         break;
2416      case 0x00000009:
2417         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2418         break;
2419      case 0x0000000a:
2420         SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2421         break;
2422      case 0x0000000b:
2423         switch (old_ecx) {
2424            case 0x00000000:
2425               SET_ABCD(0x00000001, 0x00000002,
2426                        0x00000100, 0x00000000); break;
2427            case 0x00000001:
2428               SET_ABCD(0x00000004, 0x00000004,
2429                        0x00000201, 0x00000000); break;
2430            default:
2431               SET_ABCD(0x00000000, 0x00000000,
2432                        old_ecx,    0x00000000); break;
2433         }
2434         break;
2435      case 0x0000000c:
2436         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2437         break;
2438      case 0x0000000d:
2439         switch (old_ecx) {
2440            case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
2441                                      0x00000100, 0x00000000); break;
2442            case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
2443                                      0x00000201, 0x00000000); break;
2444            default:         SET_ABCD(0x00000000, 0x00000000,
2445                                      old_ecx,    0x00000000); break;
2446         }
2447         break;
2448      case 0x80000000:
2449         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2450         break;
2451      case 0x80000001:
2452         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2453         break;
2454      case 0x80000002:
2455         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2456         break;
2457      case 0x80000003:
2458         SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
2459         break;
2460      case 0x80000004:
2461         SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
2462         break;
2463      case 0x80000005:
2464         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2465         break;
2466      case 0x80000006:
2467         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2468         break;
2469      case 0x80000007:
2470         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2471         break;
2472      case 0x80000008:
2473         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2474         break;
2475      default:
2476         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2477         break;
2478   }
2479#  undef SET_ABCD
2480}
2481
2482
2483/* Claim to be the following CPU (4 x ...), which is AVX and cx16
2484   capable.
2485
2486   vendor_id       : GenuineIntel
2487   cpu family      : 6
2488   model           : 42
2489   model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
2490   stepping        : 7
2491   cpu MHz         : 1600.000
2492   cache size      : 6144 KB
2493   physical id     : 0
2494   siblings        : 4
2495   core id         : 3
2496   cpu cores       : 4
2497   apicid          : 6
2498   initial apicid  : 6
2499   fpu             : yes
2500   fpu_exception   : yes
2501   cpuid level     : 13
2502   wp              : yes
2503   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2504                     mtrr pge mca cmov pat pse36 clflush dts acpi
2505                     mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2506                     lm constant_tsc arch_perfmon pebs bts rep_good
2507                     nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
2508                     dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
2509                     xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
2510                     lahf_lm ida arat epb xsaveopt pln pts dts
2511                     tpr_shadow vnmi flexpriority ept vpid
2512
2513   bogomips        : 5768.94
2514   clflush size    : 64
2515   cache_alignment : 64
2516   address sizes   : 36 bits physical, 48 bits virtual
2517   power management:
2518*/
2519void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
2520{
2521#  define SET_ABCD(_a,_b,_c,_d)                \
2522      do { st->guest_RAX = (ULong)(_a);        \
2523           st->guest_RBX = (ULong)(_b);        \
2524           st->guest_RCX = (ULong)(_c);        \
2525           st->guest_RDX = (ULong)(_d);        \
2526      } while (0)
2527
2528   UInt old_eax = (UInt)st->guest_RAX;
2529   UInt old_ecx = (UInt)st->guest_RCX;
2530
2531   switch (old_eax) {
2532      case 0x00000000:
2533         SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
2534         break;
2535      case 0x00000001:
2536         SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
2537         break;
2538      case 0x00000002:
2539         SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
2540         break;
2541      case 0x00000003:
2542         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2543         break;
2544      case 0x00000004:
2545         switch (old_ecx) {
2546            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2547                                      0x0000003f, 0x00000000); break;
2548            case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
2549                                      0x0000003f, 0x00000000); break;
2550            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2551                                      0x000001ff, 0x00000000); break;
2552            case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
2553                                      0x00001fff, 0x00000006); break;
2554            default:         SET_ABCD(0x00000000, 0x00000000,
2555                                      0x00000000, 0x00000000); break;
2556         }
2557         break;
2558      case 0x00000005:
2559         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2560         break;
2561      case 0x00000006:
2562         SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
2563         break;
2564      case 0x00000007:
2565         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2566         break;
2567      case 0x00000008:
2568         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2569         break;
2570      case 0x00000009:
2571         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2572         break;
2573      case 0x0000000a:
2574         SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
2575         break;
2576      case 0x0000000b:
2577         switch (old_ecx) {
2578            case 0x00000000:
2579               SET_ABCD(0x00000001, 0x00000001,
2580                        0x00000100, 0x00000000); break;
2581            case 0x00000001:
2582               SET_ABCD(0x00000004, 0x00000004,
2583                        0x00000201, 0x00000000); break;
2584            default:
2585               SET_ABCD(0x00000000, 0x00000000,
2586                        old_ecx,    0x00000000); break;
2587         }
2588         break;
2589      case 0x0000000c:
2590         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2591         break;
2592      case 0x0000000d:
2593         switch (old_ecx) {
2594            case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
2595                                      0x00000340, 0x00000000); break;
2596            case 0x00000001: SET_ABCD(0x00000001, 0x00000000,
2597                                      0x00000000, 0x00000000); break;
2598            case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
2599                                      0x00000000, 0x00000000); break;
2600            default:         SET_ABCD(0x00000000, 0x00000000,
2601                                      0x00000000, 0x00000000); break;
2602         }
2603         break;
2604      case 0x0000000e:
2605         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2606         break;
2607      case 0x0000000f:
2608         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2609         break;
2610      case 0x80000000:
2611         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2612         break;
2613      case 0x80000001:
2614         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2615         break;
2616      case 0x80000002:
2617         SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
2618         break;
2619      case 0x80000003:
2620         SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
2621         break;
2622      case 0x80000004:
2623         SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
2624         break;
2625      case 0x80000005:
2626         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2627         break;
2628      case 0x80000006:
2629         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2630         break;
2631      case 0x80000007:
2632         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2633         break;
2634      case 0x80000008:
2635         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2636         break;
2637      default:
2638         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2639         break;
2640   }
2641#  undef SET_ABCD
2642}
2643
2644
2645ULong amd64g_calculate_RCR ( ULong arg,
2646                             ULong rot_amt,
2647                             ULong rflags_in,
2648                             Long  szIN )
2649{
2650   Bool  wantRflags = toBool(szIN < 0);
2651   ULong sz         = wantRflags ? (-szIN) : szIN;
2652   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2653   ULong cf=0, of=0, tempcf;
2654
2655   switch (sz) {
2656      case 8:
2657         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2658         of        = ((arg >> 63) ^ cf) & 1;
2659         while (tempCOUNT > 0) {
2660            tempcf = arg & 1;
2661            arg    = (arg >> 1) | (cf << 63);
2662            cf     = tempcf;
2663            tempCOUNT--;
2664         }
2665         break;
2666      case 4:
2667         while (tempCOUNT >= 33) tempCOUNT -= 33;
2668         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2669         of        = ((arg >> 31) ^ cf) & 1;
2670         while (tempCOUNT > 0) {
2671            tempcf = arg & 1;
2672            arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
2673            cf     = tempcf;
2674            tempCOUNT--;
2675         }
2676         break;
2677      case 2:
2678         while (tempCOUNT >= 17) tempCOUNT -= 17;
2679         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2680         of        = ((arg >> 15) ^ cf) & 1;
2681         while (tempCOUNT > 0) {
2682            tempcf = arg & 1;
2683            arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
2684            cf     = tempcf;
2685            tempCOUNT--;
2686         }
2687         break;
2688      case 1:
2689         while (tempCOUNT >= 9) tempCOUNT -= 9;
2690         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2691         of        = ((arg >> 7) ^ cf) & 1;
2692         while (tempCOUNT > 0) {
2693            tempcf = arg & 1;
2694            arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
2695            cf     = tempcf;
2696            tempCOUNT--;
2697         }
2698         break;
2699      default:
2700         vpanic("calculate_RCR(amd64g): invalid size");
2701   }
2702
2703   cf &= 1;
2704   of &= 1;
2705   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2706   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2707
2708   /* caller can ask to have back either the resulting flags or
2709      resulting value, but not both */
2710   return wantRflags ? rflags_in : arg;
2711}
2712
2713ULong amd64g_calculate_RCL ( ULong arg,
2714                             ULong rot_amt,
2715                             ULong rflags_in,
2716                             Long  szIN )
2717{
2718   Bool  wantRflags = toBool(szIN < 0);
2719   ULong sz         = wantRflags ? (-szIN) : szIN;
2720   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2721   ULong cf=0, of=0, tempcf;
2722
2723   switch (sz) {
2724      case 8:
2725         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2726         while (tempCOUNT > 0) {
2727            tempcf = (arg >> 63) & 1;
2728            arg    = (arg << 1) | (cf & 1);
2729            cf     = tempcf;
2730            tempCOUNT--;
2731         }
2732         of = ((arg >> 63) ^ cf) & 1;
2733         break;
2734      case 4:
2735         while (tempCOUNT >= 33) tempCOUNT -= 33;
2736         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2737         while (tempCOUNT > 0) {
2738            tempcf = (arg >> 31) & 1;
2739            arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
2740            cf     = tempcf;
2741            tempCOUNT--;
2742         }
2743         of = ((arg >> 31) ^ cf) & 1;
2744         break;
2745      case 2:
2746         while (tempCOUNT >= 17) tempCOUNT -= 17;
2747         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2748         while (tempCOUNT > 0) {
2749            tempcf = (arg >> 15) & 1;
2750            arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
2751            cf     = tempcf;
2752            tempCOUNT--;
2753         }
2754         of = ((arg >> 15) ^ cf) & 1;
2755         break;
2756      case 1:
2757         while (tempCOUNT >= 9) tempCOUNT -= 9;
2758         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2759         while (tempCOUNT > 0) {
2760            tempcf = (arg >> 7) & 1;
2761            arg    = 0xFFULL & ((arg << 1) | (cf & 1));
2762            cf     = tempcf;
2763            tempCOUNT--;
2764         }
2765         of = ((arg >> 7) ^ cf) & 1;
2766         break;
2767      default:
2768         vpanic("calculate_RCL(amd64g): invalid size");
2769   }
2770
2771   cf &= 1;
2772   of &= 1;
2773   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2774   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2775
2776   return wantRflags ? rflags_in : arg;
2777}
2778
2779/* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
2780 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
2781 */
2782ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
2783{
   ULong hi, lo, tmp, A[16];
2785
2786   A[0] = 0;            A[1] = a;
2787   A[2] = A[1] << 1;    A[3] = A[2] ^ a;
2788   A[4] = A[2] << 1;    A[5] = A[4] ^ a;
2789   A[6] = A[3] << 1;    A[7] = A[6] ^ a;
2790   A[8] = A[4] << 1;    A[9] = A[8] ^ a;
2791   A[10] = A[5] << 1;   A[11] = A[10] ^ a;
2792   A[12] = A[6] << 1;   A[13] = A[12] ^ a;
2793   A[14] = A[7] << 1;   A[15] = A[14] ^ a;
2794
2795   lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
2796   hi = lo >> 56;
2797   lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
2798   hi = (hi << 8) | (lo >> 56);
2799   lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
2800   hi = (hi << 8) | (lo >> 56);
2801   lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
2802   hi = (hi << 8) | (lo >> 56);
2803   lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
2804   hi = (hi << 8) | (lo >> 56);
2805   lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
2806   hi = (hi << 8) | (lo >> 56);
2807   lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
2808   hi = (hi << 8) | (lo >> 56);
2809   lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
2810
   ULong m0 = -1;
   m0 /= 255;   /* m0 == 0x0101010101010101ULL: each byte is 0x01 */
2813   tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
2814   tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
2815   tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
2816   tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
2817   tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
2818   tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
2819   tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
2820
2821   return which ? hi : lo;
2822}
2823
2824
2825/* CALLED FROM GENERATED CODE */
2826/* DIRTY HELPER (non-referentially-transparent) */
2827/* Horrible hack.  On non-amd64 platforms, return 1. */
2828ULong amd64g_dirtyhelper_RDTSC ( void )
2829{
2830#  if defined(__x86_64__)
2831   UInt  eax, edx;
2832   __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
2833   return (((ULong)edx) << 32) | ((ULong)eax);
2834#  else
2835   return 1ULL;
2836#  endif
2837}
2838
2839
2840/* CALLED FROM GENERATED CODE */
2841/* DIRTY HELPER (non-referentially-transparent) */
2842/* Horrible hack.  On non-amd64 platforms, return 0. */
2843ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
2844{
2845#  if defined(__x86_64__)
2846   ULong r = 0;
2847   portno &= 0xFFFF;
2848   switch (sz) {
2849      case 4:
2850         __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
2851                              : "=a" (r) : "Nd" (portno));
         break;
2853      case 2:
2854         __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
2855                              : "=a" (r) : "Nd" (portno));
         break;
2857      case 1:
2858         __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
2859                              : "=a" (r) : "Nd" (portno));
         break;
2861      default:
2862         break; /* note: no 64-bit version of insn exists */
2863   }
2864   return r;
2865#  else
2866   return 0;
2867#  endif
2868}
2869
2870
2871/* CALLED FROM GENERATED CODE */
2872/* DIRTY HELPER (non-referentially-transparent) */
2873/* Horrible hack.  On non-amd64 platforms, do nothing. */
2874void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
2875{
2876#  if defined(__x86_64__)
2877   portno &= 0xFFFF;
2878   switch (sz) {
2879      case 4:
2880         __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
2881                              : : "a" (data), "Nd" (portno));
         break;
2883      case 2:
2884         __asm__ __volatile__("outw %w0, %w1"
2885                              : : "a" (data), "Nd" (portno));
         break;
2887      case 1:
2888         __asm__ __volatile__("outb %b0, %w1"
2889                              : : "a" (data), "Nd" (portno));
         break;
2891      default:
2892         break; /* note: no 64-bit version of insn exists */
2893   }
2894#  else
2895   /* do nothing */
2896#  endif
2897}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-amd64 platforms, write zeroes to the
   destination instead. */
/* op = 0: call the native SGDT instruction.
   op = 1: call the native SIDT instruction.
*/
void amd64g_dirtyhelper_SxDT ( void *address, ULong op )
{
#  if defined(__x86_64__)
   switch (op) {
      case 0:
         __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
         break;
      case 1:
         __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
         break;
      default:
         vpanic("amd64g_dirtyhelper_SxDT");
   }
#  else
   /* zero out the 10-byte (limit:base) result */
   UChar* p = (UChar*)address;
   p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
   p[6] = p[7] = p[8] = p[9] = 0;
#  endif
}

/*---------------------------------------------------------------*/
/*--- Helpers for MMX/SSE/SSE2.                               ---*/
/*---------------------------------------------------------------*/

static inline UChar abdU8 ( UChar xx, UChar yy ) {
   return toUChar(xx>yy ? xx-yy : yy-xx);
}

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(hi32 >> 16);
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUShort(lo32 >> 16);
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUShort(lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 24);
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 16);
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 8);
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(hi32 >> 0);
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 24);
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 16);
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 8);
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = toUInt(w64);
   return toUChar(lo32 >> 0);
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
{
   return
      mk32x2(
         (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
            + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
         (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
            + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
      );
}
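
/* A worked example of the above, as a minimal sketch (guarded out,
   since it is illustrative only): with 16-bit lanes s3..s0 = 4,3,2,1
   in xx and 8,7,6,5 in yy, the high 32-bit lane of the result is
   4*8 + 3*7 = 53 and the low lane is 2*6 + 1*5 = 17. */
#if 0
static void example_pmaddwd ( void )
{
   ULong xx = 0x0004000300020001ULL;
   ULong yy = 0x0008000700060005ULL;
   ULong r  = amd64g_calculate_mmx_pmaddwd(xx, yy);
   vassert(r == 0x0000003500000011ULL);
}
#endif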

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_mmx_pmovmskb ( ULong xx )
{
   ULong r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
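
/* Bit k of the result is the sign bit (bit 7) of byte k of xx.  A
   small sketch of that mapping, illustrative only: */
#if 0
static void example_pmovmskb ( void )
{
   /* bytes 7 and 0 have their sign bits set, so the mask is
      (1<<7) | (1<<0) = 0x81. */
   vassert(amd64g_calculate_mmx_pmovmskb(0x8000000000000080ULL) == 0x81);
}
#endif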

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
{
   UInt t = 0;
   t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   t &= 0xFFFF;
   return (ULong)t;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
{
   ULong rHi8 = amd64g_calculate_mmx_pmovmskb ( w64hi );
   ULong rLo8 = amd64g_calculate_mmx_pmovmskb ( w64lo );
   return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
{
   UShort t, min;
   UInt   idx;
   t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
   t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
   t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
   t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
   t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
   t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
   t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
   t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
   return ((ULong)(idx << 16)) | ((ULong)min);
}
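
/* The result packs the minimum unsigned 16-bit lane into bits 15:0
   and the index of that lane into bits 18:16.  An illustrative
   sketch: */
#if 0
static void example_phminposuw ( void )
{
   /* lanes (low to high) are 9,8,7,6,5,4,3,2; the minimum, 2, lives
      in lane 7, so the result is (7 << 16) | 2. */
   ULong r = amd64g_calculate_sse_phminposuw(0x0006000700080009ULL,
                                             0x0002000300040005ULL);
   vassert(r == 0x00070002ULL);
}
#endif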

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
{
   UInt  i;
   ULong crc = (b & 0xFFULL) ^ crcIn;
   for (i = 0; i < 8; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
{
   UInt  i;
   ULong crc = (w & 0xFFFFULL) ^ crcIn;
   for (i = 0; i < 16; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
{
   UInt i;
   ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
   for (i = 0; i < 32; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
{
   ULong crc = amd64g_calc_crc32l(crcIn, q);
   return amd64g_calc_crc32l(crc, q >> 32);
}
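
/* These four helpers implement the bit-reflected CRC32-C step
   (reflected constant 0x82f63b78) used by the SSE4.2 crc32
   instruction; note how crc32q above is just two crc32l steps.  As a
   usage sketch, accumulating one byte at a time over a buffer gives
   the same result a chain of crc32b instructions would: */
#if 0
static ULong crc32c_accumulate ( ULong crc, const UChar* buf, UInt len )
{
   UInt i;
   for (i = 0; i < len; i++)
      crc = amd64g_calc_crc32b(crc, buf[i]);
   return crc;
}
#endif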


/* .. helper for next fn .. */
static inline ULong sad_8x4 ( ULong xx, ULong yy )
{
   UInt t = 0;
   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   return (ULong)t;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
                            ULong dHi, ULong dLo,
                            ULong imm_and_return_control_bit )
{
   UInt imm8     = imm_and_return_control_bit & 7;
   Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
   UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
   UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
   /* For src we only need 32 bits, so get them into the
      lower half of a 64 bit word. */
   ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
   /* For dst we need to get hold of 56 bits (7 bytes) from a total of
      11 bytes.  If calculating the low part of the result, need bytes
      dstOffsL * 4 + (0 .. 6); if calculating the high part,
      dstOffsL * 4 + (4 .. 10). */
   ULong dst;
   /* dstOffsL = 0, Lo  ->  0 .. 6
      dstOffsL = 1, Lo  ->  4 .. 10
      dstOffsL = 0, Hi  ->  4 .. 10
      dstOffsL = 1, Hi  ->  8 .. 14
   */
   if (calcHi && dstOffsL) {
      /* 8 .. 14 */
      dst = dHi & 0x00FFFFFFFFFFFFFFULL;
   }
   else if (!calcHi && !dstOffsL) {
      /* 0 .. 6 */
      dst = dLo & 0x00FFFFFFFFFFFFFFULL;
   }
   else {
      /* 4 .. 10 */
      dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
   }
   ULong r0  = sad_8x4( dst >>  0, src );
   ULong r1  = sad_8x4( dst >>  8, src );
   ULong r2  = sad_8x4( dst >> 16, src );
   ULong r3  = sad_8x4( dst >> 24, src );
   ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
   return res;
}
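
/* An illustrative decode of the control word, matching the unpacking
   above: bits 1:0 select the 32-bit source chunk, bit 2 selects the
   destination offset, and bit 7 (the return-control bit supplied by
   the caller, not part of the architectural imm8) selects the high or
   low 64-bit half of the result. */
#if 0
static void example_mpsadbw_decode ( void )
{
   ULong ctrl = 0x85;               /* imm8 bits = 101b, plus bit 7 */
   vassert((ctrl & 3) == 1);        /* srcOffsL = 1 */
   vassert(((ctrl >> 2) & 1) == 1); /* dstOffsL = 1 */
   vassert(((ctrl >> 7) & 1) == 1); /* calcHi   = 1 */
}
#endif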

/*---------------------------------------------------------------*/
/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
/*---------------------------------------------------------------*/

static UInt zmask_from_V128 ( V128* arg )
{
   UInt i, res = 0;
   for (i = 0; i < 16; i++) {
      res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
   }
   return res;
}

static UInt zmask_from_V128_wide ( V128* arg )
{
   UInt i, res = 0;
   for (i = 0; i < 8; i++) {
      res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
   }
   return res;
}

/* Helps with PCMP{I,E}STR{I,M}.

   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really;
   it could be a clean helper, except that we can't pass two V128s by
   value to a clean helper, nor have one returned.)  Reads guest
   state, writes to guest state for the xSTRM cases, makes no other
   memory accesses, and is a pure function.

   opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte, so
   the callee knows which I/E and I/M variant it is dealing with and
   what the specific operation is.  The 4th byte of the opcode is in
   the range 0x60 to 0x63:
       istri  66 0F 3A 63
       istrm  66 0F 3A 62
       estri  66 0F 3A 61
       estrm  66 0F 3A 60

   gstOffL and gstOffR are the guest state offsets for the two XMM
   register inputs.  We never have to deal with the memory case since
   that is handled by pre-loading the relevant value into the fake
   XMM16 register.

   For ESTRx variants, edxIN and eaxIN hold the values of those two
   registers.

   In all cases, the bottom 16 bits of the result contain the new
   OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
   result hold the new %ecx value.  For xSTRM variants, the helper
   writes the result directly to the guest XMM0.

   Declarable side effects: in all cases, reads guest state at
   [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
   guest_XMM0.

   Is expected to be called only with opc4_and_imm combinations which
   have actually been validated, and will assert otherwise.  The front
   end should ensure we're only called with verified values.
*/
ULong amd64g_dirtyhelper_PCMPxSTRx (
          VexGuestAMD64State* gst,
          HWord opc4_and_imm,
          HWord gstOffL, HWord gstOffR,
          HWord edxIN, HWord eaxIN
       )
{
   HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
   HWord imm8 = opc4_and_imm & 0xFF;
   HWord isISTRx = opc4 & 2;
   HWord isxSTRM = (opc4 & 1) ^ 1;
   vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
   HWord wide = (imm8 & 1);

   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   /* Create the arg validity masks, either from the vectors
      themselves or from the supplied edx/eax values. */
   // FIXME: this is only right for the 8-bit data cases.
   // At least that is asserted above.
   UInt zmaskL, zmaskR;

   // temp spot for the resulting flags and vector.
   V128 resV;
   UInt resOSZACP;

   // for checking whether the case was handled
   Bool ok = False;

   if (wide) {
      if (isISTRx) {
         zmaskL = zmask_from_V128_wide(argL);
         zmaskR = zmask_from_V128_wide(argR);
      } else {
         Int tmp;
         tmp = edxIN & 0xFFFFFFFF;
         if (tmp < -8) tmp = -8;
         if (tmp > 8)  tmp = 8;
         if (tmp < 0)  tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 8);
         zmaskL = (1 << tmp) & 0xFF;
         tmp = eaxIN & 0xFFFFFFFF;
         if (tmp < -8) tmp = -8;
         if (tmp > 8)  tmp = 8;
         if (tmp < 0)  tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 8);
         zmaskR = (1 << tmp) & 0xFF;
      }
      // do the math
      ok = compute_PCMPxSTRx_wide (
              &resV, &resOSZACP, argL, argR,
              zmaskL, zmaskR, imm8, (Bool)isxSTRM
           );
   } else {
      if (isISTRx) {
         zmaskL = zmask_from_V128(argL);
         zmaskR = zmask_from_V128(argR);
      } else {
         Int tmp;
         tmp = edxIN & 0xFFFFFFFF;
         if (tmp < -16) tmp = -16;
         if (tmp > 16)  tmp = 16;
         if (tmp < 0)   tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 16);
         zmaskL = (1 << tmp) & 0xFFFF;
         tmp = eaxIN & 0xFFFFFFFF;
         if (tmp < -16) tmp = -16;
         if (tmp > 16)  tmp = 16;
         if (tmp < 0)   tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 16);
         zmaskR = (1 << tmp) & 0xFFFF;
      }
      // do the math
      ok = compute_PCMPxSTRx (
              &resV, &resOSZACP, argL, argR,
              zmaskL, zmaskR, imm8, (Bool)isxSTRM
           );
   }

   // The front end shouldn't pass us any imm8 variants we can't
   // handle.  Hence:
   vassert(ok);

   // So, finally, we need to get the results back to the caller.
   // In all cases, the new OSZACP value is in the lowest 16 bits
   // of the return value.
   if (isxSTRM) {
      gst->guest_YMM0[0] = resV.w32[0];
      gst->guest_YMM0[1] = resV.w32[1];
      gst->guest_YMM0[2] = resV.w32[2];
      gst->guest_YMM0[3] = resV.w32[3];
      return resOSZACP & 0x8D5;
   } else {
      UInt newECX = resV.w32[0] & 0xFFFF;
      return (newECX << 16) | (resOSZACP & 0x8D5);
   }
}
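
/* Unpacking sketch for the return value, illustrative only.  The
   0x8D5 mask above keeps exactly the OSZACP bits of %rflags:
   O=bit 11, S=bit 7, Z=bit 6, A=bit 4, P=bit 2, C=bit 0, that is,
   0x800|0x80|0x40|0x10|0x4|0x1 == 0x8D5. */
#if 0
static void example_unpack_pcmpxstrx_result ( ULong res )
{
   UInt flagsOSZACP = res & 0x8D5;          /* new OSZACP flag bits */
   UInt newECX      = (res >> 16) & 0xFFFF; /* xSTRI variants only */
   (void)flagsOSZACP; (void)newECX;
}
#endif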

/*---------------------------------------------------------------*/
/*--- AES primitives and helpers                              ---*/
/*---------------------------------------------------------------*/

/* a 16 x 16 matrix */
static const UChar sbox[256] = {                   // row nr
   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
static void SubBytes (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = sbox[v->w8[i]];
   *v = r;
}

/* a 16 x 16 matrix */
static const UChar invsbox[256] = {                // row nr
   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
static void InvSubBytes (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = invsbox[v->w8[i]];
   *v = r;
}

static const UChar ShiftRows_op[16] =
   {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
static void ShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = v->w8[ShiftRows_op[15-i]];
   *v = r;
}

static const UChar InvShiftRows_op[16] =
   {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
static void InvShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   *v = r;
}

/* Multiplication of the finite field elements of AES.
   See "A Specification for The AES Algorithm Rijndael
        (by Joan Daemen & Vincent Rijmen)"
        Dr. Brian Gladman, v3.1, 3rd March 2001. */
/* N values so that (hex) xy = 0x03^N.
   0x00 cannot be used.  We put 0xff for this value. */
/* a 16 x 16 matrix */
static const UChar Nxy[256] = {                    // row nr
   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
};

/* E values so that E = 0x03^xy. */
static const UChar Exy[256] = {                    // row nr
   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};

static inline UChar ff_mul ( UChar u1, UChar u2 )
{
   if ((u1 > 0) && (u2 > 0)) {
      UInt ui = Nxy[u1] + Nxy[u2];
      if (ui >= 255)
         ui = ui - 255;
      return Exy[ui];
   } else {
      return 0;
   }
}
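
/* A worked example of the log/antilog scheme above: multiplying {57}
   by {02} in GF(2^8).  Nxy[0x02] = 0x19 and Nxy[0x57] = 0x62; their
   sum is 0x7b (< 255), and Exy[0x7b] = 0xae, matching the FIPS-197
   worked result {57} x {02} = {ae}.  Illustrative only: */
#if 0
static void example_ff_mul ( void )
{
   vassert(ff_mul(0x02, 0x57) == 0xae);
}
#endif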

static void MixColumns (V128* v)
{
   V128 r;
   Int j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
         ^ P(v,j,2) ^ P(v,j,3);
      P(&r,j,1) = P(v,j,0) ^ ff_mul(0x02, P(v,j,1))
         ^ ff_mul(0x03, P(v,j,2)) ^ P(v,j,3);
      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul(0x02, P(v,j,2))
         ^ ff_mul(0x03, P(v,j,3));
      P(&r,j,3) = ff_mul(0x03, P(v,j,0)) ^ P(v,j,1) ^ P(v,j,2)
         ^ ff_mul(0x02, P(v,j,3));
   }
   *v = r;
#undef P
}

static void InvMixColumns (V128* v)
{
   V128 r;
   Int j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x0e, P(v,j,0)) ^ ff_mul(0x0b, P(v,j,1))
         ^ ff_mul(0x0d, P(v,j,2)) ^ ff_mul(0x09, P(v,j,3));
      P(&r,j,1) = ff_mul(0x09, P(v,j,0)) ^ ff_mul(0x0e, P(v,j,1))
         ^ ff_mul(0x0b, P(v,j,2)) ^ ff_mul(0x0d, P(v,j,3));
      P(&r,j,2) = ff_mul(0x0d, P(v,j,0)) ^ ff_mul(0x09, P(v,j,1))
         ^ ff_mul(0x0e, P(v,j,2)) ^ ff_mul(0x0b, P(v,j,3));
      P(&r,j,3) = ff_mul(0x0b, P(v,j,0)) ^ ff_mul(0x0d, P(v,j,1))
         ^ ff_mul(0x09, P(v,j,2)) ^ ff_mul(0x0e, P(v,j,3));
   }
   *v = r;
#undef P
}

/* For description, see definition in guest_amd64_defs.h */
void amd64g_dirtyhelper_AES (
          VexGuestAMD64State* gst,
          HWord opc4, HWord gstOffD,
          HWord gstOffL, HWord gstOffR
       )
{
   // where the args are
   V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   V128  r;

   switch (opc4) {
      case 0xDC: /* AESENC */
      case 0xDD: /* AESENCLAST */
         r = *argR;
         ShiftRows (&r);
         SubBytes  (&r);
         if (opc4 == 0xDC)
            MixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDE: /* AESDEC */
      case 0xDF: /* AESDECLAST */
         r = *argR;
         InvShiftRows (&r);
         InvSubBytes (&r);
         if (opc4 == 0xDE)
            InvMixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDB: /* AESIMC */
         *argD = *argL;
         InvMixColumns (argD);
         break;

      default: vassert(0);
   }
}

static inline UInt RotWord ( UInt w32 )
{
   return ((w32 >> 8) | (w32 << 24));
}

static inline UInt SubWord ( UInt w32 )
{
   UChar *w8;
   UChar *r8;
   UInt res;
   w8 = (UChar*) &w32;
   r8 = (UChar*) &res;
   r8[0] = sbox[w8[0]];
   r8[1] = sbox[w8[1]];
   r8[2] = sbox[w8[2]];
   r8[3] = sbox[w8[3]];
   return res;
}

/* For description, see definition in guest_amd64_defs.h */
extern void amd64g_dirtyhelper_AESKEYGENASSIST (
          VexGuestAMD64State* gst,
          HWord imm8,
          HWord gstOffL, HWord gstOffR
       )
{
   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   argR->w32[2] = SubWord (argL->w32[3]);
   argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   argR->w32[0] = SubWord (argL->w32[1]);
}


/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing,               ---*/
/*--- guest state as a whole.                                 ---*/
/*---------------------------------------------------------------*/

/* Initialise the entire amd64 guest state. */
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
{
   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER = 0;
   vex_state->pad0 = 0;

   vex_state->guest_RAX = 0;
   vex_state->guest_RCX = 0;
   vex_state->guest_RDX = 0;
   vex_state->guest_RBX = 0;
   vex_state->guest_RSP = 0;
   vex_state->guest_RBP = 0;
   vex_state->guest_RSI = 0;
   vex_state->guest_RDI = 0;
   vex_state->guest_R8  = 0;
   vex_state->guest_R9  = 0;
   vex_state->guest_R10 = 0;
   vex_state->guest_R11 = 0;
   vex_state->guest_R12 = 0;
   vex_state->guest_R13 = 0;
   vex_state->guest_R14 = 0;
   vex_state->guest_R15 = 0;

   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = 0;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;

   vex_state->guest_DFLAG   = 1; /* forwards */
   vex_state->guest_IDFLAG  = 0;

   /* HACK: represent the offset associated with %fs==0. This
      assumes that %fs is only ever zero. */
   vex_state->guest_FS_ZERO = 0;

   vex_state->guest_RIP = 0;

   /* Initialise the simulated FPU */
   amd64g_dirtyhelper_FINIT( vex_state );

   /* Initialise the AVX state. */
#  define AVXZERO(_ymm) \
      do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
           _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
      } while (0)
   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   AVXZERO(vex_state->guest_YMM0);
   AVXZERO(vex_state->guest_YMM1);
   AVXZERO(vex_state->guest_YMM2);
   AVXZERO(vex_state->guest_YMM3);
   AVXZERO(vex_state->guest_YMM4);
   AVXZERO(vex_state->guest_YMM5);
   AVXZERO(vex_state->guest_YMM6);
   AVXZERO(vex_state->guest_YMM7);
   AVXZERO(vex_state->guest_YMM8);
   AVXZERO(vex_state->guest_YMM9);
   AVXZERO(vex_state->guest_YMM10);
   AVXZERO(vex_state->guest_YMM11);
   AVXZERO(vex_state->guest_YMM12);
   AVXZERO(vex_state->guest_YMM13);
   AVXZERO(vex_state->guest_YMM14);
   AVXZERO(vex_state->guest_YMM15);
   AVXZERO(vex_state->guest_YMM16);
#  undef AVXZERO

   vex_state->guest_EMWARN = EmWarn_NONE;

   /* These should not ever be either read or written, but we
      initialise them anyway. */
   vex_state->guest_TISTART = 0;
   vex_state->guest_TILEN   = 0;

   vex_state->guest_NRADDR   = 0;
   vex_state->guest_SC_CLASS = 0;
   vex_state->guest_GS_0x60  = 0;

   vex_state->guest_IP_AT_SYSCALL = 0;
   vex_state->pad1 = 0;
}
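
/* A minimal client-side sketch of using the above (illustrative
   only): get a fresh guest state into a known condition and point it
   at the code to be simulated. */
#if 0
static void example_setup_guest ( VexGuestAMD64State* st, ULong entry )
{
   LibVEX_GuestAMD64_initialise(st);
   st->guest_RIP = entry;  /* address at which simulation starts */
}
#endif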


/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   By default we enforce precise exns for guest %RSP, %RBP and %RIP
   only.  These are the minimum needed to extract correct stack
   backtraces from amd64 code.
*/
Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
                                                   Int maxoff)
{
   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   Int rbp_max = rbp_min + 8 - 1;
   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   Int rsp_max = rsp_min + 8 - 1;
   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   Int rip_max = rip_min + 8 - 1;

   if (maxoff < rbp_min || minoff > rbp_max) {
      /* no overlap with rbp */
   } else {
      return True;
   }

   if (maxoff < rsp_min || minoff > rsp_max) {
      /* no overlap with rsp */
   } else {
      return True;
   }

   if (maxoff < rip_min || minoff > rip_max) {
      /* no overlap with rip */
   } else {
      return True;
   }

   return False;
}


#define ALWAYSDEFD(field)                             \
    { offsetof(VexGuestAMD64State, field),            \
      (sizeof ((VexGuestAMD64State*)0)->field) }

VexGuestLayout
   amd64guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestAMD64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 16,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_RIP),
                 /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 // /* */ ALWAYSDEFD(guest_CS),
                 // /* */ ALWAYSDEFD(guest_DS),
                 // /* */ ALWAYSDEFD(guest_ES),
                 // /* */ ALWAYSDEFD(guest_FS),
                 // /* */ ALWAYSDEFD(guest_GS),
                 // /* */ ALWAYSDEFD(guest_SS),
                 // /* */ ALWAYSDEFD(guest_LDT),
                 // /* */ ALWAYSDEFD(guest_GDT),
                 /* 10 */ ALWAYSDEFD(guest_EMWARN),
                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 12 */ ALWAYSDEFD(guest_TISTART),
                 /* 13 */ ALWAYSDEFD(guest_TILEN),
                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };


/*---------------------------------------------------------------*/
/*--- end                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/