
/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
    Long u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
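
/* Illustrative sketch, not part of VEX: a quick sanity check of the
   32-bit-halves decomposition above.  The function name and the #if 0
   guard are hypothetical; enable locally if you want to convince
   yourself of the splitting. */
#if 0
static void mullU64_selftest ( void )
{
   ULong hi, lo;
   /* (2^64 - 1) * 2 == 2^65 - 2, so expect hi == 1 and
      lo == 0xFFFFFFFFFFFFFFFE. */
   mullU64( 0xFFFFFFFFFFFFFFFFULL, 2ULL, &hi, &lo );
   vassert(hi == 1ULL && lo == 0xFFFFFFFFFFFFFFFEULL);
}
#endif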


static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
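
/* For reference: parity_table[b] holds AMD64G_CC_MASK_P exactly when
   the byte b contains an even number of 1 bits, which is the x86 PF
   definition; PF only ever inspects the least significant byte of a
   result.  E.g. parity_table[0x00] and parity_table[0x03] are
   AMD64G_CC_MASK_P, while parity_table[0x01] is 0. */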

/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return x << n;
   else
      return x >> (-n);
}
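
/* Example of how the flag macros below use this: for a 64-bit result,
   lshift(res, 8 - 64) is res >> 56, pulling the sign bit down to bit
   position 7, where SF lives in %rflags; masking with 0x80 then
   isolates it.  Note the negative case is an arithmetic (signed)
   right shift, which is harmless here since callers mask the
   result. */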

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)					\
   /* const */ ULong DATA_MASK 					\
      = __data_bits==8                                          \
           ? 0xFFULL 					        \
           : (__data_bits==16                                   \
                ? 0xFFFFULL 		                        \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
   /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
   /* const */ ULong CC_NDEP = cc_ndep_formal;			\
   /* Four bogus assignments, which hopefully gcc can     */	\
   /* optimise away, and which stop it complaining about  */	\
   /* unused variables.                                   */	\
   SIGN_MASK = SIGN_MASK;					\
   DATA_MASK = DATA_MASK;					\
   CC_DEP2 = CC_DEP2;						\
   CC_NDEP = CC_NDEP;


/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL + argR;					\
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
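
/* Worked example, illustrative only: ACTIONS_ADD(8, UChar) with
   CC_DEP1 = 0x7F and CC_DEP2 = 0x01 gives res = 0x80.  Then cf = 0
   (no unsigned wrap), af = 0x10 (carry out of bit 3), zf = 0,
   sf = 0x80, and the OF term (argL ^ argR ^ -1) & (argL ^ res) has
   bit 7 set, so shifting by 12 - 8 lands it on bit 11, i.e.
   AMD64G_CC_MASK_O: both addends non-negative, result negative, the
   classic signed overflow. */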

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL - argR;					\
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res),	 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, oldC, res;		 		\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL + argR) + oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, oldC, res;	       			\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL - argR) - oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
     else							\
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res), 			\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
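
/* Note on the ADC/SBB thunk convention: as set up by the amd64 front
   end (toIR.c), CC_DEP2 holds (argR ^ oldC) rather than argR itself,
   so the "CC_DEP2 ^ oldC" in the two macros above recovers the real
   right-hand operand before the flags are computed. */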

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res - 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res + 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) 					\
          == ((ULong)SIGN_MASK - 1)) << 11;			\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
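
/* INC and DEC leave CF alone, hence the pass-through from CC_NDEP in
   both macros above.  Their OF tests are just the two boundary cases:
   for INC, the result equals SIGN_MASK only when the operand was the
   largest positive value; for DEC, the result equals SIGN_MASK - 1
   only when the operand was the smallest negative value. */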

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);  					\
   { Long cf, pf, af, zf, sf, of;				\
     cf = CC_DEP2 & 1;						\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & CC_DEP1)			\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11)));			\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
     return fl;							\
   }								\
}
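
/* Reading the rotate OF arithmetic, taking the 8-bit case as an
   example: in ACTIONS_ROL, lshift(CC_DEP1, 11-7) ^ lshift(CC_DEP1, 11)
   places result bit 7 and result bit 0 on top of each other at bit
   position 11, so the AMD64G_CC_MASK_O mask leaves exactly
   msb(result) ^ lsb(result), matching the comment above the macro;
   ACTIONS_ROR does the same with bits 7 and 6. */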

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_STYPE)CC_DEP1)                      \
                     * ((DATA_STYPE)CC_DEP2) );                 \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}
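
/* For all the multiply variants, CF and OF are set together: in the
   unsigned case when the high half of the double-length product is
   nonzero, and in the signed case when the high half differs from the
   sign-extension of the low half; in both cases, exactly when the full
   product does not fit in DATA_BITS bits. */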

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 != 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = 0;							\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
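
/* For the BMI helpers above (ANDN/BLSI/BLSMSK/BLSR), CC_DEP1 carries
   the result and CC_DEP2 the original source operand where one is
   needed.  This matches the architectural definitions: BLSI sets CF
   when the source is nonzero, while BLSMSK and BLSR set CF when it is
   zero; PF and AF are simply left as zero here since they are
   undefined for these instructions. */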

/*-------------------------------------------------------------*/


#if PROFILE_RFLAGS

static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function, not called directly from generated code; the
   clean-helper wrappers below are. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt   );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
      //      case AMD64G_CC_OP_SUBL:
      //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBW:
      //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBB:
      //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_INCL:
      //      case AMD64G_CC_OP_DECL:
      //         return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of, sf, zf, cf, pf;
   ULong inv = cond & 1;
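   /* amd64 condition codes come in complementary pairs (O/NO, B/NB,
      Z/NZ, ...) with the low bit of the encoding selecting the negated
      member of the pair, which is why a single "inv" xor at the end of
      each case below suffices. */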

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);
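   /* Bits 10, 21 and 18 are DF, ID and AC respectively; the OSZACP
      bits were reconstructed from the flag thunk above. */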

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above          ---*/
/*--- run-time %rflags functions.                             ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}
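
/* A typical specialisation, for orientation: a guest "cmpq %rsi,%rdi ;
   jz ..." arrives here as cc_op == AMD64G_CC_OP_SUBQ with cond ==
   AMD64CondZ, and is rewritten below into
   1Uto64(CmpEQ64(cc_dep1, cc_dep2)), so the flag helper need never be
   called at run time. */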

IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- SUBQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /*---------------- SUBL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           binop(Iop_Sub32,
                                 unop(Iop_64to32, cc_dep1),
                                 unop(Iop_64to32, cc_dep2)),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /*---------------- SUBW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
         /* word sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           binop(Iop_Shl64,cc_dep2,mkU8(48))));
      }
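
      /* The Shl64-by-48 trick just above moves each 16-bit operand
         into the top 16 bits of a 64-bit word, so an ordinary signed
         64-bit comparison implements the signed 16-bit comparison
         without an explicit sign-extension. */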

      /*---------------- SUBB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
         /* byte sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[7]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
         /* long long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
         /* long long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
         /* long long and/or/xor, then L.
            LOGIC sets SF and ZF according to the
            result and makes OF be zero.  L computes SF ^ OF, but
            OF is zero, so this reduces to SF -- which will be 1 iff
            the result is < signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S,
                           cc_dep1,
                           mkU64(0)));
      }

      /*---------------- LOGICL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
         /* long and/or/xor, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
         /* long and/or/xor, then NS --> (ULong) ~ result[31] */
         return binop(Iop_Xor64,
                binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1)),
                mkU64(1));
      }

      /*---------------- LOGICW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
         /* word and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
         /* word and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }

      /*---------------- LOGICB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (ULong)result[7] */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
         /* byte and/or/xor, then NS --> (ULong)!result[7] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- INCB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
         /* 8-bit inc, then LE --> sign bit of the arg */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, mkU64(1)),
                            mkU8(7)),
                      mkU64(1));
      }

      /*---------------- INCW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
         /* 16-bit inc, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- DECL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
         /* 16-bit dec, then NZ --> test dst != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }
1383
1384      /*---------------- COPY ----------------*/
1385      /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1386         jbe" for example. */
1387
1388      if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
1389          (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1390         /* COPY, then BE --> extract C and Z from dep1, and test (C
1391            or Z == 1). */
1392         /* COPY, then NBE --> extract C and Z from dep1, and test (C
1393            or Z == 0). */
1394         ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1395         return
1396            unop(
1397               Iop_1Uto64,
1398               binop(
1399                  Iop_CmpEQ64,
1400                  binop(
1401                     Iop_And64,
1402                     binop(
1403                        Iop_Or64,
1404                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1405                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1406                     ),
1407                     mkU64(1)
1408                  ),
1409                  mkU64(nnn)
1410               )
1411            );
1412      }
1413
1414      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
1415         /* COPY, then B --> extract C from dep1, and test (C == 1). */
1416         return
1417            unop(
1418               Iop_1Uto64,
1419               binop(
1420                  Iop_CmpNE64,
1421                  binop(
1422                     Iop_And64,
1423                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1424                     mkU64(1)
1425                  ),
1426                  mkU64(0)
1427               )
1428            );
1429      }
1430
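      /* Illustrative note: with cc_op == AMD64G_CC_OP_COPY and
         cond == AMD64CondB, the case just above rewrites the helper
         call into the pure IR expression
            1Uto64( CmpNE64( And64( Shr64(cc_dep1, AMD64G_CC_SHIFT_C),
                                    1 ),
                             0 ) )
         so no run-time call to amd64g_calculate_condition remains. */
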
1431      if (isU64(cc_op, AMD64G_CC_OP_COPY)
1432          && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1433         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1434         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1435         UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1436         return
1437            unop(
1438               Iop_1Uto64,
1439               binop(
1440                  Iop_CmpEQ64,
1441                  binop(
1442                     Iop_And64,
1443                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1444                     mkU64(1)
1445                  ),
1446                  mkU64(nnn)
1447               )
1448            );
1449      }
1450
1451      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
1452         /* COPY, then P --> extract P from dep1, and test (P == 1). */
1453         return
1454            unop(
1455               Iop_1Uto64,
1456               binop(
1457                  Iop_CmpNE64,
1458                  binop(
1459                     Iop_And64,
1460                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1461                     mkU64(1)
1462                  ),
1463                  mkU64(0)
1464               )
1465            );
1466      }
1467
1468      return NULL;
1469   }
1470
1471   /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1472
1473   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1474      /* specialise calls to above "calculate_rflags_c" function */
1475      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1476      vassert(arity == 4);
1477      cc_op   = args[0];
1478      cc_dep1 = args[1];
1479      cc_dep2 = args[2];
1480      cc_ndep = args[3];
1481
1482      if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1483         /* C after sub denotes unsigned less than */
1484         return unop(Iop_1Uto64,
1485                     binop(Iop_CmpLT64U,
1486                           cc_dep1,
1487                           cc_dep2));
1488      }
1489      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1490         /* C after sub denotes unsigned less than */
1491         return unop(Iop_1Uto64,
1492                     binop(Iop_CmpLT32U,
1493                           unop(Iop_64to32, cc_dep1),
1494                           unop(Iop_64to32, cc_dep2)));
1495      }
1496      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1497         /* C after sub denotes unsigned less than */
1498         return unop(Iop_1Uto64,
1499                     binop(Iop_CmpLT64U,
1500                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1501                           binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1502      }
1503      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1504          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1505          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1506          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1507         /* cflag after logic is zero */
1508         return mkU64(0);
1509      }
1510      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1511          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1512         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1513         return cc_ndep;
1514      }
1515
1516#     if 0
1517      if (cc_op->tag == Iex_Const) {
1518         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1519      }
1520#     endif
1521
1522      return NULL;
1523   }
1524
1525#  undef unop
1526#  undef binop
1527#  undef mkU64
1528#  undef mkU32
1529#  undef mkU8
1530
1531   return NULL;
1532}
1533
1534
1535/*---------------------------------------------------------------*/
1536/*--- Supporting functions for x87 FPU activities.            ---*/
1537/*---------------------------------------------------------------*/
1538
1539static inline Bool host_is_little_endian ( void )
1540{
1541   UInt x = 0x76543210;
1542   UChar* p = (UChar*)(&x);
1543   return toBool(*p == 0x10);
1544}
1545
1546/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1547/* CALLED FROM GENERATED CODE: CLEAN HELPER */
1548ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1549{
1550   Bool   mantissaIsZero;
1551   Int    bexp;
1552   UChar  sign;
1553   UChar* f64;
1554
1555   vassert(host_is_little_endian());
1556
1557   /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1558
1559   f64  = (UChar*)(&dbl);
1560   sign = toUChar( (f64[7] >> 7) & 1 );
1561
1562   /* First off, if the tag indicates the register was empty,
1563      return 1,0,sign,1 */
1564   if (tag == 0) {
1565      /* vex_printf("Empty\n"); */
1566      return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1567                                   | AMD64G_FC_MASK_C0;
1568   }
1569
1570   bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1571   bexp &= 0x7FF;
1572
1573   mantissaIsZero
1574      = toBool(
1575           (f64[6] & 0x0F) == 0
1576           && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1577        );
1578
1579   /* If both exponent and mantissa are zero, the value is zero.
1580      Return 1,0,sign,0. */
1581   if (bexp == 0 && mantissaIsZero) {
1582      /* vex_printf("Zero\n"); */
1583      return AMD64G_FC_MASK_C3 | 0
1584                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1585   }
1586
1587   /* If exponent is zero but mantissa isn't, it's a denormal.
1588      Return 1,1,sign,0. */
1589   if (bexp == 0 && !mantissaIsZero) {
1590      /* vex_printf("Denormal\n"); */
1591      return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1592                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
1593   }
1594
1595   /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1596      Return 0,1,sign,1. */
1597   if (bexp == 0x7FF && mantissaIsZero) {
1598      /* vex_printf("Inf\n"); */
1599      return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
1600                                   | AMD64G_FC_MASK_C0;
1601   }
1602
1603   /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
1604      Return 0,0,sign,1. */
1605   if (bexp == 0x7FF && !mantissaIsZero) {
1606      /* vex_printf("NaN\n"); */
1607      return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
1608   }
1609
1610   /* Uh, ok, we give up.  It must be a normal finite number.
1611      Return 0,1,sign,0.
1612   */
1613   /* vex_printf("normal\n"); */
1614   return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1615}
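
/* What follows is an illustrative sketch, not part of the build: it
   shows how the C3..C0 encodings above map onto concrete values,
   assuming a little-endian host (as per the vassert above) and a
   gcc-style compiler providing __builtin_memcpy. */
#if 0
static void example_FXAM_classes ( void )
{
   ULong bits_one, bits_inf;
   double d_one = 1.0;
   double d_inf = 1.0 / 0.0;
   __builtin_memcpy(&bits_one, &d_one, 8);
   __builtin_memcpy(&bits_inf, &d_inf, 8);
   /* tag==0 means empty, irrespective of the value bits: 1,0,0,1 */
   vassert(amd64g_calculate_FXAM(0, bits_one)
           == (AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C0));
   /* +1.0 is a normal finite number: 0,1,0,0 */
   vassert(amd64g_calculate_FXAM(1, bits_one) == AMD64G_FC_MASK_C2);
   /* +infinity: 0,1,0,1 */
   vassert(amd64g_calculate_FXAM(1, bits_inf)
           == (AMD64G_FC_MASK_C2 | AMD64G_FC_MASK_C0));
}
#endif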
1616
1617
1618/* This is used to implement both 'frstor' and 'fldenv'.  The latter
1619   appears to differ from the former only in that the 8 FP registers
1620   themselves are not transferred into the guest state. */
1621static
1622VexEmNote do_put_x87 ( Bool moveRegs,
1623                       /*IN*/UChar* x87_state,
1624                       /*OUT*/VexGuestAMD64State* vex_state )
1625{
1626   Int        stno, preg;
1627   UInt       tag;
1628   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1629   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1630   Fpu_State* x87     = (Fpu_State*)x87_state;
1631   UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
1632   UInt       tagw    = x87->env[FP_ENV_TAG];
1633   UInt       fpucw   = x87->env[FP_ENV_CTRL];
1634   UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
1635   VexEmNote  ew;
1636   UInt       fpround;
1637   ULong      pair;
1638
1639   /* Copy registers and tags */
1640   for (stno = 0; stno < 8; stno++) {
1641      preg = (stno + ftop) & 7;
1642      tag = (tagw >> (2*preg)) & 3;
1643      if (tag == 3) {
1644         /* register is empty */
1645         /* hmm, if it's empty, does it still get written?  Probably
1646            safer to say it does.  If we don't, memcheck could get out
1647            of sync, in that it thinks all FP registers are defined by
1648            this helper, but in reality some have not been updated. */
1649         if (moveRegs)
1650            vexRegs[preg] = 0; /* IEEE754 64-bit zero */
1651         vexTags[preg] = 0;
1652      } else {
1653         /* register is non-empty */
1654         if (moveRegs)
1655            convert_f80le_to_f64le( &x87->reg[10*stno],
1656                                    (UChar*)&vexRegs[preg] );
1657         vexTags[preg] = 1;
1658      }
1659   }
1660
1661   /* stack pointer */
1662   vex_state->guest_FTOP = ftop;
1663
1664   /* status word */
1665   vex_state->guest_FC3210 = c3210;
1666
1667   /* handle the control word, setting FPROUND and detecting any
1668      emulation warnings. */
1669   pair    = amd64g_check_fldcw ( (ULong)fpucw );
1670   fpround = (UInt)pair & 0xFFFFFFFFULL;
1671   ew      = (VexEmNote)(pair >> 32);
1672
1673   vex_state->guest_FPROUND = fpround & 3;
1674
1675   /* emulation warnings --> caller */
1676   return ew;
1677}
1678
1679
1680/* Create an x87 FPU state from the guest state, as close as
1681   we can approximate it. */
1682static
1683void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
1684                  /*OUT*/UChar* x87_state )
1685{
1686   Int        i, stno, preg;
1687   UInt       tagw;
1688   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1689   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1690   Fpu_State* x87     = (Fpu_State*)x87_state;
1691   UInt       ftop    = vex_state->guest_FTOP;
1692   UInt       c3210   = vex_state->guest_FC3210;
1693
1694   for (i = 0; i < 14; i++)
1695      x87->env[i] = 0;
1696
1697   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1698   x87->env[FP_ENV_STAT]
1699      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
1700   x87->env[FP_ENV_CTRL]
1701      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
1702
1703   /* Dump the register stack in ST order. */
1704   tagw = 0;
1705   for (stno = 0; stno < 8; stno++) {
1706      preg = (stno + ftop) & 7;
1707      if (vexTags[preg] == 0) {
1708         /* register is empty */
1709         tagw |= (3 << (2*preg));
1710         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1711                                 &x87->reg[10*stno] );
1712      } else {
1713         /* register is full. */
1714         tagw |= (0 << (2*preg));
1715         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1716                                 &x87->reg[10*stno] );
1717      }
1718   }
1719   x87->env[FP_ENV_TAG] = toUShort(tagw);
1720}
1721
1722
1723/* CALLED FROM GENERATED CODE */
1724/* DIRTY HELPER (reads guest state, writes guest mem) */
1725/* NOTE: only handles 32-bit format (no REX.W on the insn) */
1726void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
1727{
1728   /* Derived from values obtained from
1729      vendor_id       : AuthenticAMD
1730      cpu family      : 15
1731      model           : 12
1732      model name      : AMD Athlon(tm) 64 Processor 3200+
1733      stepping        : 0
1734      cpu MHz         : 2200.000
1735      cache size      : 512 KB
1736   */
1737   /* Somewhat roundabout, but at least it's simple. */
1738   Fpu_State tmp;
1739   UShort*   addrS = (UShort*)addr;
1740   UChar*    addrC = (UChar*)addr;
1741   U128*     xmm   = (U128*)(addr + 160);
1742   UInt      mxcsr;
1743   UShort    fp_tags;
1744   UInt      summary_tags;
1745   Int       r, stno;
1746   UShort    *srcS, *dstS;
1747
1748   do_get_x87( gst, (UChar*)&tmp );
1749   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
1750
1751   /* Now build the proper fxsave image from the x87 image we just
1752      made. */
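
   /* For reference, the byte layout of the image built below, which
      follows the hardware fxsave format:
         0..1     FCW
         2..3     FSW
         4        FTW summary byte
         5        pad
         6..7     FOP        (written as zero here)
         8..15    RIP        (written as zero here)
         16..23   RDP        (written as zero here)
         24..27   MXCSR
         28..31   MXCSR mask
         32..159  ST0..ST7, 16 bytes apiece
         160..415 XMM0..XMM15 */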
1753
1754   addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
1755   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
1756
1757   /* set addrS[2] in an endian-independent way */
1758   summary_tags = 0;
1759   fp_tags = tmp.env[FP_ENV_TAG];
1760   for (r = 0; r < 8; r++) {
1761      if ( ((fp_tags >> (2*r)) & 3) != 3 )
1762         summary_tags |= (1 << r);
1763   }
1764   addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
1765   addrC[5]  = 0; /* pad */
1766
1767   /* FOP: faulting fpu opcode.  From experimentation, the real CPU
1768      does not write this field. (?!) */
1769   addrS[3]  = 0; /* BOGUS */
1770
1771   /* RIP (Last x87 instruction pointer).  From experimentation, the
1772      real CPU does not write this field. (?!) */
1773   addrS[4]  = 0; /* BOGUS */
1774   addrS[5]  = 0; /* BOGUS */
1775   addrS[6]  = 0; /* BOGUS */
1776   addrS[7]  = 0; /* BOGUS */
1777
1778   /* RDP (Last x87 data pointer).  From experimentation, the real CPU
1779      does not write this field. (?!) */
1780   addrS[8]  = 0; /* BOGUS */
1781   addrS[9]  = 0; /* BOGUS */
1782   addrS[10] = 0; /* BOGUS */
1783   addrS[11] = 0; /* BOGUS */
1784
1785   addrS[12] = toUShort(mxcsr);  /* MXCSR */
1786   addrS[13] = toUShort(mxcsr >> 16);
1787
1788   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
1789   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
1790
1791   /* Copy in the FP registers, in ST order. */
1792   for (stno = 0; stno < 8; stno++) {
1793      srcS = (UShort*)(&tmp.reg[10*stno]);
1794      dstS = (UShort*)(&addrS[16 + 8*stno]);
1795      dstS[0] = srcS[0];
1796      dstS[1] = srcS[1];
1797      dstS[2] = srcS[2];
1798      dstS[3] = srcS[3];
1799      dstS[4] = srcS[4];
1800      dstS[5] = 0;
1801      dstS[6] = 0;
1802      dstS[7] = 0;
1803   }
1804
1805   /* That's the first 160 bytes of the image done.  Now only %xmm0
1806      .. %xmm15 remain to be copied.  If the host is big-endian, these
1807      need to be byte-swapped. */
1808   vassert(host_is_little_endian());
1809
1810#  define COPY_U128(_dst,_src)                       \
1811      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1812           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1813      while (0)
1814
1815   COPY_U128( xmm[0],  gst->guest_YMM0 );
1816   COPY_U128( xmm[1],  gst->guest_YMM1 );
1817   COPY_U128( xmm[2],  gst->guest_YMM2 );
1818   COPY_U128( xmm[3],  gst->guest_YMM3 );
1819   COPY_U128( xmm[4],  gst->guest_YMM4 );
1820   COPY_U128( xmm[5],  gst->guest_YMM5 );
1821   COPY_U128( xmm[6],  gst->guest_YMM6 );
1822   COPY_U128( xmm[7],  gst->guest_YMM7 );
1823   COPY_U128( xmm[8],  gst->guest_YMM8 );
1824   COPY_U128( xmm[9],  gst->guest_YMM9 );
1825   COPY_U128( xmm[10], gst->guest_YMM10 );
1826   COPY_U128( xmm[11], gst->guest_YMM11 );
1827   COPY_U128( xmm[12], gst->guest_YMM12 );
1828   COPY_U128( xmm[13], gst->guest_YMM13 );
1829   COPY_U128( xmm[14], gst->guest_YMM14 );
1830   COPY_U128( xmm[15], gst->guest_YMM15 );
1831
1832#  undef COPY_U128
1833}
1834
1835
1836/* CALLED FROM GENERATED CODE */
1837/* DIRTY HELPER (writes guest state, reads guest mem) */
1838VexEmNote amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
1839{
1840   Fpu_State tmp;
1841   VexEmNote warnX87 = EmNote_NONE;
1842   VexEmNote warnXMM = EmNote_NONE;
1843   UShort*   addrS   = (UShort*)addr;
1844   UChar*    addrC   = (UChar*)addr;
1845   U128*     xmm     = (U128*)(addr + 160);
1846   UShort    fp_tags;
1847   Int       r, stno, i;
1848
1849   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
1850      to be byte-swapped. */
1851   vassert(host_is_little_endian());
1852
1853#  define COPY_U128(_dst,_src)                       \
1854      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1855           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1856      while (0)
1857
1858   COPY_U128( gst->guest_YMM0, xmm[0] );
1859   COPY_U128( gst->guest_YMM1, xmm[1] );
1860   COPY_U128( gst->guest_YMM2, xmm[2] );
1861   COPY_U128( gst->guest_YMM3, xmm[3] );
1862   COPY_U128( gst->guest_YMM4, xmm[4] );
1863   COPY_U128( gst->guest_YMM5, xmm[5] );
1864   COPY_U128( gst->guest_YMM6, xmm[6] );
1865   COPY_U128( gst->guest_YMM7, xmm[7] );
1866   COPY_U128( gst->guest_YMM8, xmm[8] );
1867   COPY_U128( gst->guest_YMM9, xmm[9] );
1868   COPY_U128( gst->guest_YMM10, xmm[10] );
1869   COPY_U128( gst->guest_YMM11, xmm[11] );
1870   COPY_U128( gst->guest_YMM12, xmm[12] );
1871   COPY_U128( gst->guest_YMM13, xmm[13] );
1872   COPY_U128( gst->guest_YMM14, xmm[14] );
1873   COPY_U128( gst->guest_YMM15, xmm[15] );
1874
1875#  undef COPY_U128
1876
1877   /* Copy the x87 registers out of the image, into a temporary
1878      Fpu_State struct. */
1879   for (i = 0; i < 14; i++) tmp.env[i] = 0;
1880   for (i = 0; i < 80; i++) tmp.reg[i] = 0;
1881   /* fill in tmp.reg[0..7] */
1882   for (stno = 0; stno < 8; stno++) {
1883      UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
1884      UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
1885      dstS[0] = srcS[0];
1886      dstS[1] = srcS[1];
1887      dstS[2] = srcS[2];
1888      dstS[3] = srcS[3];
1889      dstS[4] = srcS[4];
1890   }
1891   /* fill in tmp.env[0..13] */
1892   tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
1893   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
1894
1895   fp_tags = 0;
1896   for (r = 0; r < 8; r++) {
1897      if (addrC[4] & (1<<r))
1898         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
1899      else
1900         fp_tags |= (3 << (2*r)); /* EMPTY */
1901   }
1902   tmp.env[FP_ENV_TAG] = fp_tags;
1903
1904   /* Now write 'tmp' into the guest state. */
1905   warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
1906
1907   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
1908                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
1909     ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
1910
1911     warnXMM = (VexEmNote)(w64 >> 32);
1912
1913     gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
1914   }
1915
1916   /* Prefer an X87 emwarn over an XMM one, if both exist. */
1917   if (warnX87 != EmNote_NONE)
1918      return warnX87;
1919   else
1920      return warnXMM;
1921}
1922
1923
1924/* DIRTY HELPER (writes guest state) */
1925/* Initialise the x87 FPU state as per 'finit'. */
1926void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
1927{
1928   Int i;
1929   gst->guest_FTOP = 0;
1930   for (i = 0; i < 8; i++) {
1931      gst->guest_FPTAG[i] = 0; /* empty */
1932      gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
1933   }
1934   gst->guest_FPROUND = (ULong)Irrm_NEAREST;
1935   gst->guest_FC3210  = 0;
1936}
1937
1938
1939/* CALLED FROM GENERATED CODE */
1940/* DIRTY HELPER (reads guest memory) */
1941ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
1942{
1943   ULong f64;
1944   convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
1945   return f64;
1946}
1947
1948/* CALLED FROM GENERATED CODE */
1949/* DIRTY HELPER (writes guest memory) */
1950void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
1951{
1952   convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
1953}
1954
1955
1956/* CALLED FROM GENERATED CODE */
1957/* CLEAN HELPER */
1958/* mxcsr[15:0] contains an SSE native format MXCSR value.
1959   Extract from it the required SSEROUND value and any resulting
1960   emulation warning, and return (warn << 32) | sseround value.
1961*/
1962ULong amd64g_check_ldmxcsr ( ULong mxcsr )
1963{
1964   /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
1965   /* NOTE, encoded exactly as per enum IRRoundingMode. */
1966   ULong rmode = (mxcsr >> 13) & 3;
1967
1968   /* Detect any required emulation warnings. */
1969   VexEmNote ew = EmNote_NONE;
1970
1971   if ((mxcsr & 0x1F80) != 0x1F80) {
1972      /* unmasked exceptions! */
1973      ew = EmWarn_X86_sseExns;
1974   }
1975   else
1976   if (mxcsr & (1<<15)) {
1977      /* FZ is set */
1978      ew = EmWarn_X86_fz;
1979   }
1980   else
1981   if (mxcsr & (1<<6)) {
1982      /* DAZ is set */
1983      ew = EmWarn_X86_daz;
1984   }
1985
1986   return (((ULong)ew) << 32) | ((ULong)rmode);
1987}
1988
1989
1990/* CALLED FROM GENERATED CODE */
1991/* CLEAN HELPER */
1992/* Given sseround as an IRRoundingMode value, create a suitable SSE
1993   native format MXCSR value. */
1994ULong amd64g_create_mxcsr ( ULong sseround )
1995{
1996   sseround &= 3;
1997   return 0x1F80 | (sseround << 13);
1998}
1999
2000
2001/* CLEAN HELPER */
2002/* fpucw[15:0] contains an x87 native format FPU control word.
2003   Extract from it the required FPROUND value and any resulting
2004   emulation warning, and return (warn << 32) | fpround value.
2005*/
2006ULong amd64g_check_fldcw ( ULong fpucw )
2007{
2008   /* Decide on a rounding mode.  fpucw[11:10] holds it. */
2009   /* NOTE, encoded exactly as per enum IRRoundingMode. */
2010   ULong rmode = (fpucw >> 10) & 3;
2011
2012   /* Detect any required emulation warnings. */
2013   VexEmNote ew = EmNote_NONE;
2014
2015   if ((fpucw & 0x3F) != 0x3F) {
2016      /* unmasked exceptions! */
2017      ew = EmWarn_X86_x87exns;
2018   }
2019   else
2020   if (((fpucw >> 8) & 3) != 3) {
2021      /* unsupported precision */
2022      ew = EmWarn_X86_x87precision;
2023   }
2024
2025   return (((ULong)ew) << 32) | ((ULong)rmode);
2026}
2027
2028
2029/* CLEAN HELPER */
2030/* Given fpround as an IRRoundingMode value, create a suitable x87
2031   native format FPU control word. */
2032ULong amd64g_create_fpucw ( ULong fpround )
2033{
2034   fpround &= 3;
2035   return 0x037F | (fpround << 10);
2036}
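
/* An illustrative sanity check, not part of the build: control words
   made by the two create_ functions above mask all exceptions, so
   pushing them back through the matching check_ function recovers the
   rounding mode with no emulation warning. */
#if 0
static void example_cw_round_trips ( void )
{
   ULong rm;
   for (rm = 0; rm < 4; rm++) {
      ULong pair1 = amd64g_check_fldcw( amd64g_create_fpucw(rm) );
      ULong pair2 = amd64g_check_ldmxcsr( amd64g_create_mxcsr(rm) );
      vassert((pair1 & 0xFFFFFFFFULL) == rm);       /* fpround preserved */
      vassert((pair1 >> 32) == (ULong)EmNote_NONE); /* no warning */
      vassert((pair2 & 0xFFFFFFFFULL) == rm);       /* sseround preserved */
      vassert((pair2 >> 32) == (ULong)EmNote_NONE);
   }
}
#endif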
2037
2038
2039/* This is used to implement 'fldenv'.
2040   Reads 28 bytes at x87_state[0 .. 27]. */
2041/* CALLED FROM GENERATED CODE */
2042/* DIRTY HELPER */
2043VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2044                                      /*IN*/HWord x87_state)
2045{
2046   return do_put_x87( False, (UChar*)x87_state, vex_state );
2047}
2048
2049
2050/* CALLED FROM GENERATED CODE */
2051/* DIRTY HELPER */
2052/* Create an x87 FPU env from the guest state, as close as we can
2053   approximate it.  Writes 28 bytes at x87_state[0..27]. */
2054void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2055                                 /*OUT*/HWord x87_state )
2056{
2057   Int        i, stno, preg;
2058   UInt       tagw;
2059   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2060   Fpu_State* x87     = (Fpu_State*)x87_state;
2061   UInt       ftop    = vex_state->guest_FTOP;
2062   ULong      c3210   = vex_state->guest_FC3210;
2063
2064   for (i = 0; i < 14; i++)
2065      x87->env[i] = 0;
2066
2067   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2068   x87->env[FP_ENV_STAT]
2069      = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2070   x87->env[FP_ENV_CTRL]
2071      = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2072
2073   /* Compute the x87 tag word. */
2074   tagw = 0;
2075   for (stno = 0; stno < 8; stno++) {
2076      preg = (stno + ftop) & 7;
2077      if (vexTags[preg] == 0) {
2078         /* register is empty */
2079         tagw |= (3 << (2*preg));
2080      } else {
2081         /* register is full. */
2082         tagw |= (0 << (2*preg));
2083      }
2084   }
2085   x87->env[FP_ENV_TAG] = toUShort(tagw);
2086
2087   /* We don't dump the x87 registers, tho. */
2088}
2089
2090
2091/* This is used to implement 'fnsave'.
2092   Writes 108 bytes at x87_state[0 .. 107]. */
2093/* CALLED FROM GENERATED CODE */
2094/* DIRTY HELPER */
2095void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2096                                 /*OUT*/HWord x87_state)
2097{
2098   do_get_x87( vex_state, (UChar*)x87_state );
2099}
2100
2101
2102/* This is used to implement 'fnsaves'.
2103   Writes 94 bytes at x87_state[0 .. 93]. */
2104/* CALLED FROM GENERATED CODE */
2105/* DIRTY HELPER */
2106void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2107                                  /*OUT*/HWord x87_state)
2108{
2109   Int           i, stno, preg;
2110   UInt          tagw;
2111   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2112   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2113   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2114   UInt          ftop    = vex_state->guest_FTOP;
2115   UInt          c3210   = vex_state->guest_FC3210;
2116
2117   for (i = 0; i < 7; i++)
2118      x87->env[i] = 0;
2119
2120   x87->env[FPS_ENV_STAT]
2121      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2122   x87->env[FPS_ENV_CTRL]
2123      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2124
2125   /* Dump the register stack in ST order. */
2126   tagw = 0;
2127   for (stno = 0; stno < 8; stno++) {
2128      preg = (stno + ftop) & 7;
2129      if (vexTags[preg] == 0) {
2130         /* register is empty */
2131         tagw |= (3 << (2*preg));
2132         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2133                                 &x87->reg[10*stno] );
2134      } else {
2135         /* register is full. */
2136         tagw |= (0 << (2*preg));
2137         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2138                                 &x87->reg[10*stno] );
2139      }
2140   }
2141   x87->env[FPS_ENV_TAG] = toUShort(tagw);
2142}
2143
2144
2145/* This is used to implement 'frstor'.
2146   Reads 108 bytes at x87_state[0 .. 107]. */
2147/* CALLED FROM GENERATED CODE */
2148/* DIRTY HELPER */
2149VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2150                                      /*IN*/HWord x87_state)
2151{
2152   return do_put_x87( True, (UChar*)x87_state, vex_state );
2153}
2154
2155
2156/* This is used to implement 'frstors'.
2157   Reads 94 bytes at x87_state[0 .. 93]. */
2158/* CALLED FROM GENERATED CODE */
2159/* DIRTY HELPER */
2160VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2161                                       /*IN*/HWord x87_state)
2162{
2163   Int           stno, preg;
2164   UInt          tag;
2165   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2166   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2167   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2168   UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2169   UInt          tagw    = x87->env[FPS_ENV_TAG];
2170   UInt          fpucw   = x87->env[FPS_ENV_CTRL];
2171   UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
2172   VexEmNote     ew;
2173   UInt          fpround;
2174   ULong         pair;
2175
2176   /* Copy registers and tags */
2177   for (stno = 0; stno < 8; stno++) {
2178      preg = (stno + ftop) & 7;
2179      tag = (tagw >> (2*preg)) & 3;
2180      if (tag == 3) {
2181         /* register is empty */
2182         /* hmm, if it's empty, does it still get written?  Probably
2183            safer to say it does.  If we don't, memcheck could get out
2184            of sync, in that it thinks all FP registers are defined by
2185            this helper, but in reality some have not been updated. */
2186         vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2187         vexTags[preg] = 0;
2188      } else {
2189         /* register is non-empty */
2190         convert_f80le_to_f64le( &x87->reg[10*stno],
2191                                 (UChar*)&vexRegs[preg] );
2192         vexTags[preg] = 1;
2193      }
2194   }
2195
2196   /* stack pointer */
2197   vex_state->guest_FTOP = ftop;
2198
2199   /* status word */
2200   vex_state->guest_FC3210 = c3210;
2201
2202   /* handle the control word, setting FPROUND and detecting any
2203      emulation warnings. */
2204   pair    = amd64g_check_fldcw ( (ULong)fpucw );
2205   fpround = (UInt)pair & 0xFFFFFFFFULL;
2206   ew      = (VexEmNote)(pair >> 32);
2207
2208   vex_state->guest_FPROUND = fpround & 3;
2209
2210   /* emulation warnings --> caller */
2211   return ew;
2212}
2213
2214
2215/*---------------------------------------------------------------*/
2216/*--- Misc integer helpers, including rotates and CPUID.      ---*/
2217/*---------------------------------------------------------------*/
2218
2219/* Claim to be the following CPU, which is probably representative of
2220   the lowliest (earliest) amd64 offerings.  It can do neither sse3
2221   nor cx16.
2222
2223   vendor_id       : AuthenticAMD
2224   cpu family      : 15
2225   model           : 5
2226   model name      : AMD Opteron (tm) Processor 848
2227   stepping        : 10
2228   cpu MHz         : 1797.682
2229   cache size      : 1024 KB
2230   fpu             : yes
2231   fpu_exception   : yes
2232   cpuid level     : 1
2233   wp              : yes
2234   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2235                     mtrr pge mca cmov pat pse36 clflush mmx fxsr
2236                     sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2237   bogomips        : 3600.62
2238   TLB size        : 1088 4K pages
2239   clflush size    : 64
2240   cache_alignment : 64
2241   address sizes   : 40 bits physical, 48 bits virtual
2242   power management: ts fid vid ttp
2243
2244   2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2245   we don't support them.  See #291568.  3dnow is 80000001.EDX.31
2246   and 3dnowext is 80000001.EDX.30.
2247*/
2248void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2249{
2250#  define SET_ABCD(_a,_b,_c,_d)                \
2251      do { st->guest_RAX = (ULong)(_a);        \
2252           st->guest_RBX = (ULong)(_b);        \
2253           st->guest_RCX = (ULong)(_c);        \
2254           st->guest_RDX = (ULong)(_d);        \
2255      } while (0)
2256
2257   switch (0xFFFFFFFF & st->guest_RAX) {
2258      case 0x00000000:
2259         SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2260         break;
2261      case 0x00000001:
2262         SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2263         break;
2264      case 0x80000000:
2265         SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2266         break;
2267      case 0x80000001:
2268         /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
2269            the original it-is-supported value that the h/w provides.
2270            See #291568. */
2271         SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2272                                                      0x21d3fbff);
2273         break;
2274      case 0x80000002:
2275         SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2276         break;
2277      case 0x80000003:
2278         SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2279         break;
2280      case 0x80000004:
2281         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2282         break;
2283      case 0x80000005:
2284         SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2285         break;
2286      case 0x80000006:
2287         SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2288         break;
2289      case 0x80000007:
2290         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2291         break;
2292      case 0x80000008:
2293         SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2294         break;
2295      default:
2296         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2297         break;
2298   }
2299#  undef SET_ABCD
2300}
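
/* Illustrative note, not part of the build: the leaf-0 constants are
   just the vendor string carved into little-endian 4-byte chunks,
   which guests reassemble in EBX, EDX, ECX order: */
#if 0
static void example_decode_vendor ( void )
{
   union { UInt u[3]; UChar c[12]; } v;
   v.u[0] = 0x68747541; /* EBX: "Auth" */
   v.u[1] = 0x69746e65; /* EDX: "enti" */
   v.u[2] = 0x444d4163; /* ECX: "cAMD" */
   /* on a little-endian host, v.c[0..11] now spells "AuthenticAMD" */
}
#endif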
2301
2302
2303/* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2304   capable.
2305
2306   vendor_id       : GenuineIntel
2307   cpu family      : 6
2308   model           : 15
2309   model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2310   stepping        : 6
2311   cpu MHz         : 2394.000
2312   cache size      : 4096 KB
2313   physical id     : 0
2314   siblings        : 2
2315   core id         : 0
2316   cpu cores       : 2
2317   fpu             : yes
2318   fpu_exception   : yes
2319   cpuid level     : 10
2320   wp              : yes
2321   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2322                     mtrr pge mca cmov pat pse36 clflush dts acpi
2323                     mmx fxsr sse sse2 ss ht tm syscall nx lm
2324                     constant_tsc pni monitor ds_cpl vmx est tm2
2325                     cx16 xtpr lahf_lm
2326   bogomips        : 4798.78
2327   clflush size    : 64
2328   cache_alignment : 64
2329   address sizes   : 36 bits physical, 48 bits virtual
2330   power management:
2331*/
2332void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2333{
2334#  define SET_ABCD(_a,_b,_c,_d)                \
2335      do { st->guest_RAX = (ULong)(_a);        \
2336           st->guest_RBX = (ULong)(_b);        \
2337           st->guest_RCX = (ULong)(_c);        \
2338           st->guest_RDX = (ULong)(_d);        \
2339      } while (0)
2340
2341   switch (0xFFFFFFFF & st->guest_RAX) {
2342      case 0x00000000:
2343         SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2344         break;
2345      case 0x00000001:
2346         SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2347         break;
2348      case 0x00000002:
2349         SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2350         break;
2351      case 0x00000003:
2352         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2353         break;
2354      case 0x00000004: {
2355         switch (0xFFFFFFFF & st->guest_RCX) {
2356            case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2357                                      0x0000003f, 0x00000001); break;
2358            case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2359                                      0x0000003f, 0x00000001); break;
2360            case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2361                                      0x00000fff, 0x00000001); break;
2362            default:         SET_ABCD(0x00000000, 0x00000000,
2363                                      0x00000000, 0x00000000); break;
2364         }
2365         break;
2366      }
2367      case 0x00000005:
2368         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2369         break;
2370      case 0x00000006:
2371         SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2372         break;
2373      case 0x00000007:
2374         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2375         break;
2376      case 0x00000008:
2377         SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2378         break;
2379      case 0x00000009:
2380         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2381         break;
2382      case 0x0000000a:
2383      unhandled_eax_value:
2384         SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2385         break;
2386      case 0x80000000:
2387         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2388         break;
2389      case 0x80000001:
2390         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2391         break;
2392      case 0x80000002:
2393         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2394         break;
2395      case 0x80000003:
2396         SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2397         break;
2398      case 0x80000004:
2399         SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2400         break;
2401      case 0x80000005:
2402         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2403         break;
2404      case 0x80000006:
2405         SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2406         break;
2407      case 0x80000007:
2408         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2409         break;
2410      case 0x80000008:
2411         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2412         break;
2413      default:
2414         goto unhandled_eax_value;
2415   }
2416#  undef SET_ABCD
2417}
2418
2419
2420/* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2421   capable.
2422
2423   vendor_id       : GenuineIntel
2424   cpu family      : 6
2425   model           : 37
2426   model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
2427   stepping        : 2
2428   cpu MHz         : 3334.000
2429   cache size      : 4096 KB
2430   physical id     : 0
2431   siblings        : 4
2432   core id         : 0
2433   cpu cores       : 2
2434   apicid          : 0
2435   initial apicid  : 0
2436   fpu             : yes
2437   fpu_exception   : yes
2438   cpuid level     : 11
2439   wp              : yes
2440   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2441                     mtrr pge mca cmov pat pse36 clflush dts acpi
2442                     mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2443                     lm constant_tsc arch_perfmon pebs bts rep_good
2444                     xtopology nonstop_tsc aperfmperf pni pclmulqdq
2445                     dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2446                     xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2447                     arat tpr_shadow vnmi flexpriority ept vpid
2448   bogomips        : 6957.57
2449   clflush size    : 64
2450   cache_alignment : 64
2451   address sizes   : 36 bits physical, 48 bits virtual
2452   power management:
2453*/
2454void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2455{
2456#  define SET_ABCD(_a,_b,_c,_d)                \
2457      do { st->guest_RAX = (ULong)(_a);        \
2458           st->guest_RBX = (ULong)(_b);        \
2459           st->guest_RCX = (ULong)(_c);        \
2460           st->guest_RDX = (ULong)(_d);        \
2461      } while (0)
2462
2463   UInt old_eax = (UInt)st->guest_RAX;
2464   UInt old_ecx = (UInt)st->guest_RCX;
2465
2466   switch (old_eax) {
2467      case 0x00000000:
2468         SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2469         break;
2470      case 0x00000001:
2471         SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
2472         break;
2473      case 0x00000002:
2474         SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2475         break;
2476      case 0x00000003:
2477         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2478         break;
2479      case 0x00000004:
2480         switch (old_ecx) {
2481            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2482                                      0x0000003f, 0x00000000); break;
2483            case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2484                                      0x0000007f, 0x00000000); break;
2485            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2486                                      0x000001ff, 0x00000000); break;
2487            case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2488                                      0x00000fff, 0x00000002); break;
2489            default:         SET_ABCD(0x00000000, 0x00000000,
2490                                      0x00000000, 0x00000000); break;
2491         }
2492         break;
2493      case 0x00000005:
2494         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2495         break;
2496      case 0x00000006:
2497         SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2498         break;
2499      case 0x00000007:
2500         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2501         break;
2502      case 0x00000008:
2503         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2504         break;
2505      case 0x00000009:
2506         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2507         break;
2508      case 0x0000000a:
2509         SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2510         break;
2511      case 0x0000000b:
2512         switch (old_ecx) {
2513            case 0x00000000:
2514               SET_ABCD(0x00000001, 0x00000002,
2515                        0x00000100, 0x00000000); break;
2516            case 0x00000001:
2517               SET_ABCD(0x00000004, 0x00000004,
2518                        0x00000201, 0x00000000); break;
2519            default:
2520               SET_ABCD(0x00000000, 0x00000000,
2521                        old_ecx,    0x00000000); break;
2522         }
2523         break;
2524      case 0x0000000c:
2525         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2526         break;
2527      case 0x0000000d:
2528         switch (old_ecx) {
2529            case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
2530                                      0x00000100, 0x00000000); break;
2531            case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
2532                                      0x00000201, 0x00000000); break;
2533            default:         SET_ABCD(0x00000000, 0x00000000,
2534                                      old_ecx,    0x00000000); break;
2535         }
2536         break;
2537      case 0x80000000:
2538         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2539         break;
2540      case 0x80000001:
2541         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2542         break;
2543      case 0x80000002:
2544         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2545         break;
2546      case 0x80000003:
2547         SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
2548         break;
2549      case 0x80000004:
2550         SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
2551         break;
2552      case 0x80000005:
2553         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2554         break;
2555      case 0x80000006:
2556         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2557         break;
2558      case 0x80000007:
2559         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2560         break;
2561      case 0x80000008:
2562         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2563         break;
2564      default:
2565         SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2566         break;
2567   }
2568#  undef SET_ABCD
2569}
2570
2571
2572/* Claim to be the following CPU (4 x ...), which is AVX and cx16
2573   capable.  Plus (kludge!) it "supports" HTM.
2574
2575   vendor_id       : GenuineIntel
2576   cpu family      : 6
2577   model           : 42
2578   model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
2579   stepping        : 7
2580   cpu MHz         : 1600.000
2581   cache size      : 6144 KB
2582   physical id     : 0
2583   siblings        : 4
2584   core id         : 3
2585   cpu cores       : 4
2586   apicid          : 6
2587   initial apicid  : 6
2588   fpu             : yes
2589   fpu_exception   : yes
2590   cpuid level     : 13
2591   wp              : yes
2592   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2593                     mtrr pge mca cmov pat pse36 clflush dts acpi
2594                     mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2595                     lm constant_tsc arch_perfmon pebs bts rep_good
2596                     nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
2597                     dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
2598                     xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
2599                     lahf_lm ida arat epb xsaveopt pln pts dts
2600                     tpr_shadow vnmi flexpriority ept vpid
2601
2602   bogomips        : 5768.94
2603   clflush size    : 64
2604   cache_alignment : 64
2605   address sizes   : 36 bits physical, 48 bits virtual
2606   power management:
2607*/
2608void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
2609{
2610#  define SET_ABCD(_a,_b,_c,_d)                \
2611      do { st->guest_RAX = (ULong)(_a);        \
2612           st->guest_RBX = (ULong)(_b);        \
2613           st->guest_RCX = (ULong)(_c);        \
2614           st->guest_RDX = (ULong)(_d);        \
2615      } while (0)
2616
2617   UInt old_eax = (UInt)st->guest_RAX;
2618   UInt old_ecx = (UInt)st->guest_RCX;
2619
2620   switch (old_eax) {
2621      case 0x00000000:
2622         SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
2623         break;
2624      case 0x00000001:
2625         SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
2626         break;
2627      case 0x00000002:
2628         SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
2629         break;
2630      case 0x00000003:
2631         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2632         break;
2633      case 0x00000004:
2634         switch (old_ecx) {
2635            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2636                                      0x0000003f, 0x00000000); break;
2637            case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
2638                                      0x0000003f, 0x00000000); break;
2639            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2640                                      0x000001ff, 0x00000000); break;
2641            case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
2642                                      0x00001fff, 0x00000006); break;
2643            default:         SET_ABCD(0x00000000, 0x00000000,
2644                                      0x00000000, 0x00000000); break;
2645         }
2646         break;
2647      case 0x00000005:
2648         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2649         break;
2650      case 0x00000006:
2651         SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
2652         break;
2653      case 0x00000007:
2654         SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
2655         break;
2656      case 0x00000008:
2657         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2658         break;
2659      case 0x00000009:
2660         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2661         break;
2662      case 0x0000000a:
2663         SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
2664         break;
2665      case 0x0000000b:
2666         switch (old_ecx) {
2667            case 0x00000000:
2668               SET_ABCD(0x00000001, 0x00000001,
2669                        0x00000100, 0x00000000); break;
2670            case 0x00000001:
2671               SET_ABCD(0x00000004, 0x00000004,
2672                        0x00000201, 0x00000000); break;
2673            default:
2674               SET_ABCD(0x00000000, 0x00000000,
2675                        old_ecx,    0x00000000); break;
2676         }
2677         break;
2678      case 0x0000000c:
2679         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2680         break;
2681      case 0x0000000d:
2682         switch (old_ecx) {
2683            case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
2684                                      0x00000340, 0x00000000); break;
2685            case 0x00000001: SET_ABCD(0x00000001, 0x00000000,
2686                                      0x00000000, 0x00000000); break;
2687            case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
2688                                      0x00000000, 0x00000000); break;
2689            default:         SET_ABCD(0x00000000, 0x00000000,
2690                                      0x00000000, 0x00000000); break;
2691         }
2692         break;
2693      case 0x0000000e:
2694         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2695         break;
2696      case 0x0000000f:
2697         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2698         break;
2699      case 0x80000000:
2700         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2701         break;
2702      case 0x80000001:
2703         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2704         break;
2705      case 0x80000002:
2706         SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
2707         break;
2708      case 0x80000003:
2709         SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
2710         break;
2711      case 0x80000004:
2712         SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
2713         break;
2714      case 0x80000005:
2715         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2716         break;
2717      case 0x80000006:
2718         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2719         break;
2720      case 0x80000007:
2721         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2722         break;
2723      case 0x80000008:
2724         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2725         break;
2726      default:
2727         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2728         break;
2729   }
2730#  undef SET_ABCD
2731}
2732
2733
2734ULong amd64g_calculate_RCR ( ULong arg,
2735                             ULong rot_amt,
2736                             ULong rflags_in,
2737                             Long  szIN )
2738{
2739   Bool  wantRflags = toBool(szIN < 0);
2740   ULong sz         = wantRflags ? (-szIN) : szIN;
2741   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2742   ULong cf=0, of=0, tempcf;
2743
2744   switch (sz) {
2745      case 8:
2746         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2747         of        = ((arg >> 63) ^ cf) & 1;
2748         while (tempCOUNT > 0) {
2749            tempcf = arg & 1;
2750            arg    = (arg >> 1) | (cf << 63);
2751            cf     = tempcf;
2752            tempCOUNT--;
2753         }
2754         break;
2755      case 4:
2756         while (tempCOUNT >= 33) tempCOUNT -= 33;
2757         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2758         of        = ((arg >> 31) ^ cf) & 1;
2759         while (tempCOUNT > 0) {
2760            tempcf = arg & 1;
2761            arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
2762            cf     = tempcf;
2763            tempCOUNT--;
2764         }
2765         break;
2766      case 2:
2767         while (tempCOUNT >= 17) tempCOUNT -= 17;
2768         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2769         of        = ((arg >> 15) ^ cf) & 1;
2770         while (tempCOUNT > 0) {
2771            tempcf = arg & 1;
2772            arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
2773            cf     = tempcf;
2774            tempCOUNT--;
2775         }
2776         break;
2777      case 1:
2778         while (tempCOUNT >= 9) tempCOUNT -= 9;
2779         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2780         of        = ((arg >> 7) ^ cf) & 1;
2781         while (tempCOUNT > 0) {
2782            tempcf = arg & 1;
2783            arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
2784            cf     = tempcf;
2785            tempCOUNT--;
2786         }
2787         break;
2788      default:
2789         vpanic("calculate_RCR(amd64g): invalid size");
2790   }
2791
2792   cf &= 1;
2793   of &= 1;
2794   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2795   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2796
2797   /* caller can ask to have back either the resulting flags or
2798      resulting value, but not both */
2799   return wantRflags ? rflags_in : arg;
2800}
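
/* An illustrative sketch, not part of the build, of the szIN sign
   convention: negate the size to get the flags back instead of the
   rotated value.  An 8-bit RCR of 0x01 by 1 with carry-in set yields
   0x80, and the bit rotated out becomes the new carry: */
#if 0
static void example_RCR_both_returns ( void )
{
   ULong flags_in = AMD64G_CC_MASK_C; /* CF = 1 */
   ULong val   = amd64g_calculate_RCR(0x01, 1, flags_in,  1); /* value  */
   ULong flags = amd64g_calculate_RCR(0x01, 1, flags_in, -1); /* rflags */
   vassert(val == 0x80);
   vassert((flags & AMD64G_CC_MASK_C) != 0); /* old bit 0 is now CF */
}
#endif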
2801
2802ULong amd64g_calculate_RCL ( ULong arg,
2803                             ULong rot_amt,
2804                             ULong rflags_in,
2805                             Long  szIN )
2806{
2807   Bool  wantRflags = toBool(szIN < 0);
2808   ULong sz         = wantRflags ? (-szIN) : szIN;
2809   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2810   ULong cf=0, of=0, tempcf;
2811
2812   switch (sz) {
2813      case 8:
2814         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2815         while (tempCOUNT > 0) {
2816            tempcf = (arg >> 63) & 1;
2817            arg    = (arg << 1) | (cf & 1);
2818            cf     = tempcf;
2819            tempCOUNT--;
2820         }
2821         of = ((arg >> 63) ^ cf) & 1;
2822         break;
2823      case 4:
2824         while (tempCOUNT >= 33) tempCOUNT -= 33;
2825         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2826         while (tempCOUNT > 0) {
2827            tempcf = (arg >> 31) & 1;
2828            arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
2829            cf     = tempcf;
2830            tempCOUNT--;
2831         }
2832         of = ((arg >> 31) ^ cf) & 1;
2833         break;
2834      case 2:
2835         while (tempCOUNT >= 17) tempCOUNT -= 17;
2836         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2837         while (tempCOUNT > 0) {
2838            tempcf = (arg >> 15) & 1;
2839            arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
2840            cf     = tempcf;
2841            tempCOUNT--;
2842         }
2843         of = ((arg >> 15) ^ cf) & 1;
2844         break;
2845      case 1:
2846         while (tempCOUNT >= 9) tempCOUNT -= 9;
2847         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2848         while (tempCOUNT > 0) {
2849            tempcf = (arg >> 7) & 1;
2850            arg    = 0xFFULL & ((arg << 1) | (cf & 1));
2851            cf     = tempcf;
2852            tempCOUNT--;
2853         }
2854         of = ((arg >> 7) ^ cf) & 1;
2855         break;
2856      default:
2857         vpanic("calculate_RCL(amd64g): invalid size");
2858   }
2859
2860   cf &= 1;
2861   of &= 1;
2862   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2863   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2864
2865   return wantRflags ? rflags_in : arg;
2866}
2867
2868/* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
2869 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
2870 */
2871ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
2872{
2873   ULong hi, lo, tmp, A[16];
2874
2875   A[0] = 0;            A[1] = a;
2876   A[2] = A[1] << 1;    A[3] = A[2] ^ a;
2877   A[4] = A[2] << 1;    A[5] = A[4] ^ a;
2878   A[6] = A[3] << 1;    A[7] = A[6] ^ a;
2879   A[8] = A[4] << 1;    A[9] = A[8] ^ a;
2880   A[10] = A[5] << 1;   A[11] = A[10] ^ a;
2881   A[12] = A[6] << 1;   A[13] = A[12] ^ a;
2882   A[14] = A[7] << 1;   A[15] = A[14] ^ a;
2883
2884   lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
2885   hi = lo >> 56;
2886   lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
2887   hi = (hi << 8) | (lo >> 56);
2888   lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
2889   hi = (hi << 8) | (lo >> 56);
2890   lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
2891   hi = (hi << 8) | (lo >> 56);
2892   lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
2893   hi = (hi << 8) | (lo >> 56);
2894   lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
2895   hi = (hi << 8) | (lo >> 56);
2896   lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
2897   hi = (hi << 8) | (lo >> 56);
2898   lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
2899
2900   ULong m0 = -1;
2901   m0 /= 255;
2902   tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
2903   tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
2904   tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
2905   tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
2906   tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
2907   tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
2908   tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
2909
2910   return which ? hi : lo;
2911}
2912
2913
2914/* CALLED FROM GENERATED CODE */
2915/* DIRTY HELPER (non-referentially-transparent) */
2916/* Horrible hack.  On non-amd64 platforms, return 1. */
2917ULong amd64g_dirtyhelper_RDTSC ( void )
2918{
2919#  if defined(__x86_64__)
2920   UInt  eax, edx;
2921   __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
2922   return (((ULong)edx) << 32) | ((ULong)eax);
2923#  else
2924   return 1ULL;
2925#  endif
2926}
2927
2928/* CALLED FROM GENERATED CODE */
2929/* DIRTY HELPER (non-referentially-transparent) */
2930/* Horrible hack.  On non-amd64 platforms, do nothing. */
2931/* This uses a different calling convention from _RDTSC just above,
2932   only because of the difficulty of returning 96 bits from a C
2933   function -- RDTSC returns just 64 bits and so, on amd64, is
2934   simple by comparison. */
2935void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
2936{
2937#  if defined(__x86_64__)
2938   UInt eax, ecx, edx;
2939   __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
2940   st->guest_RAX = (ULong)eax;
2941   st->guest_RCX = (ULong)ecx;
2942   st->guest_RDX = (ULong)edx;
2943#  else
2944   /* Do nothing. */
2945#  endif
2946}
2947
2948/* CALLED FROM GENERATED CODE */
2949/* DIRTY HELPER (non-referentially-transparent) */
2950/* Horrible hack.  On non-amd64 platforms, return 0. */
2951ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
2952{
2953#  if defined(__x86_64__)
2954   ULong r = 0;
2955   portno &= 0xFFFF;
2956   switch (sz) {
2957      case 4:
2958         __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
2959                              : "=a" (r) : "Nd" (portno));
2960         break;
2961      case 2:
2962         __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
2963                              : "=a" (r) : "Nd" (portno));
2964         break;
2965      case 1:
2966         __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
2967                              : "=a" (r) : "Nd" (portno));
2968         break;
2969      default:
2970         break; /* note: no 64-bit version of insn exists */
2971   }
2972   return r;
2973#  else
2974   return 0;
2975#  endif
2976}
2977
2978
2979/* CALLED FROM GENERATED CODE */
2980/* DIRTY HELPER (non-referentially-transparent) */
2981/* Horrible hack.  On non-amd64 platforms, do nothing. */
2982void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
2983{
2984#  if defined(__x86_64__)
2985   portno &= 0xFFFF;
2986   switch (sz) {
2987      case 4:
2988         __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
2989                              : : "a" (data), "Nd" (portno));
2990         break;
2991      case 2:
2992         __asm__ __volatile__("outw %w0, %w1"
2993                              : : "a" (data), "Nd" (portno));
2994         break;
2995      case 1:
2996         __asm__ __volatile__("outb %b0, %w1"
2997                              : : "a" (data), "Nd" (portno));
2998         break;
2999      default:
3000         break; /* note: no 64-bit version of insn exists */
3001   }
3002#  else
3003   /* do nothing */
3004#  endif
3005}
3006
3007/* CALLED FROM GENERATED CODE */
3008/* DIRTY HELPER (non-referentially-transparent) */
3009/* Horrible hack.  On non-amd64 platforms, just write zeroes. */
3010/* op = 0: call the native SGDT instruction.
3011   op = 1: call the native SIDT instruction.
3012*/
3013void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3014#  if defined(__x86_64__)
3015   switch (op) {
3016      case 0:
3017         __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3018         break;
3019      case 1:
3020         __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3021         break;
3022      default:
3023         vpanic("amd64g_dirtyhelper_SxDT");
3024   }
3025#  else
3026   /* Fake a result by zeroing the 10-byte pseudo-descriptor. */
3027   UChar* p = (UChar*)address;
3028   p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3029   p[6] = p[7] = p[8] = p[9] = 0;
3030#  endif
3031}
3032
3033/*---------------------------------------------------------------*/
3034/*--- Helpers for MMX/SSE/SSE2.                               ---*/
3035/*---------------------------------------------------------------*/
3036
3037static inline UChar abdU8 ( UChar xx, UChar yy ) {
3038   return toUChar(xx>yy ? xx-yy : yy-xx);
3039}
3040
3041static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
3042   return (((ULong)w1) << 32) | ((ULong)w0);
3043}
3044
3045static inline UShort sel16x4_3 ( ULong w64 ) {
3046   UInt hi32 = toUInt(w64 >> 32);
3047   return toUShort(hi32 >> 16);
3048}
3049static inline UShort sel16x4_2 ( ULong w64 ) {
3050   UInt hi32 = toUInt(w64 >> 32);
3051   return toUShort(hi32);
3052}
3053static inline UShort sel16x4_1 ( ULong w64 ) {
3054   UInt lo32 = toUInt(w64);
3055   return toUShort(lo32 >> 16);
3056}
3057static inline UShort sel16x4_0 ( ULong w64 ) {
3058   UInt lo32 = toUInt(w64);
3059   return toUShort(lo32);
3060}
3061
3062static inline UChar sel8x8_7 ( ULong w64 ) {
3063   UInt hi32 = toUInt(w64 >> 32);
3064   return toUChar(hi32 >> 24);
3065}
3066static inline UChar sel8x8_6 ( ULong w64 ) {
3067   UInt hi32 = toUInt(w64 >> 32);
3068   return toUChar(hi32 >> 16);
3069}
3070static inline UChar sel8x8_5 ( ULong w64 ) {
3071   UInt hi32 = toUInt(w64 >> 32);
3072   return toUChar(hi32 >> 8);
3073}
3074static inline UChar sel8x8_4 ( ULong w64 ) {
3075   UInt hi32 = toUInt(w64 >> 32);
3076   return toUChar(hi32 >> 0);
3077}
3078static inline UChar sel8x8_3 ( ULong w64 ) {
3079   UInt lo32 = toUInt(w64);
3080   return toUChar(lo32 >> 24);
3081}
3082static inline UChar sel8x8_2 ( ULong w64 ) {
3083   UInt lo32 = toUInt(w64);
3084   return toUChar(lo32 >> 16);
3085}
3086static inline UChar sel8x8_1 ( ULong w64 ) {
3087   UInt lo32 = toUInt(w64);
3088   return toUChar(lo32 >> 8);
3089}
3090static inline UChar sel8x8_0 ( ULong w64 ) {
3091   UInt lo32 = toUInt(w64);
3092   return toUChar(lo32 >> 0);
3093}
3094
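/* PMADDWD: multiply corresponding signed 16-bit lanes and add the
   adjacent pairs of the resulting 32-bit products. */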
3095/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3096ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
3097{
3098   return
3099      mk32x2(
3100         (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
3101            + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
3102         (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
3103            + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
3104      );
3105}
3106
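/* PSADBW: sum the absolute differences of the eight unsigned byte
   lanes into a 16-bit value, zero-extended to 64 bits. */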
3107/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3108ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3109{
3110   UInt t = 0;
3111   t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3112   t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3113   t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3114   t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3115   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3116   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3117   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3118   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3119   t &= 0xFFFF;
3120   return (ULong)t;
3121}
3122
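/* PHMINPOSUW: find the minimum of the eight unsigned 16-bit lanes.
   The result holds the minimum value in bits 15:0 and the index of
   the winning lane in bits 18:16. */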
3123/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3124ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3125{
3126   UShort t, min;
3127   UInt   idx;
3128   t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
3129   t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3130   t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3131   t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3132   t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3133   t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3134   t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3135   t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3136   return ((ULong)(idx << 16)) | ((ULong)min);
3137}
3138
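/* The SSE4.2 CRC32 instruction computes CRC-32C (Castagnoli).  The
   constant 0x82f63b78 is the bit-reversed Castagnoli polynomial
   0x11EDC6F41, as required by these right-shifting, LSB-first
   loops. */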
3139/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3140ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3141{
3142   UInt  i;
3143   ULong crc = (b & 0xFFULL) ^ crcIn;
3144   for (i = 0; i < 8; i++)
3145      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3146   return crc;
3147}
3148
3149/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3150ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3151{
3152   UInt  i;
3153   ULong crc = (w & 0xFFFFULL) ^ crcIn;
3154   for (i = 0; i < 16; i++)
3155      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3156   return crc;
3157}
3158
3159/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3160ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3161{
3162   UInt i;
3163   ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3164   for (i = 0; i < 32; i++)
3165      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3166   return crc;
3167}
3168
3169/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3170ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3171{
3172   ULong crc = amd64g_calc_crc32l(crcIn, q);
3173   return amd64g_calc_crc32l(crc, q >> 32);
3174}
3175
3176
3177/* .. helper for next fn .. */
3178static inline ULong sad_8x4 ( ULong xx, ULong yy )
3179{
3180   UInt t = 0;
3181   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3182   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3183   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3184   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3185   return (ULong)t;
3186}
3187
3188/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3189ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3190                            ULong dHi, ULong dLo,
3191                            ULong imm_and_return_control_bit )
3192{
3193   UInt imm8     = imm_and_return_control_bit & 7;
3194   Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
3195   UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3196   UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3197   /* For src we only need 32 bits, so get them into the
3198      lower half of a 64-bit word. */
3199   ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3200   /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3201      11 bytes.  If calculating the low part of the result, need bytes
3202      dstOffsL * 4 + (0 .. 6); if calculating the high part,
3203      dstOffsL * 4 + (4 .. 10). */
3204   ULong dst;
3205   /* dstOffL = 0, Lo  ->  0 .. 6
3206      dstOffL = 1, Lo  ->  4 .. 10
3207      dstOffL = 0, Hi  ->  4 .. 10
3208      dstOffL = 1, Hi  ->  8 .. 14
3209   */
3210   if (calcHi && dstOffsL) {
3211      /* 8 .. 14 */
3212      dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3213   }
3214   else if (!calcHi && !dstOffsL) {
3215      /* 0 .. 6 */
3216      dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3217   }
3218   else {
3219      /* 4 .. 10 */
3220      dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3221   }
3222   ULong r0  = sad_8x4( dst >>  0, src );
3223   ULong r1  = sad_8x4( dst >>  8, src );
3224   ULong r2  = sad_8x4( dst >> 16, src );
3225   ULong r3  = sad_8x4( dst >> 24, src );
3226   ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3227   return res;
3228}
3229
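/* PEXT (BMI2): gather the bits of src_masked selected by mask into
   the contiguous low-order bits of the result.  For example,
   mask == 0x0A and src_masked == 0x08 give 0x02. */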
3230/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3231ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
3232{
3233   ULong dst = 0;
3234   ULong src_bit;
3235   ULong dst_bit = 1;
3236   for (src_bit = 1; src_bit; src_bit <<= 1) {
3237      if (mask & src_bit) {
3238         if (src_masked & src_bit) dst |= dst_bit;
3239         dst_bit <<= 1;
3240      }
3241   }
3242   return dst;
3243}
3244
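/* PDEP (BMI2): scatter the contiguous low-order bits of src to the
   bit positions selected by mask.  It is the inverse of PEXT, in
   that pdep(pext(x, m), m) == (x & m). */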
3245/* CALLED FROM GENERATED CODE: CLEAN HELPER */
3246ULong amd64g_calculate_pdep ( ULong src, ULong mask )
3247{
3248   ULong dst = 0;
3249   ULong dst_bit;
3250   ULong src_bit = 1;
3251   for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
3252      if (mask & dst_bit) {
3253         if (src & src_bit) dst |= dst_bit;
3254         src_bit <<= 1;
3255      }
3256   }
3257   return dst;
3258}
3259
3260/*---------------------------------------------------------------*/
3261/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
3262/*---------------------------------------------------------------*/
3263
3264static UInt zmask_from_V128 ( V128* arg )
3265{
3266   UInt i, res = 0;
3267   for (i = 0; i < 16; i++) {
3268      res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
3269   }
3270   return res;
3271}
3272
3273static UInt zmask_from_V128_wide ( V128* arg )
3274{
3275   UInt i, res = 0;
3276   for (i = 0; i < 8; i++) {
3277      res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
3278   }
3279   return res;
3280}
3281
3282/* Helps with PCMP{I,E}STR{I,M}.
3283
3284   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (Not really dirty:
3285   it could be a clean helper, except that we can't pass two V128s
3286   by value to a clean helper, nor have one returned.)  Reads guest
3287   state, writes guest state for the xSTRM cases, makes no memory
3288   accesses, and is otherwise a pure function.
3289
3290   opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte, so
3291   the callee knows which I/E and I/M variant it is dealing with and
3292   what the specific operation is.  4th byte of opcode is in the range
3293   0x60 to 0x63:
3294       istri  66 0F 3A 63
3295       istrm  66 0F 3A 62
3296       estri  66 0F 3A 61
3297       estrm  66 0F 3A 60
3298
3299   gstOffL and gstOffR are the guest state offsets for the two XMM
3300   register inputs.  We never have to deal with the memory case since
3301   that is handled by pre-loading the relevant value into the fake
3302   XMM16 register.
3303
3304   For ESTRx variants, edxIN and eaxIN hold the values of those two
3305   registers.
3306
3307   In all cases, the bottom 16 bits of the result contain the new
3308   OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
3309   result hold the new %ecx value.  For xSTRM variants, the helper
3310   writes the result directly to the guest XMM0.
3311
3312   Declarable side effects: in all cases, reads guest state at
3313   [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
3314   guest_XMM0.
3315
3316   Is expected to be called only with opc4_and_imm combinations
3317   which the front end has actually validated, and will assert
3318   otherwise.
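
   As an example, PCMPISTRI xmm1, xmm2, $0x0C (4th opcode byte 0x63,
   imm8 0x0C) arrives here with opc4_and_imm == 0x630C.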
3319*/
3320ULong amd64g_dirtyhelper_PCMPxSTRx (
3321          VexGuestAMD64State* gst,
3322          HWord opc4_and_imm,
3323          HWord gstOffL, HWord gstOffR,
3324          HWord edxIN, HWord eaxIN
3325       )
3326{
3327   HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
3328   HWord imm8 = opc4_and_imm & 0xFF;
3329   HWord isISTRx = opc4 & 2;
3330   HWord isxSTRM = (opc4 & 1) ^ 1;
3331   vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
3332   HWord wide = (imm8 & 1);
3333
3334   // where the args are
3335   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3336   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3337
3338   /* Create the arg validity masks, either from the vectors
3339      themselves or from the supplied edx/eax values. */
3340   // Both the 8-bit and the 16-bit ("wide") element cases are
3341   // handled, selected by the bottom bit of imm8 via |wide|.
3342   UInt zmaskL, zmaskR;
3343
3344   // temp spot for the resulting flags and vector.
3345   V128 resV;
3346   UInt resOSZACP;
3347
3348   // for checking whether case was handled
3349   Bool ok = False;
3350
3351   if (wide) {
3352      if (isISTRx) {
3353         zmaskL = zmask_from_V128_wide(argL);
3354         zmaskR = zmask_from_V128_wide(argR);
3355      } else {
3356         Int tmp;
3357         tmp = edxIN & 0xFFFFFFFF;
3358         if (tmp < -8) tmp = -8;
3359         if (tmp > 8)  tmp = 8;
3360         if (tmp < 0)  tmp = -tmp;
3361         vassert(tmp >= 0 && tmp <= 8);
3362         zmaskL = (1 << tmp) & 0xFF;
3363         tmp = eaxIN & 0xFFFFFFFF;
3364         if (tmp < -8) tmp = -8;
3365         if (tmp > 8)  tmp = 8;
3366         if (tmp < 0)  tmp = -tmp;
3367         vassert(tmp >= 0 && tmp <= 8);
3368         zmaskR = (1 << tmp) & 0xFF;
3369      }
3370      // do the math
3371      ok = compute_PCMPxSTRx_wide (
3372              &resV, &resOSZACP, argL, argR,
3373              zmaskL, zmaskR, imm8, (Bool)isxSTRM
3374           );
3375   } else {
3376      if (isISTRx) {
3377         zmaskL = zmask_from_V128(argL);
3378         zmaskR = zmask_from_V128(argR);
3379      } else {
3380         Int tmp;
3381         tmp = edxIN & 0xFFFFFFFF;
3382         if (tmp < -16) tmp = -16;
3383         if (tmp > 16)  tmp = 16;
3384         if (tmp < 0)   tmp = -tmp;
3385         vassert(tmp >= 0 && tmp <= 16);
3386         zmaskL = (1 << tmp) & 0xFFFF;
3387         tmp = eaxIN & 0xFFFFFFFF;
3388         if (tmp < -16) tmp = -16;
3389         if (tmp > 16)  tmp = 16;
3390         if (tmp < 0)   tmp = -tmp;
3391         vassert(tmp >= 0 && tmp <= 16);
3392         zmaskR = (1 << tmp) & 0xFFFF;
3393      }
3394      // do the math
3395      ok = compute_PCMPxSTRx (
3396              &resV, &resOSZACP, argL, argR,
3397              zmaskL, zmaskR, imm8, (Bool)isxSTRM
3398           );
3399   }
3400
3401   // front end shouldn't pass us any imm8 variants we can't
3402   // handle.  Hence:
3403   vassert(ok);
3404
3405   // So, finally we need to get the results back to the caller.
3406   // In all cases, the new OSZACP value occupies the lowest 16
3407   // bits of the return value.
3408   if (isxSTRM) {
3409      gst->guest_YMM0[0] = resV.w32[0];
3410      gst->guest_YMM0[1] = resV.w32[1];
3411      gst->guest_YMM0[2] = resV.w32[2];
3412      gst->guest_YMM0[3] = resV.w32[3];
3413      return resOSZACP & 0x8D5;
3414   } else {
3415      UInt newECX = resV.w32[0] & 0xFFFF;
3416      return (newECX << 16) | (resOSZACP & 0x8D5);
3417   }
3418}
3419
3420/*---------------------------------------------------------------*/
3421/*--- AES primitives and helpers                              ---*/
3422/*---------------------------------------------------------------*/
3423/* a 16 x 16 matrix */
3424static const UChar sbox[256] = {                   // row nr
3425   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
3426   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
3427   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
3428   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
3429   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
3430   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
3431   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
3432   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
3433   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
3434   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
3435   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
3436   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
3437   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
3438   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
3439   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
3440   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
3441   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
3442   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
3443   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
3444   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
3445   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
3446   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
3447   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
3448   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
3449   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
3450   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
3451   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
3452   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
3453   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
3454   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
3455   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
3456   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
3457};
3458static void SubBytes (V128* v)
3459{
3460   V128 r;
3461   UInt i;
3462   for (i = 0; i < 16; i++)
3463      r.w8[i] = sbox[v->w8[i]];
3464   *v = r;
3465}
3466
3467/* a 16 x 16 matrix */
3468static const UChar invsbox[256] = {                // row nr
3469   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
3470   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
3471   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
3472   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
3473   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
3474   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
3475   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
3476   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
3477   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
3478   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
3479   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
3480   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
3481   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
3482   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
3483   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
3484   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
3485   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
3486   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
3487   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
3488   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
3489   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
3490   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
3491   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
3492   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
3493   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
3494   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
3495   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
3496   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
3497   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
3498   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
3499   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
3500   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
3501};
3502static void InvSubBytes (V128* v)
3503{
3504   V128 r;
3505   UInt i;
3506   for (i = 0; i < 16; i++)
3507      r.w8[i] = invsbox[v->w8[i]];
3508   *v = r;
3509}
3510
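/* Source indices for AES ShiftRows in xmm byte order: in the loop
   below, destination byte i is taken from source byte
   ShiftRows_op[15-i].  InvShiftRows_op below is indexed the same
   way. */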
3511static const UChar ShiftRows_op[16] =
3512   {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
3513static void ShiftRows (V128* v)
3514{
3515   V128 r;
3516   UInt i;
3517   for (i = 0; i < 16; i++)
3518      r.w8[i] = v->w8[ShiftRows_op[15-i]];
3519   *v = r;
3520}
3521
3522static const UChar InvShiftRows_op[16] =
3523   {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
3524static void InvShiftRows (V128* v)
3525{
3526   V128 r;
3527   UInt i;
3528   for (i = 0; i < 16; i++)
3529      r.w8[i] = v->w8[InvShiftRows_op[15-i]];
3530   *v = r;
3531}
3532
3533/* Multiplication of the finite field elements of AES.
3534   See "A Specification for The AES Algorithm Rijndael
3535        (by Joan Daemen & Vincent Rijmen)"
3536        Dr. Brian Gladman, v3.1, 3rd March 2001. */
3537/* N values such that (hex) xy = 0x03^N.
3538   0x00 has no such N; its entry is set to 0xff and is never used. */
3539/* a 16 x 16 matrix */
3540static const UChar Nxy[256] = {                    // row nr
3541   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
3542   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
3543   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
3544   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
3545   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
3546   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
3547   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
3548   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
3549   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
3550   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
3551   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
3552   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
3553   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
3554   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
3555   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
3556   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
3557   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
3558   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
3559   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
3560   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
3561   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
3562   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
3563   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
3564   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
3565   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
3566   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
3567   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
3568   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
3569   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
3570   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
3571   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
3572   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
3573};
3574
3575/* E values so that E = 0x03^xy. */
3576static const UChar Exy[256] = {                    // row nr
3577   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
3578   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
3579   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
3580   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
3581   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
3582   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
3583   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
3584   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
3585   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
3586   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
3587   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
3588   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
3589   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
3590   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
3591   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
3592   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
3593   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
3594   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
3595   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
3596   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
3597   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
3598   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
3599   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
3600   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
3601   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
3602   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
3603   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
3604   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
3605   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
3606   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
3607   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
3608   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
3609
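/* GF(2^8) multiply via the log/antilog tables above: for nonzero
   inputs, u1*u2 = Exy[(Nxy[u1] + Nxy[u2]) mod 255].  For example,
   ff_mul(0x02, 0x03) = Exy[0x19 + 0x01] = Exy[0x1a] = 0x06. */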
3610static inline UChar ff_mul(UChar u1, UChar u2)
3611{
3612   if ((u1 > 0) && (u2 > 0)) {
3613      UInt ui = Nxy[u1] + Nxy[u2];
3614      if (ui >= 255)
3615         ui = ui - 255;
3616      return Exy[ui];
3617   } else {
3618      return 0;
3619   }
3620}
3621
3622static void MixColumns (V128* v)
3623{
3624   V128 r;
3625   Int j;
3626#define P(x,row,col) (x)->w8[((row)*4+(col))]
3627   for (j = 0; j < 4; j++) {
3628      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
3629         ^ P(v,j,2) ^ P(v,j,3);
3630      P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
3631         ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
3632      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
3633         ^ ff_mul(0x03, P(v,j,3) );
3634      P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
3635         ^ ff_mul( 0x02, P(v,j,3) );
3636   }
3637   *v = r;
3638#undef P
3639}
3640
3641static void InvMixColumns (V128* v)
3642{
3643   V128 r;
3644   Int j;
3645#define P(x,row,col) (x)->w8[((row)*4+(col))]
3646   for (j = 0; j < 4; j++) {
3647      P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
3648         ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
3649      P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
3650         ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
3651      P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
3652         ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
3653      P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
3654         ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
3655   }
3656   *v = r;
3657#undef P
3659}
3660
3661/* For description, see definition in guest_amd64_defs.h */
3662void amd64g_dirtyhelper_AES (
3663          VexGuestAMD64State* gst,
3664          HWord opc4, HWord gstOffD,
3665          HWord gstOffL, HWord gstOffR
3666       )
3667{
3668   // where the args are
3669   V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
3670   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3671   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3672   V128  r;
3673
3674   switch (opc4) {
3675      case 0xDC: /* AESENC */
3676      case 0xDD: /* AESENCLAST */
3677         r = *argR;
3678         ShiftRows (&r);
3679         SubBytes  (&r);
3680         if (opc4 == 0xDC)
3681            MixColumns (&r);
3682         argD->w64[0] = r.w64[0] ^ argL->w64[0];
3683         argD->w64[1] = r.w64[1] ^ argL->w64[1];
3684         break;
3685
3686      case 0xDE: /* AESDEC */
3687      case 0xDF: /* AESDECLAST */
3688         r = *argR;
3689         InvShiftRows (&r);
3690         InvSubBytes (&r);
3691         if (opc4 == 0xDE)
3692            InvMixColumns (&r);
3693         argD->w64[0] = r.w64[0] ^ argL->w64[0];
3694         argD->w64[1] = r.w64[1] ^ argL->w64[1];
3695         break;
3696
3697      case 0xDB: /* AESIMC */
3698         *argD = *argL;
3699         InvMixColumns (argD);
3700         break;
3701      default: vassert(0);
3702   }
3703}
3704
3705static inline UInt RotWord (UInt   w32)
3706{
3707   return ((w32 >> 8) | (w32 << 24));
3708}
3709
3710static inline UInt SubWord (UInt   w32)
3711{
3712   UChar *w8;
3713   UChar *r8;
3714   UInt res;
3715   w8 = (UChar*) &w32;
3716   r8 = (UChar*) &res;
3717   r8[0] = sbox[w8[0]];
3718   r8[1] = sbox[w8[1]];
3719   r8[2] = sbox[w8[2]];
3720   r8[3] = sbox[w8[3]];
3721   return res;
3722}
3723
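/* AESKEYGENASSIST: with X1 = argL->w32[1] and X3 = argL->w32[3], the
   result, from the most significant dword downwards, is
   [ RotWord(SubWord(X3)) ^ imm8, SubWord(X3),
     RotWord(SubWord(X1)) ^ imm8, SubWord(X1) ]. */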
3724/* For description, see definition in guest_amd64_defs.h */
3725extern void amd64g_dirtyhelper_AESKEYGENASSIST (
3726          VexGuestAMD64State* gst,
3727          HWord imm8,
3728          HWord gstOffL, HWord gstOffR
3729       )
3730{
3731   // where the args are
3732   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3733   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3734
3735   argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
3736   argR->w32[2] = SubWord (argL->w32[3]);
3737   argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
3738   argR->w32[0] = SubWord (argL->w32[1]);
3739}
3740
3741
3742
3743/*---------------------------------------------------------------*/
3744/*--- Helpers for dealing with, and describing,               ---*/
3745/*--- guest state as a whole.                                 ---*/
3746/*---------------------------------------------------------------*/
3747
3748/* Initialise the entire amd64 guest state. */
3749/* VISIBLE TO LIBVEX CLIENT */
3750void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
3751{
3752   vex_state->host_EvC_FAILADDR = 0;
3753   vex_state->host_EvC_COUNTER = 0;
3754   vex_state->pad0 = 0;
3755
3756   vex_state->guest_RAX = 0;
3757   vex_state->guest_RCX = 0;
3758   vex_state->guest_RDX = 0;
3759   vex_state->guest_RBX = 0;
3760   vex_state->guest_RSP = 0;
3761   vex_state->guest_RBP = 0;
3762   vex_state->guest_RSI = 0;
3763   vex_state->guest_RDI = 0;
3764   vex_state->guest_R8  = 0;
3765   vex_state->guest_R9  = 0;
3766   vex_state->guest_R10 = 0;
3767   vex_state->guest_R11 = 0;
3768   vex_state->guest_R12 = 0;
3769   vex_state->guest_R13 = 0;
3770   vex_state->guest_R14 = 0;
3771   vex_state->guest_R15 = 0;
3772
3773   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
3774   vex_state->guest_CC_DEP1 = 0;
3775   vex_state->guest_CC_DEP2 = 0;
3776   vex_state->guest_CC_NDEP = 0;
3777
3778   vex_state->guest_DFLAG   = 1; /* forwards */
3779   vex_state->guest_IDFLAG  = 0;
3780   vex_state->guest_ACFLAG  = 0;
3781
3782   /* HACK: represent the offset associated with %fs==0. This
3783      assumes that %fs is only ever zero. */
3784   vex_state->guest_FS_ZERO = 0;
3785
3786   vex_state->guest_RIP = 0;
3787
3788   /* Initialise the simulated FPU */
3789   amd64g_dirtyhelper_FINIT( vex_state );
3790
3791   /* Initialise the AVX state. */
3792#  define AVXZERO(_ymm) \
3793      do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
3794           _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
3795      } while (0)
3796   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
3797   AVXZERO(vex_state->guest_YMM0);
3798   AVXZERO(vex_state->guest_YMM1);
3799   AVXZERO(vex_state->guest_YMM2);
3800   AVXZERO(vex_state->guest_YMM3);
3801   AVXZERO(vex_state->guest_YMM4);
3802   AVXZERO(vex_state->guest_YMM5);
3803   AVXZERO(vex_state->guest_YMM6);
3804   AVXZERO(vex_state->guest_YMM7);
3805   AVXZERO(vex_state->guest_YMM8);
3806   AVXZERO(vex_state->guest_YMM9);
3807   AVXZERO(vex_state->guest_YMM10);
3808   AVXZERO(vex_state->guest_YMM11);
3809   AVXZERO(vex_state->guest_YMM12);
3810   AVXZERO(vex_state->guest_YMM13);
3811   AVXZERO(vex_state->guest_YMM14);
3812   AVXZERO(vex_state->guest_YMM15);
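   /* YMM16 is not an architectural register; it is the scratch
      register into which memory operands of PCMPxSTRx are
      pre-loaded (see amd64g_dirtyhelper_PCMPxSTRx above). */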
3813   AVXZERO(vex_state->guest_YMM16);
3814
3815#  undef AVXZERO
3816
3817   vex_state->guest_EMNOTE = EmNote_NONE;
3818
3819   /* These should not ever be either read or written, but we
3820      initialise them anyway. */
3821   vex_state->guest_CMSTART = 0;
3822   vex_state->guest_CMLEN   = 0;
3823
3824   vex_state->guest_NRADDR   = 0;
3825   vex_state->guest_SC_CLASS = 0;
3826   vex_state->guest_GS_0x60  = 0;
3827
3828   vex_state->guest_IP_AT_SYSCALL = 0;
3829   vex_state->pad1 = 0;
3830}
3831
3832
3833/* Figure out if any part of the guest state contained in minoff
3834   .. maxoff requires precise memory exceptions.  If in doubt return
3835   True (but this generates significantly slower code).
3836
3837   By default we enforce precise exns for guest %RSP, %RBP and %RIP
3838   only.  These are the minimum needed to extract correct stack
3839   backtraces from amd64 code.
3840
3841   Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
3842*/
3843Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
3844                                                   Int maxoff)
3845{
3846   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
3847   Int rbp_max = rbp_min + 8 - 1;
3848   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
3849   Int rsp_max = rsp_min + 8 - 1;
3850   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
3851   Int rip_max = rip_min + 8 - 1;
3852
3853   if (maxoff < rsp_min || minoff > rsp_max) {
3854      /* no overlap with rsp */
3855      if (vex_control.iropt_register_updates == VexRegUpdSpAtMemAccess)
3856         return False; // We only need to check stack pointer.
3857   } else {
3858      return True;
3859   }
3860
3861   if (maxoff < rbp_min || minoff > rbp_max) {
3862      /* no overlap with rbp */
3863   } else {
3864      return True;
3865   }
3866
3867   if (maxoff < rip_min || minoff > rip_max) {
3868      /* no overlap with rip */
3869   } else {
3870      return True;
3871   }
3872
3873   return False;
3874}
3875
3876
3877#define ALWAYSDEFD(field)                             \
3878    { offsetof(VexGuestAMD64State, field),            \
3879      (sizeof ((VexGuestAMD64State*)0)->field) }
3880
3881VexGuestLayout
3882   amd64guest_layout
3883      = {
3884          /* Total size of the guest state, in bytes. */
3885          .total_sizeB = sizeof(VexGuestAMD64State),
3886
3887          /* Describe the stack pointer. */
3888          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
3889          .sizeof_SP = 8,
3890
3891          /* Describe the frame pointer. */
3892          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
3893          .sizeof_FP = 8,
3894
3895          /* Describe the instruction pointer. */
3896          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
3897          .sizeof_IP = 8,
3898
3899          /* Describe any sections to be regarded by Memcheck as
3900             'always-defined'. */
3901          .n_alwaysDefd = 16,
3902
3903          /* flags thunk: OP and NDEP are always defd, whereas DEP1
3904             and DEP2 have to be tracked.  See detailed comment in
3905             gdefs.h on meaning of thunk fields. */
3906          .alwaysDefd
3907             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
3908                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
3909                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
3910                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
3911                 /*  4 */ ALWAYSDEFD(guest_RIP),
3912                 /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
3913                 /*  6 */ ALWAYSDEFD(guest_FTOP),
3914                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
3915                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
3916                 /*  9 */ ALWAYSDEFD(guest_FC3210),
3917                 // /* */ ALWAYSDEFD(guest_CS),
3918                 // /* */ ALWAYSDEFD(guest_DS),
3919                 // /* */ ALWAYSDEFD(guest_ES),
3920                 // /* */ ALWAYSDEFD(guest_FS),
3921                 // /* */ ALWAYSDEFD(guest_GS),
3922                 // /* */ ALWAYSDEFD(guest_SS),
3923                 // /* */ ALWAYSDEFD(guest_LDT),
3924                 // /* */ ALWAYSDEFD(guest_GDT),
3925                 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
3926                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
3927                 /* 12 */ ALWAYSDEFD(guest_CMSTART),
3928                 /* 13 */ ALWAYSDEFD(guest_CMLEN),
3929                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
3930                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
3931               }
3932        };
3933
3934
3935/*---------------------------------------------------------------*/
3936/*--- end                               guest_amd64_helpers.c ---*/
3937/*---------------------------------------------------------------*/
3938