1
2/*--------------------------------------------------------------------*/
3/*--- Instrument IR to perform memory checking operations.         ---*/
4/*---                                               mc_translate.c ---*/
5/*--------------------------------------------------------------------*/
6
7/*
8   This file is part of MemCheck, a heavyweight Valgrind tool for
9   detecting memory errors.
10
11   Copyright (C) 2000-2013 Julian Seward
12      jseward@acm.org
13
14   This program is free software; you can redistribute it and/or
15   modify it under the terms of the GNU General Public License as
16   published by the Free Software Foundation; either version 2 of the
17   License, or (at your option) any later version.
18
19   This program is distributed in the hope that it will be useful, but
20   WITHOUT ANY WARRANTY; without even the implied warranty of
21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22   General Public License for more details.
23
24   You should have received a copy of the GNU General Public License
25   along with this program; if not, write to the Free Software
26   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27   02111-1307, USA.
28
29   The GNU General Public License is contained in the file COPYING.
30*/
31
32#include "pub_tool_basics.h"
33#include "pub_tool_poolalloc.h"     // For mc_include.h
34#include "pub_tool_hashtable.h"     // For mc_include.h
35#include "pub_tool_libcassert.h"
36#include "pub_tool_libcprint.h"
37#include "pub_tool_tooliface.h"
38#include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
39#include "pub_tool_xarray.h"
40#include "pub_tool_mallocfree.h"
41#include "pub_tool_libcbase.h"
42
43#include "mc_include.h"
44
45
46/* FIXMEs JRS 2011-June-16.
47
48   Check the interpretation for vector narrowing and widening ops,
49   particularly the saturating ones.  I suspect they are either overly
50   pessimistic and/or wrong.
51*/
52
53/* This file implements the Memcheck instrumentation, and in
54   particular contains the core of its undefined value detection
55   machinery.  For a comprehensive background of the terminology,
56   algorithms and rationale used herein, read:
57
58     Using Valgrind to detect undefined value errors with
59     bit-precision
60
61     Julian Seward and Nicholas Nethercote
62
63     2005 USENIX Annual Technical Conference (General Track),
64     Anaheim, CA, USA, April 10-15, 2005.
65
66   ----
67
68   Here is as good a place as any to record exactly when V bits are and
69   should be checked, why, and what function is responsible.
70
71
72   Memcheck complains when an undefined value is used:
73
74   1. In the condition of a conditional branch.  Because it could cause
75      incorrect control flow, and thus cause incorrect externally-visible
76      behaviour.  [mc_translate.c:complainIfUndefined]
77
78   2. As an argument to a system call, or as the value that specifies
79      the system call number.  Because it could cause an incorrect
80      externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]
81
82   3. As the address in a load or store.  Because it could cause an
83      incorrect value to be used later, which could cause externally-visible
84      behaviour (eg. via incorrect control flow or an incorrect system call
85      argument)  [complainIfUndefined]
86
87   4. As the target address of a branch.  Because it could cause incorrect
88      control flow.  [complainIfUndefined]
89
90   5. As an argument to setenv, unsetenv, or putenv.  Because it could put
91      an incorrect value into the external environment.
92      [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
93
94   6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
95      [complainIfUndefined]
96
97   7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
98      VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
99      requested it.  [in memcheck.h]
100
101
102   Memcheck also complains, but should not, when an undefined value is used:
103
104   8. As the shift value in certain SIMD shift operations (but not in the
105      standard integer shift operations).  This inconsistency is due to
106      historical reasons.)  [complainIfUndefined]
107
108
109   Memcheck does not complain, but should, when an undefined value is used:
110
111   9. As an input to a client request.  Because the client request may
112      affect the visible behaviour -- see bug #144362 for an example
113      involving the malloc replacements in vg_replace_malloc.c and
114      VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
115      isn't identified.  That bug report also has some info on how to solve
116      the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
117
118
119   In practice, 1 and 2 account for the vast majority of cases.
120*/
121
122/* Generation of addr-definedness, addr-validity and
123   guard-definedness checks pertaining to loads and stores (Iex_Load,
124   Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
125   loads/stores) was re-checked 11 May 2013. */
126
127/*------------------------------------------------------------*/
128/*--- Forward decls                                        ---*/
129/*------------------------------------------------------------*/
130
131struct _MCEnv;
132
133static IRType  shadowTypeV ( IRType ty );
134static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
135static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
136
137static IRExpr *i128_const_zero(void);
138
139/*------------------------------------------------------------*/
140/*--- Memcheck running state, and tmp management.          ---*/
141/*------------------------------------------------------------*/
142
143/* Carries info about a particular tmp.  The tmp's number is not
144   recorded, as this is implied by (equal to) its index in the tmpMap
145   in MCEnv.  The tmp's type is also not recorded, as this is present
146   in MCEnv.sb->tyenv.
147
148   When .kind is Orig, .shadowV and .shadowB may give the identities
149   of the temps currently holding the associated definedness (shadowV)
150   and origin (shadowB) values, or these may be IRTemp_INVALID if code
151   to compute such values has not yet been emitted.
152
153   When .kind is VSh or BSh then the tmp is holds a V- or B- value,
154   and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
155   illogical for a shadow tmp itself to be shadowed.
156*/
157typedef
158   enum { Orig=1, VSh=2, BSh=3 }
159   TempKind;
160
161typedef
162   struct {
163      TempKind kind;
164      IRTemp   shadowV;
165      IRTemp   shadowB;
166   }
167   TempMapEnt;
168
169
170/* Carries around state during memcheck instrumentation. */
171typedef
172   struct _MCEnv {
173      /* MODIFIED: the superblock being constructed.  IRStmts are
174         added. */
175      IRSB* sb;
176      Bool  trace;
177
178      /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
179         current kind and possibly shadow temps for each temp in the
180         IRSB being constructed.  Note that it does not contain the
181         type of each tmp.  If you want to know the type, look at the
182         relevant entry in sb->tyenv.  It follows that at all times
183         during the instrumentation process, the valid indices for
184         tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
185         total number of Orig, V- and B- temps allocated so far.
186
187         The reason for this strange split (types in one place, all
188         other info in another) is that we need the types to be
189         attached to sb so as to make it possible to do
190         "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
191         instrumentation process. */
192      XArray* /* of TempMapEnt */ tmpMap;
193
194      /* MODIFIED: indicates whether "bogus" literals have so far been
195         found.  Starts off False, and may change to True. */
196      Bool bogusLiterals;
197
198      /* READONLY: indicates whether we should use expensive
199         interpretations of integer adds, since unfortunately LLVM
200         uses them to do ORs in some circumstances.  Defaulted to True
201         on MacOS and False everywhere else. */
202      Bool useLLVMworkarounds;
203
204      /* READONLY: the guest layout.  This indicates which parts of
205         the guest state should be regarded as 'always defined'. */
206      VexGuestLayout* layout;
207
208      /* READONLY: the host word type.  Needed for constructing
209         arguments of type 'HWord' to be passed to helper functions.
210         Ity_I32 or Ity_I64 only. */
211      IRType hWordTy;
212   }
213   MCEnv;
214
215/* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
216   demand), as they are encountered.  This is for two reasons.
217
218   (1) (less important reason): Many original tmps are unused due to
219   initial IR optimisation, and we do not want to spaces in tables
220   tracking them.
221
222   Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
223   table indexed [0 .. n_types-1], which gives the current shadow for
224   each original tmp, or INVALID_IRTEMP if none is so far assigned.
225   It is necessary to support making multiple assignments to a shadow
226   -- specifically, after testing a shadow for definedness, it needs
227   to be made defined.  But IR's SSA property disallows this.
228
229   (2) (more important reason): Therefore, when a shadow needs to get
230   a new value, a new temporary is created, the value is assigned to
231   that, and the tmpMap is updated to reflect the new binding.
232
233   A corollary is that if the tmpMap maps a given tmp to
234   IRTemp_INVALID and we are hoping to read that shadow tmp, it means
235   there's a read-before-write error in the original tmps.  The IR
236   sanity checker should catch all such anomalies, however.
237*/
238
239/* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
240   both the table in mce->sb and to our auxiliary mapping.  Note that
241   newTemp may cause mce->tmpMap to resize, hence previous results
242   from VG_(indexXA)(mce->tmpMap) are invalidated. */
243static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
244{
245   Word       newIx;
246   TempMapEnt ent;
247   IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
248   ent.kind    = kind;
249   ent.shadowV = IRTemp_INVALID;
250   ent.shadowB = IRTemp_INVALID;
251   newIx = VG_(addToXA)( mce->tmpMap, &ent );
252   tl_assert(newIx == (Word)tmp);
253   return tmp;
254}
255
256
257/* Find the tmp currently shadowing the given original tmp.  If none
258   so far exists, allocate one.  */
259static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
260{
261   TempMapEnt* ent;
262   /* VG_(indexXA) range-checks 'orig', hence no need to check
263      here. */
264   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
265   tl_assert(ent->kind == Orig);
266   if (ent->shadowV == IRTemp_INVALID) {
267      IRTemp tmpV
268        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
269      /* newTemp may cause mce->tmpMap to resize, hence previous results
270         from VG_(indexXA) are invalid. */
271      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
272      tl_assert(ent->kind == Orig);
273      tl_assert(ent->shadowV == IRTemp_INVALID);
274      ent->shadowV = tmpV;
275   }
276   return ent->shadowV;
277}
278
279/* Allocate a new shadow for the given original tmp.  This means any
280   previous shadow is abandoned.  This is needed because it is
281   necessary to give a new value to a shadow once it has been tested
282   for undefinedness, but unfortunately IR's SSA property disallows
283   this.  Instead we must abandon the old shadow, allocate a new one
284   and use that instead.
285
286   This is the same as findShadowTmpV, except we don't bother to see
287   if a shadow temp already existed -- we simply allocate a new one
288   regardless. */
289static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
290{
291   TempMapEnt* ent;
292   /* VG_(indexXA) range-checks 'orig', hence no need to check
293      here. */
294   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
295   tl_assert(ent->kind == Orig);
296   if (1) {
297      IRTemp tmpV
298        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
299      /* newTemp may cause mce->tmpMap to resize, hence previous results
300         from VG_(indexXA) are invalid. */
301      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
302      tl_assert(ent->kind == Orig);
303      ent->shadowV = tmpV;
304   }
305}
306
307
308/*------------------------------------------------------------*/
309/*--- IRAtoms -- a subset of IRExprs                       ---*/
310/*------------------------------------------------------------*/
311
312/* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
313   isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
314   input, most of this code deals in atoms.  Usefully, a value atom
315   always has a V-value which is also an atom: constants are shadowed
316   by constants, and temps are shadowed by the corresponding shadow
317   temporary. */
318
319typedef  IRExpr  IRAtom;
320
321/* (used for sanity checks only): is this an atom which looks
322   like it's from original code? */
323static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
324{
325   if (a1->tag == Iex_Const)
326      return True;
327   if (a1->tag == Iex_RdTmp) {
328      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
329      return ent->kind == Orig;
330   }
331   return False;
332}
333
334/* (used for sanity checks only): is this an atom which looks
335   like it's from shadow code? */
336static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
337{
338   if (a1->tag == Iex_Const)
339      return True;
340   if (a1->tag == Iex_RdTmp) {
341      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
342      return ent->kind == VSh || ent->kind == BSh;
343   }
344   return False;
345}
346
347/* (used for sanity checks only): check that both args are atoms and
348   are identically-kinded. */
349static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
350{
351   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
352      return True;
353   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
354      return True;
355   return False;
356}
357
358
359/*------------------------------------------------------------*/
360/*--- Type management                                      ---*/
361/*------------------------------------------------------------*/
362
363/* Shadow state is always accessed using integer types.  This returns
364   an integer type with the same size (as per sizeofIRType) as the
365   given type.  The only valid shadow types are Bit, I8, I16, I32,
366   I64, I128, V128, V256. */
367
368static IRType shadowTypeV ( IRType ty )
369{
370   switch (ty) {
371      case Ity_I1:
372      case Ity_I8:
373      case Ity_I16:
374      case Ity_I32:
375      case Ity_I64:
376      case Ity_I128: return ty;
377      case Ity_F32:  return Ity_I32;
378      case Ity_D32:  return Ity_I32;
379      case Ity_F64:  return Ity_I64;
380      case Ity_D64:  return Ity_I64;
381      case Ity_F128: return Ity_I128;
382      case Ity_D128: return Ity_I128;
383      case Ity_V128: return Ity_V128;
384      case Ity_V256: return Ity_V256;
385      default: ppIRType(ty);
386               VG_(tool_panic)("memcheck:shadowTypeV");
387   }
388}
389
390/* Produce a 'defined' value of the given shadow type.  Should only be
391   supplied shadow types (Bit/I8/I16/I32/UI64). */
392static IRExpr* definedOfType ( IRType ty ) {
393   switch (ty) {
394      case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
395      case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
396      case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
397      case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
398      case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
399      case Ity_I128: return i128_const_zero();
400      case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
401      case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
402      default:       VG_(tool_panic)("memcheck:definedOfType");
403   }
404}
405
406
407/*------------------------------------------------------------*/
408/*--- Constructing IR fragments                            ---*/
409/*------------------------------------------------------------*/
410
411/* add stmt to a bb */
412static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
413   if (mce->trace) {
414      VG_(printf)("  %c: ", cat);
415      ppIRStmt(st);
416      VG_(printf)("\n");
417   }
418   addStmtToIRSB(mce->sb, st);
419}
420
421/* assign value to tmp */
422static inline
423void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
424   stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
425}
426
427/* build various kinds of expressions */
428#define triop(_op, _arg1, _arg2, _arg3) \
429                                 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
430#define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
431#define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
432#define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
433#define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
434#define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
435#define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
436#define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
437#define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
438#define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
439
440/* Bind the given expression to a new temporary, and return the
441   temporary.  This effectively converts an arbitrary expression into
442   an atom.
443
444   'ty' is the type of 'e' and hence the type that the new temporary
445   needs to be.  But passing it in is redundant, since we can deduce
446   the type merely by inspecting 'e'.  So at least use that fact to
447   assert that the two types agree. */
448static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
449{
450   TempKind k;
451   IRTemp   t;
452   IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
453
454   tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
455   switch (cat) {
456      case 'V': k = VSh;  break;
457      case 'B': k = BSh;  break;
458      case 'C': k = Orig; break;
459                /* happens when we are making up new "orig"
460                   expressions, for IRCAS handling */
461      default: tl_assert(0);
462   }
463   t = newTemp(mce, ty, k);
464   assign(cat, mce, t, e);
465   return mkexpr(t);
466}
467
468
469/*------------------------------------------------------------*/
470/*--- Helper functions for 128-bit ops                     ---*/
471/*------------------------------------------------------------*/
472
473static IRExpr *i128_const_zero(void)
474{
475   IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
476   return binop(Iop_64HLto128, z64, z64);
477}
478
479/* There are no I128-bit loads and/or stores [as generated by any
480   current front ends].  So we do not need to worry about that in
481   expr2vbits_Load */
482
483
484/*------------------------------------------------------------*/
485/*--- Constructing definedness primitive ops               ---*/
486/*------------------------------------------------------------*/
487
488/* --------- Defined-if-either-defined --------- */
489
490static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
491   tl_assert(isShadowAtom(mce,a1));
492   tl_assert(isShadowAtom(mce,a2));
493   return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
494}
495
496static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
497   tl_assert(isShadowAtom(mce,a1));
498   tl_assert(isShadowAtom(mce,a2));
499   return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
500}
501
502static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
503   tl_assert(isShadowAtom(mce,a1));
504   tl_assert(isShadowAtom(mce,a2));
505   return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
506}
507
508static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
509   tl_assert(isShadowAtom(mce,a1));
510   tl_assert(isShadowAtom(mce,a2));
511   return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
512}
513
514static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
515   tl_assert(isShadowAtom(mce,a1));
516   tl_assert(isShadowAtom(mce,a2));
517   return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
518}
519
520static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
521   tl_assert(isShadowAtom(mce,a1));
522   tl_assert(isShadowAtom(mce,a2));
523   return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
524}
525
526/* --------- Undefined-if-either-undefined --------- */
527
528static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
529   tl_assert(isShadowAtom(mce,a1));
530   tl_assert(isShadowAtom(mce,a2));
531   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
532}
533
534static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
535   tl_assert(isShadowAtom(mce,a1));
536   tl_assert(isShadowAtom(mce,a2));
537   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
538}
539
540static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
541   tl_assert(isShadowAtom(mce,a1));
542   tl_assert(isShadowAtom(mce,a2));
543   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
544}
545
546static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
547   tl_assert(isShadowAtom(mce,a1));
548   tl_assert(isShadowAtom(mce,a2));
549   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
550}
551
552static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
553   IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
554   tl_assert(isShadowAtom(mce,a1));
555   tl_assert(isShadowAtom(mce,a2));
556   tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
557   tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
558   tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
559   tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
560   tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
561   tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
562
563   return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
564}
565
566static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
567   tl_assert(isShadowAtom(mce,a1));
568   tl_assert(isShadowAtom(mce,a2));
569   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
570}
571
572static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
573   tl_assert(isShadowAtom(mce,a1));
574   tl_assert(isShadowAtom(mce,a2));
575   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
576}
577
578static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
579   switch (vty) {
580      case Ity_I8:   return mkUifU8(mce, a1, a2);
581      case Ity_I16:  return mkUifU16(mce, a1, a2);
582      case Ity_I32:  return mkUifU32(mce, a1, a2);
583      case Ity_I64:  return mkUifU64(mce, a1, a2);
584      case Ity_I128: return mkUifU128(mce, a1, a2);
585      case Ity_V128: return mkUifUV128(mce, a1, a2);
586      case Ity_V256: return mkUifUV256(mce, a1, a2);
587      default:
588         VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
589         VG_(tool_panic)("memcheck:mkUifU");
590   }
591}
592
593/* --------- The Left-family of operations. --------- */
594
595static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
596   tl_assert(isShadowAtom(mce,a1));
597   return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
598}
599
600static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
601   tl_assert(isShadowAtom(mce,a1));
602   return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
603}
604
605static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
606   tl_assert(isShadowAtom(mce,a1));
607   return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
608}
609
610static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
611   tl_assert(isShadowAtom(mce,a1));
612   return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
613}
614
615/* --------- 'Improvement' functions for AND/OR. --------- */
616
617/* ImproveAND(data, vbits) = data OR vbits.  Defined (0) data 0s give
618   defined (0); all other -> undefined (1).
619*/
620static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
621{
622   tl_assert(isOriginalAtom(mce, data));
623   tl_assert(isShadowAtom(mce, vbits));
624   tl_assert(sameKindedAtoms(data, vbits));
625   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
626}
627
628static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
629{
630   tl_assert(isOriginalAtom(mce, data));
631   tl_assert(isShadowAtom(mce, vbits));
632   tl_assert(sameKindedAtoms(data, vbits));
633   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
634}
635
636static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
637{
638   tl_assert(isOriginalAtom(mce, data));
639   tl_assert(isShadowAtom(mce, vbits));
640   tl_assert(sameKindedAtoms(data, vbits));
641   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
642}
643
644static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
645{
646   tl_assert(isOriginalAtom(mce, data));
647   tl_assert(isShadowAtom(mce, vbits));
648   tl_assert(sameKindedAtoms(data, vbits));
649   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
650}
651
652static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
653{
654   tl_assert(isOriginalAtom(mce, data));
655   tl_assert(isShadowAtom(mce, vbits));
656   tl_assert(sameKindedAtoms(data, vbits));
657   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
658}
659
660static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
661{
662   tl_assert(isOriginalAtom(mce, data));
663   tl_assert(isShadowAtom(mce, vbits));
664   tl_assert(sameKindedAtoms(data, vbits));
665   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
666}
667
668/* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
669   defined (0); all other -> undefined (1).
670*/
671static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
672{
673   tl_assert(isOriginalAtom(mce, data));
674   tl_assert(isShadowAtom(mce, vbits));
675   tl_assert(sameKindedAtoms(data, vbits));
676   return assignNew(
677             'V', mce, Ity_I8,
678             binop(Iop_Or8,
679                   assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
680                   vbits) );
681}
682
683static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
684{
685   tl_assert(isOriginalAtom(mce, data));
686   tl_assert(isShadowAtom(mce, vbits));
687   tl_assert(sameKindedAtoms(data, vbits));
688   return assignNew(
689             'V', mce, Ity_I16,
690             binop(Iop_Or16,
691                   assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
692                   vbits) );
693}
694
695static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
696{
697   tl_assert(isOriginalAtom(mce, data));
698   tl_assert(isShadowAtom(mce, vbits));
699   tl_assert(sameKindedAtoms(data, vbits));
700   return assignNew(
701             'V', mce, Ity_I32,
702             binop(Iop_Or32,
703                   assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
704                   vbits) );
705}
706
707static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
708{
709   tl_assert(isOriginalAtom(mce, data));
710   tl_assert(isShadowAtom(mce, vbits));
711   tl_assert(sameKindedAtoms(data, vbits));
712   return assignNew(
713             'V', mce, Ity_I64,
714             binop(Iop_Or64,
715                   assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
716                   vbits) );
717}
718
719static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
720{
721   tl_assert(isOriginalAtom(mce, data));
722   tl_assert(isShadowAtom(mce, vbits));
723   tl_assert(sameKindedAtoms(data, vbits));
724   return assignNew(
725             'V', mce, Ity_V128,
726             binop(Iop_OrV128,
727                   assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
728                   vbits) );
729}
730
731static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
732{
733   tl_assert(isOriginalAtom(mce, data));
734   tl_assert(isShadowAtom(mce, vbits));
735   tl_assert(sameKindedAtoms(data, vbits));
736   return assignNew(
737             'V', mce, Ity_V256,
738             binop(Iop_OrV256,
739                   assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
740                   vbits) );
741}
742
743/* --------- Pessimising casts. --------- */
744
745/* The function returns an expression of type DST_TY. If any of the VBITS
746   is undefined (value == 1) the resulting expression has all bits set to
747   1. Otherwise, all bits are 0. */
748
749static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
750{
751   IRType  src_ty;
752   IRAtom* tmp1;
753
754   /* Note, dst_ty is a shadow type, not an original type. */
755   tl_assert(isShadowAtom(mce,vbits));
756   src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
757
758   /* Fast-track some common cases */
759   if (src_ty == Ity_I32 && dst_ty == Ity_I32)
760      return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
761
762   if (src_ty == Ity_I64 && dst_ty == Ity_I64)
763      return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
764
765   if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
766      /* PCast the arg, then clone it. */
767      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
768      return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
769   }
770
771   if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
772      /* PCast the arg, then clone it 4 times. */
773      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
774      tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
775      return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
776   }
777
778   if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
779      /* PCast the arg, then clone it 8 times. */
780      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
781      tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
782      tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
783      return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
784   }
785
786   if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
787      /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
788         the top half. */
789      IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
790      return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
791   }
792
793   /* Else do it the slow way .. */
794   /* First of all, collapse vbits down to a single bit. */
795   tmp1   = NULL;
796   switch (src_ty) {
797      case Ity_I1:
798         tmp1 = vbits;
799         break;
800      case Ity_I8:
801         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
802         break;
803      case Ity_I16:
804         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
805         break;
806      case Ity_I32:
807         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
808         break;
809      case Ity_I64:
810         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
811         break;
812      case Ity_I128: {
813         /* Gah.  Chop it in half, OR the halves together, and compare
814            that with zero. */
815         IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
816         IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
817         IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
818         tmp1         = assignNew('V', mce, Ity_I1,
819                                       unop(Iop_CmpNEZ64, tmp4));
820         break;
821      }
822      default:
823         ppIRType(src_ty);
824         VG_(tool_panic)("mkPCastTo(1)");
825   }
826   tl_assert(tmp1);
827   /* Now widen up to the dst type. */
828   switch (dst_ty) {
829      case Ity_I1:
830         return tmp1;
831      case Ity_I8:
832         return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
833      case Ity_I16:
834         return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
835      case Ity_I32:
836         return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
837      case Ity_I64:
838         return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
839      case Ity_V128:
840         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
841         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
842         return tmp1;
843      case Ity_I128:
844         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
845         tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
846         return tmp1;
847      case Ity_V256:
848         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
849         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
850                                                    tmp1, tmp1));
851         tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
852                                                    tmp1, tmp1));
853         return tmp1;
854      default:
855         ppIRType(dst_ty);
856         VG_(tool_panic)("mkPCastTo(2)");
857   }
858}
859
860/* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
861/*
862   Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
863   PCasting to Ity_U1.  However, sometimes it is necessary to be more
864   accurate.  The insight is that the result is defined if two
865   corresponding bits can be found, one from each argument, so that
866   both bits are defined but are different -- that makes EQ say "No"
867   and NE say "Yes".  Hence, we compute an improvement term and DifD
868   it onto the "normal" (UifU) result.
869
870   The result is:
871
872   PCastTo<1> (
873      -- naive version
874      PCastTo<sz>( UifU<sz>(vxx, vyy) )
875
876      `DifD<sz>`
877
878      -- improvement term
879      PCastTo<sz>( PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) ) )
880   )
881
882   where
883     vec contains 0 (defined) bits where the corresponding arg bits
884     are defined but different, and 1 bits otherwise.
885
886     vec = Or<sz>( vxx,   // 0 iff bit defined
887                   vyy,   // 0 iff bit defined
888                   Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
889                 )
890
891     If any bit of vec is 0, the result is defined and so the
892     improvement term should produce 0...0, else it should produce
893     1...1.
894
895     Hence require for the improvement term:
896
897        if vec == 1...1 then 1...1 else 0...0
898     ->
899        PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) )
900
901   This was extensively re-analysed and checked on 6 July 05.
902*/
903static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
904                                    IRType  ty,
905                                    IRAtom* vxx, IRAtom* vyy,
906                                    IRAtom* xx,  IRAtom* yy )
907{
908   IRAtom *naive, *vec, *improvement_term;
909   IRAtom *improved, *final_cast, *top;
910   IROp   opDIFD, opUIFU, opXOR, opNOT, opCMP, opOR;
911
912   tl_assert(isShadowAtom(mce,vxx));
913   tl_assert(isShadowAtom(mce,vyy));
914   tl_assert(isOriginalAtom(mce,xx));
915   tl_assert(isOriginalAtom(mce,yy));
916   tl_assert(sameKindedAtoms(vxx,xx));
917   tl_assert(sameKindedAtoms(vyy,yy));
918
919   switch (ty) {
920      case Ity_I16:
921         opOR   = Iop_Or16;
922         opDIFD = Iop_And16;
923         opUIFU = Iop_Or16;
924         opNOT  = Iop_Not16;
925         opXOR  = Iop_Xor16;
926         opCMP  = Iop_CmpEQ16;
927         top    = mkU16(0xFFFF);
928         break;
929      case Ity_I32:
930         opOR   = Iop_Or32;
931         opDIFD = Iop_And32;
932         opUIFU = Iop_Or32;
933         opNOT  = Iop_Not32;
934         opXOR  = Iop_Xor32;
935         opCMP  = Iop_CmpEQ32;
936         top    = mkU32(0xFFFFFFFF);
937         break;
938      case Ity_I64:
939         opOR   = Iop_Or64;
940         opDIFD = Iop_And64;
941         opUIFU = Iop_Or64;
942         opNOT  = Iop_Not64;
943         opXOR  = Iop_Xor64;
944         opCMP  = Iop_CmpEQ64;
945         top    = mkU64(0xFFFFFFFFFFFFFFFFULL);
946         break;
947      default:
948         VG_(tool_panic)("expensiveCmpEQorNE");
949   }
950
951   naive
952      = mkPCastTo(mce,ty,
953                  assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
954
955   vec
956      = assignNew(
957           'V', mce,ty,
958           binop( opOR,
959                  assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
960                  assignNew(
961                     'V', mce,ty,
962                     unop( opNOT,
963                           assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
964
965   improvement_term
966      = mkPCastTo( mce,ty,
967                   assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
968
969   improved
970      = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
971
972   final_cast
973      = mkPCastTo( mce, Ity_I1, improved );
974
975   return final_cast;
976}
977
978
979/* --------- Semi-accurate interpretation of CmpORD. --------- */
980
981/* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
982
983      CmpORD32S(x,y) = 1<<3   if  x <s y
984                     = 1<<2   if  x >s y
985                     = 1<<1   if  x == y
986
987   and similarly the unsigned variant.  The default interpretation is:
988
989      CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
990                                  & (7<<1)
991
992   The "& (7<<1)" reflects the fact that all result bits except 3,2,1
993   are zero and therefore defined (viz, zero).
994
995   Also deal with a special case better:
996
997      CmpORD32S(x,0)
998
999   Here, bit 3 (LT) of the result is a copy of the top bit of x and
1000   will be defined even if the rest of x isn't.  In which case we do:
1001
1002      CmpORD32S#(x,x#,0,{impliedly 0}#)
1003         = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
1004           | (x# >>u 31) << 3      -- LT# = x#[31]
1005
1006   Analogous handling for CmpORD64{S,U}.
1007*/
1008static Bool isZeroU32 ( IRAtom* e )
1009{
1010   return
1011      toBool( e->tag == Iex_Const
1012              && e->Iex.Const.con->tag == Ico_U32
1013              && e->Iex.Const.con->Ico.U32 == 0 );
1014}
1015
1016static Bool isZeroU64 ( IRAtom* e )
1017{
1018   return
1019      toBool( e->tag == Iex_Const
1020              && e->Iex.Const.con->tag == Ico_U64
1021              && e->Iex.Const.con->Ico.U64 == 0 );
1022}
1023
1024static IRAtom* doCmpORD ( MCEnv*  mce,
1025                          IROp    cmp_op,
1026                          IRAtom* xxhash, IRAtom* yyhash,
1027                          IRAtom* xx,     IRAtom* yy )
1028{
1029   Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1030   Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1031   IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
1032   IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
1033   IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
1034   IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
1035   IRType ty     = m64 ? Ity_I64   : Ity_I32;
1036   Int    width  = m64 ? 64        : 32;
1037
1038   Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1039
1040   IRAtom* threeLeft1 = NULL;
1041   IRAtom* sevenLeft1 = NULL;
1042
1043   tl_assert(isShadowAtom(mce,xxhash));
1044   tl_assert(isShadowAtom(mce,yyhash));
1045   tl_assert(isOriginalAtom(mce,xx));
1046   tl_assert(isOriginalAtom(mce,yy));
1047   tl_assert(sameKindedAtoms(xxhash,xx));
1048   tl_assert(sameKindedAtoms(yyhash,yy));
1049   tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1050             || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1051
1052   if (0) {
1053      ppIROp(cmp_op); VG_(printf)(" ");
1054      ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1055   }
1056
1057   if (syned && isZero(yy)) {
1058      /* fancy interpretation */
1059      /* if yy is zero, then it must be fully defined (zero#). */
1060      tl_assert(isZero(yyhash));
1061      threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
1062      return
1063         binop(
1064            opOR,
1065            assignNew(
1066               'V', mce,ty,
1067               binop(
1068                  opAND,
1069                  mkPCastTo(mce,ty, xxhash),
1070                  threeLeft1
1071               )),
1072            assignNew(
1073               'V', mce,ty,
1074               binop(
1075                  opSHL,
1076                  assignNew(
1077                     'V', mce,ty,
1078                     binop(opSHR, xxhash, mkU8(width-1))),
1079                  mkU8(3)
1080               ))
1081	 );
1082   } else {
1083      /* standard interpretation */
1084      sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1085      return
1086         binop(
1087            opAND,
1088            mkPCastTo( mce,ty,
1089                       mkUifU(mce,ty, xxhash,yyhash)),
1090            sevenLeft1
1091         );
1092   }
1093}
1094
1095
1096/*------------------------------------------------------------*/
1097/*--- Emit a test and complaint if something is undefined. ---*/
1098/*------------------------------------------------------------*/
1099
1100static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1101
1102
1103/* Set the annotations on a dirty helper to indicate that the stack
1104   pointer and instruction pointers might be read.  This is the
1105   behaviour of all 'emit-a-complaint' style functions we might
1106   call. */
1107
1108static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1109   di->nFxState = 2;
1110   di->fxState[0].fx        = Ifx_Read;
1111   di->fxState[0].offset    = mce->layout->offset_SP;
1112   di->fxState[0].size      = mce->layout->sizeof_SP;
1113   di->fxState[0].nRepeats  = 0;
1114   di->fxState[0].repeatLen = 0;
1115   di->fxState[1].fx        = Ifx_Read;
1116   di->fxState[1].offset    = mce->layout->offset_IP;
1117   di->fxState[1].size      = mce->layout->sizeof_IP;
1118   di->fxState[1].nRepeats  = 0;
1119   di->fxState[1].repeatLen = 0;
1120}
1121
1122
1123/* Check the supplied *original* |atom| for undefinedness, and emit a
1124   complaint if so.  Once that happens, mark it as defined.  This is
1125   possible because the atom is either a tmp or literal.  If it's a
1126   tmp, it will be shadowed by a tmp, and so we can set the shadow to
1127   be defined.  In fact as mentioned above, we will have to allocate a
1128   new tmp to carry the new 'defined' shadow value, and update the
1129   original->tmp mapping accordingly; we cannot simply assign a new
1130   value to an existing shadow tmp as this breaks SSAness.
1131
1132   The checks are performed, any resulting complaint emitted, and
1133   |atom|'s shadow temp set to 'defined', ONLY in the case that
1134   |guard| evaluates to True at run-time.  If it evaluates to False
1135   then no action is performed.  If |guard| is NULL (the usual case)
1136   then it is assumed to be always-true, and hence these actions are
1137   performed unconditionally.
1138
1139   This routine does not generate code to check the definedness of
1140   |guard|.  The caller is assumed to have taken care of that already.
1141*/
1142static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1143{
1144   IRAtom*  vatom;
1145   IRType   ty;
1146   Int      sz;
1147   IRDirty* di;
1148   IRAtom*  cond;
1149   IRAtom*  origin;
1150   void*    fn;
1151   const HChar* nm;
1152   IRExpr** args;
1153   Int      nargs;
1154
1155   // Don't do V bit tests if we're not reporting undefined value errors.
1156   if (MC_(clo_mc_level) == 1)
1157      return;
1158
1159   if (guard)
1160      tl_assert(isOriginalAtom(mce, guard));
1161
1162   /* Since the original expression is atomic, there's no duplicated
1163      work generated by making multiple V-expressions for it.  So we
1164      don't really care about the possibility that someone else may
1165      also create a V-interpretion for it. */
1166   tl_assert(isOriginalAtom(mce, atom));
1167   vatom = expr2vbits( mce, atom );
1168   tl_assert(isShadowAtom(mce, vatom));
1169   tl_assert(sameKindedAtoms(atom, vatom));
1170
1171   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1172
1173   /* sz is only used for constructing the error message */
1174   sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1175
1176   cond = mkPCastTo( mce, Ity_I1, vatom );
1177   /* cond will be 0 if all defined, and 1 if any not defined. */
1178
1179   /* Get the origin info for the value we are about to check.  At
1180      least, if we are doing origin tracking.  If not, use a dummy
1181      zero origin. */
1182   if (MC_(clo_mc_level) == 3) {
1183      origin = schemeE( mce, atom );
1184      if (mce->hWordTy == Ity_I64) {
1185         origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1186      }
1187   } else {
1188      origin = NULL;
1189   }
1190
1191   fn    = NULL;
1192   nm    = NULL;
1193   args  = NULL;
1194   nargs = -1;
1195
1196   switch (sz) {
1197      case 0:
1198         if (origin) {
1199            fn    = &MC_(helperc_value_check0_fail_w_o);
1200            nm    = "MC_(helperc_value_check0_fail_w_o)";
1201            args  = mkIRExprVec_1(origin);
1202            nargs = 1;
1203         } else {
1204            fn    = &MC_(helperc_value_check0_fail_no_o);
1205            nm    = "MC_(helperc_value_check0_fail_no_o)";
1206            args  = mkIRExprVec_0();
1207            nargs = 0;
1208         }
1209         break;
1210      case 1:
1211         if (origin) {
1212            fn    = &MC_(helperc_value_check1_fail_w_o);
1213            nm    = "MC_(helperc_value_check1_fail_w_o)";
1214            args  = mkIRExprVec_1(origin);
1215            nargs = 1;
1216         } else {
1217            fn    = &MC_(helperc_value_check1_fail_no_o);
1218            nm    = "MC_(helperc_value_check1_fail_no_o)";
1219            args  = mkIRExprVec_0();
1220            nargs = 0;
1221         }
1222         break;
1223      case 4:
1224         if (origin) {
1225            fn    = &MC_(helperc_value_check4_fail_w_o);
1226            nm    = "MC_(helperc_value_check4_fail_w_o)";
1227            args  = mkIRExprVec_1(origin);
1228            nargs = 1;
1229         } else {
1230            fn    = &MC_(helperc_value_check4_fail_no_o);
1231            nm    = "MC_(helperc_value_check4_fail_no_o)";
1232            args  = mkIRExprVec_0();
1233            nargs = 0;
1234         }
1235         break;
1236      case 8:
1237         if (origin) {
1238            fn    = &MC_(helperc_value_check8_fail_w_o);
1239            nm    = "MC_(helperc_value_check8_fail_w_o)";
1240            args  = mkIRExprVec_1(origin);
1241            nargs = 1;
1242         } else {
1243            fn    = &MC_(helperc_value_check8_fail_no_o);
1244            nm    = "MC_(helperc_value_check8_fail_no_o)";
1245            args  = mkIRExprVec_0();
1246            nargs = 0;
1247         }
1248         break;
1249      case 2:
1250      case 16:
1251         if (origin) {
1252            fn    = &MC_(helperc_value_checkN_fail_w_o);
1253            nm    = "MC_(helperc_value_checkN_fail_w_o)";
1254            args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1255            nargs = 2;
1256         } else {
1257            fn    = &MC_(helperc_value_checkN_fail_no_o);
1258            nm    = "MC_(helperc_value_checkN_fail_no_o)";
1259            args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1260            nargs = 1;
1261         }
1262         break;
1263      default:
1264         VG_(tool_panic)("unexpected szB");
1265   }
1266
1267   tl_assert(fn);
1268   tl_assert(nm);
1269   tl_assert(args);
1270   tl_assert(nargs >= 0 && nargs <= 2);
1271   tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1272              || (MC_(clo_mc_level) == 2 && origin == NULL) );
1273
1274   di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1275                           VG_(fnptr_to_fnentry)( fn ), args );
1276   di->guard = cond; // and cond is PCast-to-1(atom#)
1277
1278   /* If the complaint is to be issued under a guard condition, AND
1279      that into the guard condition for the helper call. */
1280   if (guard) {
1281      IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1282      IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1283      IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1284      di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
1285   }
1286
1287   setHelperAnns( mce, di );
1288   stmt( 'V', mce, IRStmt_Dirty(di));
1289
1290   /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1291      defined -- but only in the case where the guard evaluates to
1292      True at run-time.  Do the update by setting the orig->shadow
1293      mapping for tmp to reflect the fact that this shadow is getting
1294      a new value. */
1295   tl_assert(isIRAtom(vatom));
1296   /* sameKindedAtoms ... */
1297   if (vatom->tag == Iex_RdTmp) {
1298      tl_assert(atom->tag == Iex_RdTmp);
1299      if (guard == NULL) {
1300         // guard is 'always True', hence update unconditionally
1301         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1302         assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1303                          definedOfType(ty));
1304      } else {
1305         // update the temp only conditionally.  Do this by copying
1306         // its old value when the guard is False.
1307         // The old value ..
1308         IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1309         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1310         IRAtom* new_tmpV
1311            = assignNew('V', mce, shadowTypeV(ty),
1312                        IRExpr_ITE(guard, definedOfType(ty),
1313                                          mkexpr(old_tmpV)));
1314         assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1315      }
1316   }
1317}
1318
1319
1320/*------------------------------------------------------------*/
1321/*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
1322/*------------------------------------------------------------*/
1323
1324/* Examine the always-defined sections declared in layout to see if
1325   the (offset,size) section is within one.  Note, is is an error to
1326   partially fall into such a region: (offset,size) should either be
1327   completely in such a region or completely not-in such a region.
1328*/
1329static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1330{
1331   Int minoffD, maxoffD, i;
1332   Int minoff = offset;
1333   Int maxoff = minoff + size - 1;
1334   tl_assert((minoff & ~0xFFFF) == 0);
1335   tl_assert((maxoff & ~0xFFFF) == 0);
1336
1337   for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1338      minoffD = mce->layout->alwaysDefd[i].offset;
1339      maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1340      tl_assert((minoffD & ~0xFFFF) == 0);
1341      tl_assert((maxoffD & ~0xFFFF) == 0);
1342
1343      if (maxoff < minoffD || maxoffD < minoff)
1344         continue; /* no overlap */
1345      if (minoff >= minoffD && maxoff <= maxoffD)
1346         return True; /* completely contained in an always-defd section */
1347
1348      VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1349   }
1350   return False; /* could not find any containing section */
1351}
1352
1353
1354/* Generate into bb suitable actions to shadow this Put.  If the state
1355   slice is marked 'always defined', do nothing.  Otherwise, write the
1356   supplied V bits to the shadow state.  We can pass in either an
1357   original atom or a V-atom, but not both.  In the former case the
1358   relevant V-bits are then generated from the original.
1359   We assume here, that the definedness of GUARD has already been checked.
1360*/
1361static
1362void do_shadow_PUT ( MCEnv* mce,  Int offset,
1363                     IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1364{
1365   IRType ty;
1366
1367   // Don't do shadow PUTs if we're not doing undefined value checking.
1368   // Their absence lets Vex's optimiser remove all the shadow computation
1369   // that they depend on, which includes GETs of the shadow registers.
1370   if (MC_(clo_mc_level) == 1)
1371      return;
1372
1373   if (atom) {
1374      tl_assert(!vatom);
1375      tl_assert(isOriginalAtom(mce, atom));
1376      vatom = expr2vbits( mce, atom );
1377   } else {
1378      tl_assert(vatom);
1379      tl_assert(isShadowAtom(mce, vatom));
1380   }
1381
1382   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1383   tl_assert(ty != Ity_I1);
1384   tl_assert(ty != Ity_I128);
1385   if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1386      /* later: no ... */
1387      /* emit code to emit a complaint if any of the vbits are 1. */
1388      /* complainIfUndefined(mce, atom); */
1389   } else {
1390      /* Do a plain shadow Put. */
1391      if (guard) {
1392         /* If the guard expression evaluates to false we simply Put the value
1393            that is already stored in the guest state slot */
1394         IRAtom *cond, *iffalse;
1395
1396         cond    = assignNew('V', mce, Ity_I1, guard);
1397         iffalse = assignNew('V', mce, ty,
1398                             IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1399         vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1400      }
1401      stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1402   }
1403}
1404
1405
1406/* Return an expression which contains the V bits corresponding to the
1407   given GETI (passed in in pieces).
1408*/
1409static
1410void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1411{
1412   IRAtom* vatom;
1413   IRType  ty, tyS;
1414   Int     arrSize;;
1415   IRRegArray* descr = puti->descr;
1416   IRAtom*     ix    = puti->ix;
1417   Int         bias  = puti->bias;
1418   IRAtom*     atom  = puti->data;
1419
1420   // Don't do shadow PUTIs if we're not doing undefined value checking.
1421   // Their absence lets Vex's optimiser remove all the shadow computation
1422   // that they depend on, which includes GETIs of the shadow registers.
1423   if (MC_(clo_mc_level) == 1)
1424      return;
1425
1426   tl_assert(isOriginalAtom(mce,atom));
1427   vatom = expr2vbits( mce, atom );
1428   tl_assert(sameKindedAtoms(atom, vatom));
1429   ty   = descr->elemTy;
1430   tyS  = shadowTypeV(ty);
1431   arrSize = descr->nElems * sizeofIRType(ty);
1432   tl_assert(ty != Ity_I1);
1433   tl_assert(isOriginalAtom(mce,ix));
1434   complainIfUndefined(mce, ix, NULL);
1435   if (isAlwaysDefd(mce, descr->base, arrSize)) {
1436      /* later: no ... */
1437      /* emit code to emit a complaint if any of the vbits are 1. */
1438      /* complainIfUndefined(mce, atom); */
1439   } else {
1440      /* Do a cloned version of the Put that refers to the shadow
1441         area. */
1442      IRRegArray* new_descr
1443         = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1444                         tyS, descr->nElems);
1445      stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1446   }
1447}
1448
1449
1450/* Return an expression which contains the V bits corresponding to the
1451   given GET (passed in in pieces).
1452*/
1453static
1454IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1455{
1456   IRType tyS = shadowTypeV(ty);
1457   tl_assert(ty != Ity_I1);
1458   tl_assert(ty != Ity_I128);
1459   if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1460      /* Always defined, return all zeroes of the relevant type */
1461      return definedOfType(tyS);
1462   } else {
1463      /* return a cloned version of the Get that refers to the shadow
1464         area. */
1465      /* FIXME: this isn't an atom! */
1466      return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1467   }
1468}
1469
1470
1471/* Return an expression which contains the V bits corresponding to the
1472   given GETI (passed in in pieces).
1473*/
1474static
1475IRExpr* shadow_GETI ( MCEnv* mce,
1476                      IRRegArray* descr, IRAtom* ix, Int bias )
1477{
1478   IRType ty   = descr->elemTy;
1479   IRType tyS  = shadowTypeV(ty);
1480   Int arrSize = descr->nElems * sizeofIRType(ty);
1481   tl_assert(ty != Ity_I1);
1482   tl_assert(isOriginalAtom(mce,ix));
1483   complainIfUndefined(mce, ix, NULL);
1484   if (isAlwaysDefd(mce, descr->base, arrSize)) {
1485      /* Always defined, return all zeroes of the relevant type */
1486      return definedOfType(tyS);
1487   } else {
1488      /* return a cloned version of the Get that refers to the shadow
1489         area. */
1490      IRRegArray* new_descr
1491         = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1492                         tyS, descr->nElems);
1493      return IRExpr_GetI( new_descr, ix, bias );
1494   }
1495}
1496
1497
1498/*------------------------------------------------------------*/
1499/*--- Generating approximations for unknown operations,    ---*/
1500/*--- using lazy-propagate semantics                       ---*/
1501/*------------------------------------------------------------*/
1502
1503/* Lazy propagation of undefinedness from two values, resulting in the
1504   specified shadow type.
1505*/
1506static
1507IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1508{
1509   IRAtom* at;
1510   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1511   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1512   tl_assert(isShadowAtom(mce,va1));
1513   tl_assert(isShadowAtom(mce,va2));
1514
1515   /* The general case is inefficient because PCast is an expensive
1516      operation.  Here are some special cases which use PCast only
1517      once rather than twice. */
1518
1519   /* I64 x I64 -> I64 */
1520   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1521      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1522      at = mkUifU(mce, Ity_I64, va1, va2);
1523      at = mkPCastTo(mce, Ity_I64, at);
1524      return at;
1525   }
1526
1527   /* I64 x I64 -> I32 */
1528   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1529      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1530      at = mkUifU(mce, Ity_I64, va1, va2);
1531      at = mkPCastTo(mce, Ity_I32, at);
1532      return at;
1533   }
1534
1535   if (0) {
1536      VG_(printf)("mkLazy2 ");
1537      ppIRType(t1);
1538      VG_(printf)("_");
1539      ppIRType(t2);
1540      VG_(printf)("_");
1541      ppIRType(finalVty);
1542      VG_(printf)("\n");
1543   }
1544
1545   /* General case: force everything via 32-bit intermediaries. */
1546   at = mkPCastTo(mce, Ity_I32, va1);
1547   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1548   at = mkPCastTo(mce, finalVty, at);
1549   return at;
1550}
1551
1552
1553/* 3-arg version of the above. */
1554static
1555IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
1556                  IRAtom* va1, IRAtom* va2, IRAtom* va3 )
1557{
1558   IRAtom* at;
1559   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1560   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1561   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1562   tl_assert(isShadowAtom(mce,va1));
1563   tl_assert(isShadowAtom(mce,va2));
1564   tl_assert(isShadowAtom(mce,va3));
1565
1566   /* The general case is inefficient because PCast is an expensive
1567      operation.  Here are some special cases which use PCast only
1568      twice rather than three times. */
1569
1570   /* I32 x I64 x I64 -> I64 */
1571   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1572   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1573       && finalVty == Ity_I64) {
1574      if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
1575      /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1576         mode indication which is fully defined, this should get
1577         folded out later. */
1578      at = mkPCastTo(mce, Ity_I64, va1);
1579      /* Now fold in 2nd and 3rd args. */
1580      at = mkUifU(mce, Ity_I64, at, va2);
1581      at = mkUifU(mce, Ity_I64, at, va3);
1582      /* and PCast once again. */
1583      at = mkPCastTo(mce, Ity_I64, at);
1584      return at;
1585   }
1586
1587   /* I32 x I8 x I64 -> I64 */
1588   if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
1589       && finalVty == Ity_I64) {
1590      if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
1591      /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
1592       * rounding mode indication which is fully defined, this should
1593       * get folded out later.
1594      */
1595      IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1596      IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1597      at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1598      at = mkUifU(mce, Ity_I64, at, va3);
1599      /* and PCast once again. */
1600      at = mkPCastTo(mce, Ity_I64, at);
1601      return at;
1602   }
1603
1604   /* I32 x I64 x I64 -> I32 */
1605   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1606       && finalVty == Ity_I32) {
1607      if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
1608      at = mkPCastTo(mce, Ity_I64, va1);
1609      at = mkUifU(mce, Ity_I64, at, va2);
1610      at = mkUifU(mce, Ity_I64, at, va3);
1611      at = mkPCastTo(mce, Ity_I32, at);
1612      return at;
1613   }
1614
1615   /* I32 x I32 x I32 -> I32 */
1616   /* 32-bit FP idiom, as (eg) happens on ARM */
1617   if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
1618       && finalVty == Ity_I32) {
1619      if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
1620      at = va1;
1621      at = mkUifU(mce, Ity_I32, at, va2);
1622      at = mkUifU(mce, Ity_I32, at, va3);
1623      at = mkPCastTo(mce, Ity_I32, at);
1624      return at;
1625   }
1626
1627   /* I32 x I128 x I128 -> I128 */
1628   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1629   if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
1630       && finalVty == Ity_I128) {
1631      if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
1632      /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
1633         mode indication which is fully defined, this should get
1634         folded out later. */
1635      at = mkPCastTo(mce, Ity_I128, va1);
1636      /* Now fold in 2nd and 3rd args. */
1637      at = mkUifU(mce, Ity_I128, at, va2);
1638      at = mkUifU(mce, Ity_I128, at, va3);
1639      /* and PCast once again. */
1640      at = mkPCastTo(mce, Ity_I128, at);
1641      return at;
1642   }
1643
1644   /* I32 x I8 x I128 -> I128 */
1645   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1646   if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
1647       && finalVty == Ity_I128) {
1648      if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
1649      /* Use I64 as an intermediate type, which means PCasting all 3
1650         args to I64 to start with. 1st arg is typically a rounding
1651         mode indication which is fully defined, so we hope that it
1652         will get folded out later. */
1653      IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1654      IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1655      IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
1656      /* Now UifU all three together. */
1657      at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1658      at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
1659      /* and PCast once again. */
1660      at = mkPCastTo(mce, Ity_I128, at);
1661      return at;
1662   }
1663   if (1) {
1664      VG_(printf)("mkLazy3: ");
1665      ppIRType(t1);
1666      VG_(printf)(" x ");
1667      ppIRType(t2);
1668      VG_(printf)(" x ");
1669      ppIRType(t3);
1670      VG_(printf)(" -> ");
1671      ppIRType(finalVty);
1672      VG_(printf)("\n");
1673   }
1674
1675   tl_assert(0);
1676   /* General case: force everything via 32-bit intermediaries. */
1677   /*
1678   at = mkPCastTo(mce, Ity_I32, va1);
1679   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1680   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
1681   at = mkPCastTo(mce, finalVty, at);
1682   return at;
1683   */
1684}
1685
1686
1687/* 4-arg version of the above. */
1688static
1689IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
1690                  IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
1691{
1692   IRAtom* at;
1693   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1694   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1695   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1696   IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
1697   tl_assert(isShadowAtom(mce,va1));
1698   tl_assert(isShadowAtom(mce,va2));
1699   tl_assert(isShadowAtom(mce,va3));
1700   tl_assert(isShadowAtom(mce,va4));
1701
1702   /* The general case is inefficient because PCast is an expensive
1703      operation.  Here are some special cases which use PCast only
1704      twice rather than three times. */
1705
1706   /* I32 x I64 x I64 x I64 -> I64 */
1707   /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1708   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
1709       && finalVty == Ity_I64) {
1710      if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
1711      /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1712         mode indication which is fully defined, this should get
1713         folded out later. */
1714      at = mkPCastTo(mce, Ity_I64, va1);
1715      /* Now fold in 2nd, 3rd, 4th args. */
1716      at = mkUifU(mce, Ity_I64, at, va2);
1717      at = mkUifU(mce, Ity_I64, at, va3);
1718      at = mkUifU(mce, Ity_I64, at, va4);
1719      /* and PCast once again. */
1720      at = mkPCastTo(mce, Ity_I64, at);
1721      return at;
1722   }
1723   /* I32 x I32 x I32 x I32 -> I32 */
1724   /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1725   if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
1726       && finalVty == Ity_I32) {
1727      if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
1728      at = va1;
1729      /* Now fold in 2nd, 3rd, 4th args. */
1730      at = mkUifU(mce, Ity_I32, at, va2);
1731      at = mkUifU(mce, Ity_I32, at, va3);
1732      at = mkUifU(mce, Ity_I32, at, va4);
1733      at = mkPCastTo(mce, Ity_I32, at);
1734      return at;
1735   }
1736
1737   if (1) {
1738      VG_(printf)("mkLazy4: ");
1739      ppIRType(t1);
1740      VG_(printf)(" x ");
1741      ppIRType(t2);
1742      VG_(printf)(" x ");
1743      ppIRType(t3);
1744      VG_(printf)(" x ");
1745      ppIRType(t4);
1746      VG_(printf)(" -> ");
1747      ppIRType(finalVty);
1748      VG_(printf)("\n");
1749   }
1750
1751   tl_assert(0);
1752}
1753
1754
1755/* Do the lazy propagation game from a null-terminated vector of
1756   atoms.  This is presumably the arguments to a helper call, so the
1757   IRCallee info is also supplied in order that we can know which
1758   arguments should be ignored (via the .mcx_mask field).
1759*/
1760static
1761IRAtom* mkLazyN ( MCEnv* mce,
1762                  IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
1763{
1764   Int     i;
1765   IRAtom* here;
1766   IRAtom* curr;
1767   IRType  mergeTy;
1768   Bool    mergeTy64 = True;
1769
1770   /* Decide on the type of the merge intermediary.  If all relevant
1771      args are I64, then it's I64.  In all other circumstances, use
1772      I32. */
1773   for (i = 0; exprvec[i]; i++) {
1774      tl_assert(i < 32);
1775      tl_assert(isOriginalAtom(mce, exprvec[i]));
1776      if (cee->mcx_mask & (1<<i))
1777         continue;
1778      if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
1779         mergeTy64 = False;
1780   }
1781
1782   mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
1783   curr    = definedOfType(mergeTy);
1784
1785   for (i = 0; exprvec[i]; i++) {
1786      tl_assert(i < 32);
1787      tl_assert(isOriginalAtom(mce, exprvec[i]));
1788      /* Only take notice of this arg if the callee's mc-exclusion
1789         mask does not say it is to be excluded. */
1790      if (cee->mcx_mask & (1<<i)) {
1791         /* the arg is to be excluded from definedness checking.  Do
1792            nothing. */
1793         if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
1794      } else {
1795         /* calculate the arg's definedness, and pessimistically merge
1796            it in. */
1797         here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
1798         curr = mergeTy64
1799                   ? mkUifU64(mce, here, curr)
1800                   : mkUifU32(mce, here, curr);
1801      }
1802   }
1803   return mkPCastTo(mce, finalVtype, curr );
1804}
1805
1806
1807/*------------------------------------------------------------*/
1808/*--- Generating expensive sequences for exact carry-chain ---*/
1809/*--- propagation in add/sub and related operations.       ---*/
1810/*------------------------------------------------------------*/
1811
1812static
1813IRAtom* expensiveAddSub ( MCEnv*  mce,
1814                          Bool    add,
1815                          IRType  ty,
1816                          IRAtom* qaa, IRAtom* qbb,
1817                          IRAtom* aa,  IRAtom* bb )
1818{
1819   IRAtom *a_min, *b_min, *a_max, *b_max;
1820   IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
1821
1822   tl_assert(isShadowAtom(mce,qaa));
1823   tl_assert(isShadowAtom(mce,qbb));
1824   tl_assert(isOriginalAtom(mce,aa));
1825   tl_assert(isOriginalAtom(mce,bb));
1826   tl_assert(sameKindedAtoms(qaa,aa));
1827   tl_assert(sameKindedAtoms(qbb,bb));
1828
1829   switch (ty) {
1830      case Ity_I32:
1831         opAND = Iop_And32;
1832         opOR  = Iop_Or32;
1833         opXOR = Iop_Xor32;
1834         opNOT = Iop_Not32;
1835         opADD = Iop_Add32;
1836         opSUB = Iop_Sub32;
1837         break;
1838      case Ity_I64:
1839         opAND = Iop_And64;
1840         opOR  = Iop_Or64;
1841         opXOR = Iop_Xor64;
1842         opNOT = Iop_Not64;
1843         opADD = Iop_Add64;
1844         opSUB = Iop_Sub64;
1845         break;
1846      default:
1847         VG_(tool_panic)("expensiveAddSub");
1848   }
1849
1850   // a_min = aa & ~qaa
1851   a_min = assignNew('V', mce,ty,
1852                     binop(opAND, aa,
1853                                  assignNew('V', mce,ty, unop(opNOT, qaa))));
1854
1855   // b_min = bb & ~qbb
1856   b_min = assignNew('V', mce,ty,
1857                     binop(opAND, bb,
1858                                  assignNew('V', mce,ty, unop(opNOT, qbb))));
1859
1860   // a_max = aa | qaa
1861   a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
1862
1863   // b_max = bb | qbb
1864   b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
1865
1866   if (add) {
1867      // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
1868      return
1869      assignNew('V', mce,ty,
1870         binop( opOR,
1871                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
1872                assignNew('V', mce,ty,
1873                   binop( opXOR,
1874                          assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
1875                          assignNew('V', mce,ty, binop(opADD, a_max, b_max))
1876                   )
1877                )
1878         )
1879      );
1880   } else {
1881      // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max + b_min))
1882      return
1883      assignNew('V', mce,ty,
1884         binop( opOR,
1885                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
1886                assignNew('V', mce,ty,
1887                   binop( opXOR,
1888                          assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
1889                          assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
1890                   )
1891                )
1892         )
1893      );
1894   }
1895
1896}
1897
1898
1899static
1900IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
1901                                       IRAtom* atom, IRAtom* vatom )
1902{
1903   IRType ty;
1904   IROp xorOp, subOp, andOp;
1905   IRExpr *one;
1906   IRAtom *improver, *improved;
1907   tl_assert(isShadowAtom(mce,vatom));
1908   tl_assert(isOriginalAtom(mce,atom));
1909   tl_assert(sameKindedAtoms(atom,vatom));
1910
1911   switch (czop) {
1912      case Iop_Ctz32:
1913         ty = Ity_I32;
1914         xorOp = Iop_Xor32;
1915         subOp = Iop_Sub32;
1916         andOp = Iop_And32;
1917         one = mkU32(1);
1918         break;
1919      case Iop_Ctz64:
1920         ty = Ity_I64;
1921         xorOp = Iop_Xor64;
1922         subOp = Iop_Sub64;
1923         andOp = Iop_And64;
1924         one = mkU64(1);
1925         break;
1926      default:
1927         ppIROp(czop);
1928         VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
1929   }
1930
1931   // improver = atom ^ (atom - 1)
1932   //
1933   // That is, improver has its low ctz(atom) bits equal to one;
1934   // higher bits (if any) equal to zero.
1935   improver = assignNew('V', mce,ty,
1936                        binop(xorOp,
1937                              atom,
1938                              assignNew('V', mce, ty,
1939                                        binop(subOp, atom, one))));
1940
1941   // improved = vatom & improver
1942   //
1943   // That is, treat any V bits above the first ctz(atom) bits as
1944   // "defined".
1945   improved = assignNew('V', mce, ty,
1946                        binop(andOp, vatom, improver));
1947
1948   // Return pessimizing cast of improved.
1949   return mkPCastTo(mce, ty, improved);
1950}
1951
1952
1953/*------------------------------------------------------------*/
1954/*--- Scalar shifts.                                       ---*/
1955/*------------------------------------------------------------*/
1956
1957/* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
1958   idea is to shift the definedness bits by the original shift amount.
1959   This introduces 0s ("defined") in new positions for left shifts and
1960   unsigned right shifts, and copies the top definedness bit for
1961   signed right shifts.  So, conveniently, applying the original shift
1962   operator to the definedness bits for the left arg is exactly the
1963   right thing to do:
1964
1965      (qaa << bb)
1966
1967   However if the shift amount is undefined then the whole result
1968   is undefined.  Hence need:
1969
1970      (qaa << bb) `UifU` PCast(qbb)
1971
1972   If the shift amount bb is a literal than qbb will say 'all defined'
1973   and the UifU and PCast will get folded out by post-instrumentation
1974   optimisation.
1975*/
1976static IRAtom* scalarShift ( MCEnv*  mce,
1977                             IRType  ty,
1978                             IROp    original_op,
1979                             IRAtom* qaa, IRAtom* qbb,
1980                             IRAtom* aa,  IRAtom* bb )
1981{
1982   tl_assert(isShadowAtom(mce,qaa));
1983   tl_assert(isShadowAtom(mce,qbb));
1984   tl_assert(isOriginalAtom(mce,aa));
1985   tl_assert(isOriginalAtom(mce,bb));
1986   tl_assert(sameKindedAtoms(qaa,aa));
1987   tl_assert(sameKindedAtoms(qbb,bb));
1988   return
1989      assignNew(
1990         'V', mce, ty,
1991         mkUifU( mce, ty,
1992                 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
1993                 mkPCastTo(mce, ty, qbb)
1994         )
1995   );
1996}
1997
1998
1999/*------------------------------------------------------------*/
2000/*--- Helpers for dealing with vector primops.             ---*/
2001/*------------------------------------------------------------*/
2002
2003/* Vector pessimisation -- pessimise within each lane individually. */
2004
2005static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2006{
2007   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2008}
2009
2010static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2011{
2012   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2013}
2014
2015static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2016{
2017   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2018}
2019
2020static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2021{
2022   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2023}
2024
2025static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2026{
2027   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2028}
2029
2030static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2031{
2032   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2033}
2034
2035static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2036{
2037   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2038}
2039
2040static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2041{
2042   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2043}
2044
2045static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2046{
2047   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2048}
2049
2050static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2051{
2052   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2053}
2054
2055static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2056{
2057   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2058}
2059
2060static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2061{
2062   return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2063}
2064
2065static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2066{
2067   return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2068}
2069
2070
2071/* Here's a simple scheme capable of handling ops derived from SSE1
2072   code and while only generating ops that can be efficiently
2073   implemented in SSE1. */
2074
2075/* All-lanes versions are straightforward:
2076
2077   binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
2078
2079   unary32Fx4(x,y)    ==> PCast32x4(x#)
2080
2081   Lowest-lane-only versions are more complex:
2082
2083   binary32F0x4(x,y)  ==> SetV128lo32(
2084                             x#,
2085                             PCast32(V128to32(UifUV128(x#,y#)))
2086                          )
2087
2088   This is perhaps not so obvious.  In particular, it's faster to
2089   do a V128-bit UifU and then take the bottom 32 bits than the more
2090   obvious scheme of taking the bottom 32 bits of each operand
2091   and doing a 32-bit UifU.  Basically since UifU is fast and
2092   chopping lanes off vector values is slow.
2093
2094   Finally:
2095
2096   unary32F0x4(x)     ==> SetV128lo32(
2097                             x#,
2098                             PCast32(V128to32(x#))
2099                          )
2100
2101   Where:
2102
2103   PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
2104   PCast32x4(v#) = CmpNEZ32x4(v#)
2105*/
2106
2107static
2108IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2109{
2110   IRAtom* at;
2111   tl_assert(isShadowAtom(mce, vatomX));
2112   tl_assert(isShadowAtom(mce, vatomY));
2113   at = mkUifUV128(mce, vatomX, vatomY);
2114   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2115   return at;
2116}
2117
2118static
2119IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2120{
2121   IRAtom* at;
2122   tl_assert(isShadowAtom(mce, vatomX));
2123   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2124   return at;
2125}
2126
2127static
2128IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2129{
2130   IRAtom* at;
2131   tl_assert(isShadowAtom(mce, vatomX));
2132   tl_assert(isShadowAtom(mce, vatomY));
2133   at = mkUifUV128(mce, vatomX, vatomY);
2134   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2135   at = mkPCastTo(mce, Ity_I32, at);
2136   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2137   return at;
2138}
2139
2140static
2141IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2142{
2143   IRAtom* at;
2144   tl_assert(isShadowAtom(mce, vatomX));
2145   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2146   at = mkPCastTo(mce, Ity_I32, at);
2147   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2148   return at;
2149}
2150
2151/* --- ... and ... 64Fx2 versions of the same ... --- */
2152
2153static
2154IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2155{
2156   IRAtom* at;
2157   tl_assert(isShadowAtom(mce, vatomX));
2158   tl_assert(isShadowAtom(mce, vatomY));
2159   at = mkUifUV128(mce, vatomX, vatomY);
2160   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2161   return at;
2162}
2163
2164static
2165IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2166{
2167   IRAtom* at;
2168   tl_assert(isShadowAtom(mce, vatomX));
2169   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2170   return at;
2171}
2172
2173static
2174IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2175{
2176   IRAtom* at;
2177   tl_assert(isShadowAtom(mce, vatomX));
2178   tl_assert(isShadowAtom(mce, vatomY));
2179   at = mkUifUV128(mce, vatomX, vatomY);
2180   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2181   at = mkPCastTo(mce, Ity_I64, at);
2182   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2183   return at;
2184}
2185
2186static
2187IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2188{
2189   IRAtom* at;
2190   tl_assert(isShadowAtom(mce, vatomX));
2191   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2192   at = mkPCastTo(mce, Ity_I64, at);
2193   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2194   return at;
2195}
2196
2197/* --- --- ... and ... 32Fx2 versions of the same --- --- */
2198
2199static
2200IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2201{
2202   IRAtom* at;
2203   tl_assert(isShadowAtom(mce, vatomX));
2204   tl_assert(isShadowAtom(mce, vatomY));
2205   at = mkUifU64(mce, vatomX, vatomY);
2206   at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2207   return at;
2208}
2209
2210static
2211IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2212{
2213   IRAtom* at;
2214   tl_assert(isShadowAtom(mce, vatomX));
2215   at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2216   return at;
2217}
2218
2219/* --- ... and ... 64Fx4 versions of the same ... --- */
2220
2221static
2222IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2223{
2224   IRAtom* at;
2225   tl_assert(isShadowAtom(mce, vatomX));
2226   tl_assert(isShadowAtom(mce, vatomY));
2227   at = mkUifUV256(mce, vatomX, vatomY);
2228   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2229   return at;
2230}
2231
2232static
2233IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2234{
2235   IRAtom* at;
2236   tl_assert(isShadowAtom(mce, vatomX));
2237   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2238   return at;
2239}
2240
2241/* --- ... and ... 32Fx8 versions of the same ... --- */
2242
2243static
2244IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2245{
2246   IRAtom* at;
2247   tl_assert(isShadowAtom(mce, vatomX));
2248   tl_assert(isShadowAtom(mce, vatomY));
2249   at = mkUifUV256(mce, vatomX, vatomY);
2250   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2251   return at;
2252}
2253
2254static
2255IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2256{
2257   IRAtom* at;
2258   tl_assert(isShadowAtom(mce, vatomX));
2259   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2260   return at;
2261}
2262
2263/* --- 64Fx2 binary FP ops, with rounding mode --- */
2264
2265static
2266IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2267                                       IRAtom* vatomX, IRAtom* vatomY )
2268{
2269   /* This is the same as binary64Fx2, except that we subsequently
2270      pessimise vRM (definedness of the rounding mode), widen to 128
2271      bits and UifU it into the result.  As with the scalar cases, if
2272      the RM is a constant then it is defined and so this extra bit
2273      will get constant-folded out later. */
2274   // "do" the vector args
2275   IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2276   // PCast the RM, and widen it to 128 bits
2277   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2278   // Roll it into the result
2279   t1 = mkUifUV128(mce, t1, t2);
2280   return t1;
2281}
2282
2283/* --- ... and ... 32Fx4 versions of the same --- */
2284
2285static
2286IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2287                                       IRAtom* vatomX, IRAtom* vatomY )
2288{
2289   IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2290   // PCast the RM, and widen it to 128 bits
2291   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2292   // Roll it into the result
2293   t1 = mkUifUV128(mce, t1, t2);
2294   return t1;
2295}
2296
2297/* --- ... and ... 64Fx4 versions of the same --- */
2298
2299static
2300IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2301                                       IRAtom* vatomX, IRAtom* vatomY )
2302{
2303   IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2304   // PCast the RM, and widen it to 256 bits
2305   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2306   // Roll it into the result
2307   t1 = mkUifUV256(mce, t1, t2);
2308   return t1;
2309}
2310
2311/* --- ... and ... 32Fx8 versions of the same --- */
2312
2313static
2314IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2315                                       IRAtom* vatomX, IRAtom* vatomY )
2316{
2317   IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2318   // PCast the RM, and widen it to 256 bits
2319   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2320   // Roll it into the result
2321   t1 = mkUifUV256(mce, t1, t2);
2322   return t1;
2323}
2324
2325
2326/* --- --- Vector saturated narrowing --- --- */
2327
2328/* We used to do something very clever here, but on closer inspection
2329   (2011-Jun-15), and in particular bug #279698, it turns out to be
2330   wrong.  Part of the problem came from the fact that for a long
2331   time, the IR primops to do with saturated narrowing were
2332   underspecified and managed to confuse multiple cases which needed
2333   to be separate: the op names had a signedness qualifier, but in
2334   fact the source and destination signednesses needed to be specified
2335   independently, so the op names really need two independent
2336   signedness specifiers.
2337
2338   As of 2011-Jun-15 (ish) the underspecification was sorted out
2339   properly.  The incorrect instrumentation remained, though.  That
2340   has now (2011-Oct-22) been fixed.
2341
2342   What we now do is simple:
2343
2344   Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2345   number of lanes, X is the source lane width and signedness, and Y
2346   is the destination lane width and signedness.  In all cases the
2347   destination lane width is half the source lane width, so the names
2348   have a bit of redundancy, but are at least easy to read.
2349
2350   For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2351   to unsigned 16s.
2352
2353   Let Vanilla(OP) be a function that takes OP, one of these
2354   saturating narrowing ops, and produces the same "shaped" narrowing
2355   op which is not saturating, but merely dumps the most significant
2356   bits.  "same shape" means that the lane numbers and widths are the
2357   same as with OP.
2358
2359   For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2360                  = Iop_NarrowBin32to16x8,
2361   that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2362   dumping the top half of each lane.
2363
2364   So, with that in place, the scheme is simple, and it is simple to
2365   pessimise each lane individually and then apply Vanilla(OP) so as
2366   to get the result in the right "shape".  If the original OP is
2367   QNarrowBinXtoYxZ then we produce
2368
2369   Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2370
2371   or for the case when OP is unary (Iop_QNarrowUn*)
2372
2373   Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2374*/
2375static
2376IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
2377{
2378   switch (qnarrowOp) {
2379      /* Binary: (128, 128) -> 128 */
2380      case Iop_QNarrowBin16Sto8Ux16:
2381      case Iop_QNarrowBin16Sto8Sx16:
2382      case Iop_QNarrowBin16Uto8Ux16:
2383      case Iop_QNarrowBin64Sto32Sx4:
2384      case Iop_QNarrowBin64Uto32Ux4:
2385         return Iop_NarrowBin16to8x16;
2386      case Iop_QNarrowBin32Sto16Ux8:
2387      case Iop_QNarrowBin32Sto16Sx8:
2388      case Iop_QNarrowBin32Uto16Ux8:
2389         return Iop_NarrowBin32to16x8;
2390      /* Binary: (64, 64) -> 64 */
2391      case Iop_QNarrowBin32Sto16Sx4:
2392         return Iop_NarrowBin32to16x4;
2393      case Iop_QNarrowBin16Sto8Ux8:
2394      case Iop_QNarrowBin16Sto8Sx8:
2395         return Iop_NarrowBin16to8x8;
2396      /* Unary: 128 -> 64 */
2397      case Iop_QNarrowUn64Uto32Ux2:
2398      case Iop_QNarrowUn64Sto32Sx2:
2399      case Iop_QNarrowUn64Sto32Ux2:
2400         return Iop_NarrowUn64to32x2;
2401      case Iop_QNarrowUn32Uto16Ux4:
2402      case Iop_QNarrowUn32Sto16Sx4:
2403      case Iop_QNarrowUn32Sto16Ux4:
2404         return Iop_NarrowUn32to16x4;
2405      case Iop_QNarrowUn16Uto8Ux8:
2406      case Iop_QNarrowUn16Sto8Sx8:
2407      case Iop_QNarrowUn16Sto8Ux8:
2408         return Iop_NarrowUn16to8x8;
2409      default:
2410         ppIROp(qnarrowOp);
2411         VG_(tool_panic)("vanillaNarrowOpOfShape");
2412   }
2413}
2414
2415static
2416IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
2417                              IRAtom* vatom1, IRAtom* vatom2)
2418{
2419   IRAtom *at1, *at2, *at3;
2420   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2421   switch (narrow_op) {
2422      case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
2423      case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
2424      case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
2425      case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
2426      case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
2427      case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
2428      case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
2429      case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
2430      default: VG_(tool_panic)("vectorNarrowBinV128");
2431   }
2432   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2433   tl_assert(isShadowAtom(mce,vatom1));
2434   tl_assert(isShadowAtom(mce,vatom2));
2435   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2436   at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
2437   at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
2438   return at3;
2439}
2440
2441static
2442IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
2443                            IRAtom* vatom1, IRAtom* vatom2)
2444{
2445   IRAtom *at1, *at2, *at3;
2446   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2447   switch (narrow_op) {
2448      case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
2449      case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
2450      case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
2451      default: VG_(tool_panic)("vectorNarrowBin64");
2452   }
2453   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2454   tl_assert(isShadowAtom(mce,vatom1));
2455   tl_assert(isShadowAtom(mce,vatom2));
2456   at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
2457   at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
2458   at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
2459   return at3;
2460}
2461
2462static
2463IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
2464                             IRAtom* vatom1)
2465{
2466   IRAtom *at1, *at2;
2467   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2468   tl_assert(isShadowAtom(mce,vatom1));
2469   /* For vanilla narrowing (non-saturating), we can just apply
2470      the op directly to the V bits. */
2471   switch (narrow_op) {
2472      case Iop_NarrowUn16to8x8:
2473      case Iop_NarrowUn32to16x4:
2474      case Iop_NarrowUn64to32x2:
2475         at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
2476         return at1;
2477      default:
2478         break; /* Do Plan B */
2479   }
2480   /* Plan B: for ops that involve a saturation operation on the args,
2481      we must PCast before the vanilla narrow. */
2482   switch (narrow_op) {
2483      case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
2484      case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
2485      case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
2486      case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
2487      case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
2488      case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
2489      case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
2490      case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
2491      case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
2492      default: VG_(tool_panic)("vectorNarrowUnV128");
2493   }
2494   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2495   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2496   at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
2497   return at2;
2498}
2499
2500static
2501IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
2502                         IRAtom* vatom1)
2503{
2504   IRAtom *at1, *at2;
2505   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2506   switch (longen_op) {
2507      case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
2508      case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
2509      case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
2510      case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
2511      case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
2512      case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
2513      default: VG_(tool_panic)("vectorWidenI64");
2514   }
2515   tl_assert(isShadowAtom(mce,vatom1));
2516   at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
2517   at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
2518   return at2;
2519}
2520
2521
2522/* --- --- Vector integer arithmetic --- --- */
2523
2524/* Simple ... UifU the args and per-lane pessimise the results. */
2525
2526/* --- V256-bit versions --- */
2527
2528static
2529IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2530{
2531   IRAtom* at;
2532   at = mkUifUV256(mce, vatom1, vatom2);
2533   at = mkPCast8x32(mce, at);
2534   return at;
2535}
2536
2537static
2538IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2539{
2540   IRAtom* at;
2541   at = mkUifUV256(mce, vatom1, vatom2);
2542   at = mkPCast16x16(mce, at);
2543   return at;
2544}
2545
2546static
2547IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2548{
2549   IRAtom* at;
2550   at = mkUifUV256(mce, vatom1, vatom2);
2551   at = mkPCast32x8(mce, at);
2552   return at;
2553}
2554
2555static
2556IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2557{
2558   IRAtom* at;
2559   at = mkUifUV256(mce, vatom1, vatom2);
2560   at = mkPCast64x4(mce, at);
2561   return at;
2562}
2563
2564/* --- V128-bit versions --- */
2565
2566static
2567IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2568{
2569   IRAtom* at;
2570   at = mkUifUV128(mce, vatom1, vatom2);
2571   at = mkPCast8x16(mce, at);
2572   return at;
2573}
2574
2575static
2576IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2577{
2578   IRAtom* at;
2579   at = mkUifUV128(mce, vatom1, vatom2);
2580   at = mkPCast16x8(mce, at);
2581   return at;
2582}
2583
2584static
2585IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2586{
2587   IRAtom* at;
2588   at = mkUifUV128(mce, vatom1, vatom2);
2589   at = mkPCast32x4(mce, at);
2590   return at;
2591}
2592
2593static
2594IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2595{
2596   IRAtom* at;
2597   at = mkUifUV128(mce, vatom1, vatom2);
2598   at = mkPCast64x2(mce, at);
2599   return at;
2600}
2601
2602/* --- 64-bit versions --- */
2603
2604static
2605IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2606{
2607   IRAtom* at;
2608   at = mkUifU64(mce, vatom1, vatom2);
2609   at = mkPCast8x8(mce, at);
2610   return at;
2611}
2612
2613static
2614IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2615{
2616   IRAtom* at;
2617   at = mkUifU64(mce, vatom1, vatom2);
2618   at = mkPCast16x4(mce, at);
2619   return at;
2620}
2621
2622static
2623IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2624{
2625   IRAtom* at;
2626   at = mkUifU64(mce, vatom1, vatom2);
2627   at = mkPCast32x2(mce, at);
2628   return at;
2629}
2630
2631static
2632IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2633{
2634   IRAtom* at;
2635   at = mkUifU64(mce, vatom1, vatom2);
2636   at = mkPCastTo(mce, Ity_I64, at);
2637   return at;
2638}
2639
2640/* --- 32-bit versions --- */
2641
2642static
2643IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2644{
2645   IRAtom* at;
2646   at = mkUifU32(mce, vatom1, vatom2);
2647   at = mkPCast8x4(mce, at);
2648   return at;
2649}
2650
2651static
2652IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2653{
2654   IRAtom* at;
2655   at = mkUifU32(mce, vatom1, vatom2);
2656   at = mkPCast16x2(mce, at);
2657   return at;
2658}
2659
2660
2661/*------------------------------------------------------------*/
2662/*--- Generate shadow values from all kinds of IRExprs.    ---*/
2663/*------------------------------------------------------------*/
2664
2665static
2666IRAtom* expr2vbits_Qop ( MCEnv* mce,
2667                         IROp op,
2668                         IRAtom* atom1, IRAtom* atom2,
2669                         IRAtom* atom3, IRAtom* atom4 )
2670{
2671   IRAtom* vatom1 = expr2vbits( mce, atom1 );
2672   IRAtom* vatom2 = expr2vbits( mce, atom2 );
2673   IRAtom* vatom3 = expr2vbits( mce, atom3 );
2674   IRAtom* vatom4 = expr2vbits( mce, atom4 );
2675
2676   tl_assert(isOriginalAtom(mce,atom1));
2677   tl_assert(isOriginalAtom(mce,atom2));
2678   tl_assert(isOriginalAtom(mce,atom3));
2679   tl_assert(isOriginalAtom(mce,atom4));
2680   tl_assert(isShadowAtom(mce,vatom1));
2681   tl_assert(isShadowAtom(mce,vatom2));
2682   tl_assert(isShadowAtom(mce,vatom3));
2683   tl_assert(isShadowAtom(mce,vatom4));
2684   tl_assert(sameKindedAtoms(atom1,vatom1));
2685   tl_assert(sameKindedAtoms(atom2,vatom2));
2686   tl_assert(sameKindedAtoms(atom3,vatom3));
2687   tl_assert(sameKindedAtoms(atom4,vatom4));
2688   switch (op) {
2689      case Iop_MAddF64:
2690      case Iop_MAddF64r32:
2691      case Iop_MSubF64:
2692      case Iop_MSubF64r32:
2693         /* I32(rm) x F64 x F64 x F64 -> F64 */
2694         return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
2695
2696      case Iop_MAddF32:
2697      case Iop_MSubF32:
2698         /* I32(rm) x F32 x F32 x F32 -> F32 */
2699         return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
2700
2701      /* V256-bit data-steering */
2702      case Iop_64x4toV256:
2703         return assignNew('V', mce, Ity_V256,
2704                          IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
2705
2706      default:
2707         ppIROp(op);
2708         VG_(tool_panic)("memcheck:expr2vbits_Qop");
2709   }
2710}
2711
2712
2713static
2714IRAtom* expr2vbits_Triop ( MCEnv* mce,
2715                           IROp op,
2716                           IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
2717{
2718   IRAtom* vatom1 = expr2vbits( mce, atom1 );
2719   IRAtom* vatom2 = expr2vbits( mce, atom2 );
2720   IRAtom* vatom3 = expr2vbits( mce, atom3 );
2721
2722   tl_assert(isOriginalAtom(mce,atom1));
2723   tl_assert(isOriginalAtom(mce,atom2));
2724   tl_assert(isOriginalAtom(mce,atom3));
2725   tl_assert(isShadowAtom(mce,vatom1));
2726   tl_assert(isShadowAtom(mce,vatom2));
2727   tl_assert(isShadowAtom(mce,vatom3));
2728   tl_assert(sameKindedAtoms(atom1,vatom1));
2729   tl_assert(sameKindedAtoms(atom2,vatom2));
2730   tl_assert(sameKindedAtoms(atom3,vatom3));
2731   switch (op) {
2732      case Iop_AddF128:
2733      case Iop_AddD128:
2734      case Iop_SubF128:
2735      case Iop_SubD128:
2736      case Iop_MulF128:
2737      case Iop_MulD128:
2738      case Iop_DivF128:
2739      case Iop_DivD128:
2740      case Iop_QuantizeD128:
2741         /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
2742         return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
2743      case Iop_AddF64:
2744      case Iop_AddD64:
2745      case Iop_AddF64r32:
2746      case Iop_SubF64:
2747      case Iop_SubD64:
2748      case Iop_SubF64r32:
2749      case Iop_MulF64:
2750      case Iop_MulD64:
2751      case Iop_MulF64r32:
2752      case Iop_DivF64:
2753      case Iop_DivD64:
2754      case Iop_DivF64r32:
2755      case Iop_ScaleF64:
2756      case Iop_Yl2xF64:
2757      case Iop_Yl2xp1F64:
2758      case Iop_AtanF64:
2759      case Iop_PRemF64:
2760      case Iop_PRem1F64:
2761      case Iop_QuantizeD64:
2762         /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
2763         return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
2764      case Iop_PRemC3210F64:
2765      case Iop_PRem1C3210F64:
2766         /* I32(rm) x F64 x F64 -> I32 */
2767         return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
2768      case Iop_AddF32:
2769      case Iop_SubF32:
2770      case Iop_MulF32:
2771      case Iop_DivF32:
2772         /* I32(rm) x F32 x F32 -> I32 */
2773         return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
2774      case Iop_SignificanceRoundD64:
2775         /* IRRoundingMode(I32) x I8 x D64 -> D64 */
2776         return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
2777      case Iop_SignificanceRoundD128:
2778         /* IRRoundingMode(I32) x I8 x D128 -> D128 */
2779         return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
2780      case Iop_ExtractV128:
2781         complainIfUndefined(mce, atom3, NULL);
2782         return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
2783      case Iop_Extract64:
2784         complainIfUndefined(mce, atom3, NULL);
2785         return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
2786      case Iop_SetElem8x8:
2787      case Iop_SetElem16x4:
2788      case Iop_SetElem32x2:
2789         complainIfUndefined(mce, atom2, NULL);
2790         return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
2791      /* BCDIops */
2792      case Iop_BCDAdd:
2793      case Iop_BCDSub:
2794         complainIfUndefined(mce, atom3, NULL);
2795         return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
2796
2797      /* Vector FP with rounding mode as the first arg */
2798      case Iop_Add64Fx2:
2799      case Iop_Sub64Fx2:
2800      case Iop_Mul64Fx2:
2801      case Iop_Div64Fx2:
2802         return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
2803
2804      case Iop_Add32Fx4:
2805      case Iop_Sub32Fx4:
2806      case Iop_Mul32Fx4:
2807      case Iop_Div32Fx4:
2808        return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
2809
2810      case Iop_Add64Fx4:
2811      case Iop_Sub64Fx4:
2812      case Iop_Mul64Fx4:
2813      case Iop_Div64Fx4:
2814         return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
2815
2816      case Iop_Add32Fx8:
2817      case Iop_Sub32Fx8:
2818      case Iop_Mul32Fx8:
2819      case Iop_Div32Fx8:
2820         return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
2821
2822      default:
2823         ppIROp(op);
2824         VG_(tool_panic)("memcheck:expr2vbits_Triop");
2825   }
2826}
2827
2828
2829static
2830IRAtom* expr2vbits_Binop ( MCEnv* mce,
2831                           IROp op,
2832                           IRAtom* atom1, IRAtom* atom2 )
2833{
2834   IRType  and_or_ty;
2835   IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
2836   IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
2837   IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
2838
2839   IRAtom* vatom1 = expr2vbits( mce, atom1 );
2840   IRAtom* vatom2 = expr2vbits( mce, atom2 );
2841
2842   tl_assert(isOriginalAtom(mce,atom1));
2843   tl_assert(isOriginalAtom(mce,atom2));
2844   tl_assert(isShadowAtom(mce,vatom1));
2845   tl_assert(isShadowAtom(mce,vatom2));
2846   tl_assert(sameKindedAtoms(atom1,vatom1));
2847   tl_assert(sameKindedAtoms(atom2,vatom2));
2848   switch (op) {
2849
2850      /* 32-bit SIMD */
2851
2852      case Iop_Add16x2:
2853      case Iop_HAdd16Ux2:
2854      case Iop_HAdd16Sx2:
2855      case Iop_Sub16x2:
2856      case Iop_HSub16Ux2:
2857      case Iop_HSub16Sx2:
2858      case Iop_QAdd16Sx2:
2859      case Iop_QSub16Sx2:
2860      case Iop_QSub16Ux2:
2861      case Iop_QAdd16Ux2:
2862         return binary16Ix2(mce, vatom1, vatom2);
2863
2864      case Iop_Add8x4:
2865      case Iop_HAdd8Ux4:
2866      case Iop_HAdd8Sx4:
2867      case Iop_Sub8x4:
2868      case Iop_HSub8Ux4:
2869      case Iop_HSub8Sx4:
2870      case Iop_QSub8Ux4:
2871      case Iop_QAdd8Ux4:
2872      case Iop_QSub8Sx4:
2873      case Iop_QAdd8Sx4:
2874         return binary8Ix4(mce, vatom1, vatom2);
2875
2876      /* 64-bit SIMD */
2877
2878      case Iop_ShrN8x8:
2879      case Iop_ShrN16x4:
2880      case Iop_ShrN32x2:
2881      case Iop_SarN8x8:
2882      case Iop_SarN16x4:
2883      case Iop_SarN32x2:
2884      case Iop_ShlN16x4:
2885      case Iop_ShlN32x2:
2886      case Iop_ShlN8x8:
2887         /* Same scheme as with all other shifts. */
2888         complainIfUndefined(mce, atom2, NULL);
2889         return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
2890
2891      case Iop_QNarrowBin32Sto16Sx4:
2892      case Iop_QNarrowBin16Sto8Sx8:
2893      case Iop_QNarrowBin16Sto8Ux8:
2894         return vectorNarrowBin64(mce, op, vatom1, vatom2);
2895
2896      case Iop_Min8Ux8:
2897      case Iop_Min8Sx8:
2898      case Iop_Max8Ux8:
2899      case Iop_Max8Sx8:
2900      case Iop_Avg8Ux8:
2901      case Iop_QSub8Sx8:
2902      case Iop_QSub8Ux8:
2903      case Iop_Sub8x8:
2904      case Iop_CmpGT8Sx8:
2905      case Iop_CmpGT8Ux8:
2906      case Iop_CmpEQ8x8:
2907      case Iop_QAdd8Sx8:
2908      case Iop_QAdd8Ux8:
2909      case Iop_QSal8x8:
2910      case Iop_QShl8x8:
2911      case Iop_Add8x8:
2912      case Iop_Mul8x8:
2913      case Iop_PolynomialMul8x8:
2914         return binary8Ix8(mce, vatom1, vatom2);
2915
2916      case Iop_Min16Sx4:
2917      case Iop_Min16Ux4:
2918      case Iop_Max16Sx4:
2919      case Iop_Max16Ux4:
2920      case Iop_Avg16Ux4:
2921      case Iop_QSub16Ux4:
2922      case Iop_QSub16Sx4:
2923      case Iop_Sub16x4:
2924      case Iop_Mul16x4:
2925      case Iop_MulHi16Sx4:
2926      case Iop_MulHi16Ux4:
2927      case Iop_CmpGT16Sx4:
2928      case Iop_CmpGT16Ux4:
2929      case Iop_CmpEQ16x4:
2930      case Iop_QAdd16Sx4:
2931      case Iop_QAdd16Ux4:
2932      case Iop_QSal16x4:
2933      case Iop_QShl16x4:
2934      case Iop_Add16x4:
2935      case Iop_QDMulHi16Sx4:
2936      case Iop_QRDMulHi16Sx4:
2937         return binary16Ix4(mce, vatom1, vatom2);
2938
2939      case Iop_Sub32x2:
2940      case Iop_Mul32x2:
2941      case Iop_Max32Sx2:
2942      case Iop_Max32Ux2:
2943      case Iop_Min32Sx2:
2944      case Iop_Min32Ux2:
2945      case Iop_CmpGT32Sx2:
2946      case Iop_CmpGT32Ux2:
2947      case Iop_CmpEQ32x2:
2948      case Iop_Add32x2:
2949      case Iop_QAdd32Ux2:
2950      case Iop_QAdd32Sx2:
2951      case Iop_QSub32Ux2:
2952      case Iop_QSub32Sx2:
2953      case Iop_QSal32x2:
2954      case Iop_QShl32x2:
2955      case Iop_QDMulHi32Sx2:
2956      case Iop_QRDMulHi32Sx2:
2957         return binary32Ix2(mce, vatom1, vatom2);
2958
2959      case Iop_QSub64Ux1:
2960      case Iop_QSub64Sx1:
2961      case Iop_QAdd64Ux1:
2962      case Iop_QAdd64Sx1:
2963      case Iop_QSal64x1:
2964      case Iop_QShl64x1:
2965      case Iop_Sal64x1:
2966         return binary64Ix1(mce, vatom1, vatom2);
2967
2968      case Iop_QShlN8Sx8:
2969      case Iop_QShlN8x8:
2970      case Iop_QSalN8x8:
2971         complainIfUndefined(mce, atom2, NULL);
2972         return mkPCast8x8(mce, vatom1);
2973
2974      case Iop_QShlN16Sx4:
2975      case Iop_QShlN16x4:
2976      case Iop_QSalN16x4:
2977         complainIfUndefined(mce, atom2, NULL);
2978         return mkPCast16x4(mce, vatom1);
2979
2980      case Iop_QShlN32Sx2:
2981      case Iop_QShlN32x2:
2982      case Iop_QSalN32x2:
2983         complainIfUndefined(mce, atom2, NULL);
2984         return mkPCast32x2(mce, vatom1);
2985
2986      case Iop_QShlN64Sx1:
2987      case Iop_QShlN64x1:
2988      case Iop_QSalN64x1:
2989         complainIfUndefined(mce, atom2, NULL);
2990         return mkPCast32x2(mce, vatom1);
2991
2992      case Iop_PwMax32Sx2:
2993      case Iop_PwMax32Ux2:
2994      case Iop_PwMin32Sx2:
2995      case Iop_PwMin32Ux2:
2996      case Iop_PwMax32Fx2:
2997      case Iop_PwMin32Fx2:
2998         return assignNew('V', mce, Ity_I64,
2999                          binop(Iop_PwMax32Ux2,
3000                                mkPCast32x2(mce, vatom1),
3001                                mkPCast32x2(mce, vatom2)));
3002
3003      case Iop_PwMax16Sx4:
3004      case Iop_PwMax16Ux4:
3005      case Iop_PwMin16Sx4:
3006      case Iop_PwMin16Ux4:
3007         return assignNew('V', mce, Ity_I64,
3008                          binop(Iop_PwMax16Ux4,
3009                                mkPCast16x4(mce, vatom1),
3010                                mkPCast16x4(mce, vatom2)));
3011
3012      case Iop_PwMax8Sx8:
3013      case Iop_PwMax8Ux8:
3014      case Iop_PwMin8Sx8:
3015      case Iop_PwMin8Ux8:
3016         return assignNew('V', mce, Ity_I64,
3017                          binop(Iop_PwMax8Ux8,
3018                                mkPCast8x8(mce, vatom1),
3019                                mkPCast8x8(mce, vatom2)));
3020
3021      case Iop_PwAdd32x2:
3022      case Iop_PwAdd32Fx2:
3023         return mkPCast32x2(mce,
3024               assignNew('V', mce, Ity_I64,
3025                         binop(Iop_PwAdd32x2,
3026                               mkPCast32x2(mce, vatom1),
3027                               mkPCast32x2(mce, vatom2))));
3028
3029      case Iop_PwAdd16x4:
3030         return mkPCast16x4(mce,
3031               assignNew('V', mce, Ity_I64,
3032                         binop(op, mkPCast16x4(mce, vatom1),
3033                                   mkPCast16x4(mce, vatom2))));
3034
3035      case Iop_PwAdd8x8:
3036         return mkPCast8x8(mce,
3037               assignNew('V', mce, Ity_I64,
3038                         binop(op, mkPCast8x8(mce, vatom1),
3039                                   mkPCast8x8(mce, vatom2))));
3040
3041      case Iop_Shl8x8:
3042      case Iop_Shr8x8:
3043      case Iop_Sar8x8:
3044      case Iop_Sal8x8:
3045         return mkUifU64(mce,
3046                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3047                   mkPCast8x8(mce,vatom2)
3048                );
3049
3050      case Iop_Shl16x4:
3051      case Iop_Shr16x4:
3052      case Iop_Sar16x4:
3053      case Iop_Sal16x4:
3054         return mkUifU64(mce,
3055                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3056                   mkPCast16x4(mce,vatom2)
3057                );
3058
3059      case Iop_Shl32x2:
3060      case Iop_Shr32x2:
3061      case Iop_Sar32x2:
3062      case Iop_Sal32x2:
3063         return mkUifU64(mce,
3064                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3065                   mkPCast32x2(mce,vatom2)
3066                );
3067
3068      /* 64-bit data-steering */
3069      case Iop_InterleaveLO32x2:
3070      case Iop_InterleaveLO16x4:
3071      case Iop_InterleaveLO8x8:
3072      case Iop_InterleaveHI32x2:
3073      case Iop_InterleaveHI16x4:
3074      case Iop_InterleaveHI8x8:
3075      case Iop_CatOddLanes8x8:
3076      case Iop_CatEvenLanes8x8:
3077      case Iop_CatOddLanes16x4:
3078      case Iop_CatEvenLanes16x4:
3079      case Iop_InterleaveOddLanes8x8:
3080      case Iop_InterleaveEvenLanes8x8:
3081      case Iop_InterleaveOddLanes16x4:
3082      case Iop_InterleaveEvenLanes16x4:
3083         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3084
3085      case Iop_GetElem8x8:
3086         complainIfUndefined(mce, atom2, NULL);
3087         return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3088      case Iop_GetElem16x4:
3089         complainIfUndefined(mce, atom2, NULL);
3090         return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3091      case Iop_GetElem32x2:
3092         complainIfUndefined(mce, atom2, NULL);
3093         return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3094
3095      /* Perm8x8: rearrange values in left arg using steering values
3096        from right arg.  So rearrange the vbits in the same way but
3097        pessimise wrt steering values. */
3098      case Iop_Perm8x8:
3099         return mkUifU64(
3100                   mce,
3101                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3102                   mkPCast8x8(mce, vatom2)
3103                );
3104
3105      /* V128-bit SIMD */
3106
3107      case Iop_ShrN8x16:
3108      case Iop_ShrN16x8:
3109      case Iop_ShrN32x4:
3110      case Iop_ShrN64x2:
3111      case Iop_SarN8x16:
3112      case Iop_SarN16x8:
3113      case Iop_SarN32x4:
3114      case Iop_SarN64x2:
3115      case Iop_ShlN8x16:
3116      case Iop_ShlN16x8:
3117      case Iop_ShlN32x4:
3118      case Iop_ShlN64x2:
3119         /* Same scheme as with all other shifts.  Note: 22 Oct 05:
3120            this is wrong now, scalar shifts are done properly lazily.
3121            Vector shifts should be fixed too. */
3122         complainIfUndefined(mce, atom2, NULL);
3123         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3124
3125      /* V x V shifts/rotates are done using the standard lazy scheme. */
3126      case Iop_Shl8x16:
3127      case Iop_Shr8x16:
3128      case Iop_Sar8x16:
3129      case Iop_Sal8x16:
3130      case Iop_Rol8x16:
3131         return mkUifUV128(mce,
3132                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3133                   mkPCast8x16(mce,vatom2)
3134                );
3135
3136      case Iop_Shl16x8:
3137      case Iop_Shr16x8:
3138      case Iop_Sar16x8:
3139      case Iop_Sal16x8:
3140      case Iop_Rol16x8:
3141         return mkUifUV128(mce,
3142                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3143                   mkPCast16x8(mce,vatom2)
3144                );
3145
3146      case Iop_Shl32x4:
3147      case Iop_Shr32x4:
3148      case Iop_Sar32x4:
3149      case Iop_Sal32x4:
3150      case Iop_Rol32x4:
3151      case Iop_Rol64x2:
3152         return mkUifUV128(mce,
3153                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3154                   mkPCast32x4(mce,vatom2)
3155                );
3156
3157      case Iop_Shl64x2:
3158      case Iop_Shr64x2:
3159      case Iop_Sar64x2:
3160      case Iop_Sal64x2:
3161         return mkUifUV128(mce,
3162                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3163                   mkPCast64x2(mce,vatom2)
3164                );
3165
3166      case Iop_F32ToFixed32Ux4_RZ:
3167      case Iop_F32ToFixed32Sx4_RZ:
3168      case Iop_Fixed32UToF32x4_RN:
3169      case Iop_Fixed32SToF32x4_RN:
3170         complainIfUndefined(mce, atom2, NULL);
3171         return mkPCast32x4(mce, vatom1);
3172
3173      case Iop_F32ToFixed32Ux2_RZ:
3174      case Iop_F32ToFixed32Sx2_RZ:
3175      case Iop_Fixed32UToF32x2_RN:
3176      case Iop_Fixed32SToF32x2_RN:
3177         complainIfUndefined(mce, atom2, NULL);
3178         return mkPCast32x2(mce, vatom1);
3179
3180      case Iop_QSub8Ux16:
3181      case Iop_QSub8Sx16:
3182      case Iop_Sub8x16:
3183      case Iop_Min8Ux16:
3184      case Iop_Min8Sx16:
3185      case Iop_Max8Ux16:
3186      case Iop_Max8Sx16:
3187      case Iop_CmpGT8Sx16:
3188      case Iop_CmpGT8Ux16:
3189      case Iop_CmpEQ8x16:
3190      case Iop_Avg8Ux16:
3191      case Iop_Avg8Sx16:
3192      case Iop_QAdd8Ux16:
3193      case Iop_QAdd8Sx16:
3194      case Iop_QSal8x16:
3195      case Iop_QShl8x16:
3196      case Iop_Add8x16:
3197      case Iop_Mul8x16:
3198      case Iop_PolynomialMul8x16:
3199      case Iop_PolynomialMulAdd8x16:
3200         return binary8Ix16(mce, vatom1, vatom2);
3201
3202      case Iop_QSub16Ux8:
3203      case Iop_QSub16Sx8:
3204      case Iop_Sub16x8:
3205      case Iop_Mul16x8:
3206      case Iop_MulHi16Sx8:
3207      case Iop_MulHi16Ux8:
3208      case Iop_Min16Sx8:
3209      case Iop_Min16Ux8:
3210      case Iop_Max16Sx8:
3211      case Iop_Max16Ux8:
3212      case Iop_CmpGT16Sx8:
3213      case Iop_CmpGT16Ux8:
3214      case Iop_CmpEQ16x8:
3215      case Iop_Avg16Ux8:
3216      case Iop_Avg16Sx8:
3217      case Iop_QAdd16Ux8:
3218      case Iop_QAdd16Sx8:
3219      case Iop_QSal16x8:
3220      case Iop_QShl16x8:
3221      case Iop_Add16x8:
3222      case Iop_QDMulHi16Sx8:
3223      case Iop_QRDMulHi16Sx8:
3224      case Iop_PolynomialMulAdd16x8:
3225         return binary16Ix8(mce, vatom1, vatom2);
3226
3227      case Iop_Sub32x4:
3228      case Iop_CmpGT32Sx4:
3229      case Iop_CmpGT32Ux4:
3230      case Iop_CmpEQ32x4:
3231      case Iop_QAdd32Sx4:
3232      case Iop_QAdd32Ux4:
3233      case Iop_QSub32Sx4:
3234      case Iop_QSub32Ux4:
3235      case Iop_QSal32x4:
3236      case Iop_QShl32x4:
3237      case Iop_Avg32Ux4:
3238      case Iop_Avg32Sx4:
3239      case Iop_Add32x4:
3240      case Iop_Max32Ux4:
3241      case Iop_Max32Sx4:
3242      case Iop_Min32Ux4:
3243      case Iop_Min32Sx4:
3244      case Iop_Mul32x4:
3245      case Iop_QDMulHi32Sx4:
3246      case Iop_QRDMulHi32Sx4:
3247      case Iop_PolynomialMulAdd32x4:
3248         return binary32Ix4(mce, vatom1, vatom2);
3249
3250      case Iop_Sub64x2:
3251      case Iop_Add64x2:
3252      case Iop_Max64Sx2:
3253      case Iop_Max64Ux2:
3254      case Iop_Min64Sx2:
3255      case Iop_Min64Ux2:
3256      case Iop_CmpEQ64x2:
3257      case Iop_CmpGT64Sx2:
3258      case Iop_CmpGT64Ux2:
3259      case Iop_QSal64x2:
3260      case Iop_QShl64x2:
3261      case Iop_QAdd64Ux2:
3262      case Iop_QAdd64Sx2:
3263      case Iop_QSub64Ux2:
3264      case Iop_QSub64Sx2:
3265      case Iop_PolynomialMulAdd64x2:
3266      case Iop_CipherV128:
3267      case Iop_CipherLV128:
3268      case Iop_NCipherV128:
3269      case Iop_NCipherLV128:
3270        return binary64Ix2(mce, vatom1, vatom2);
3271
3272      case Iop_QNarrowBin64Sto32Sx4:
3273      case Iop_QNarrowBin64Uto32Ux4:
3274      case Iop_QNarrowBin32Sto16Sx8:
3275      case Iop_QNarrowBin32Uto16Ux8:
3276      case Iop_QNarrowBin32Sto16Ux8:
3277      case Iop_QNarrowBin16Sto8Sx16:
3278      case Iop_QNarrowBin16Uto8Ux16:
3279      case Iop_QNarrowBin16Sto8Ux16:
3280         return vectorNarrowBinV128(mce, op, vatom1, vatom2);
3281
3282      case Iop_Min64Fx2:
3283      case Iop_Max64Fx2:
3284      case Iop_CmpLT64Fx2:
3285      case Iop_CmpLE64Fx2:
3286      case Iop_CmpEQ64Fx2:
3287      case Iop_CmpUN64Fx2:
3288         return binary64Fx2(mce, vatom1, vatom2);
3289
3290      case Iop_Sub64F0x2:
3291      case Iop_Mul64F0x2:
3292      case Iop_Min64F0x2:
3293      case Iop_Max64F0x2:
3294      case Iop_Div64F0x2:
3295      case Iop_CmpLT64F0x2:
3296      case Iop_CmpLE64F0x2:
3297      case Iop_CmpEQ64F0x2:
3298      case Iop_CmpUN64F0x2:
3299      case Iop_Add64F0x2:
3300         return binary64F0x2(mce, vatom1, vatom2);
3301
3302      case Iop_Min32Fx4:
3303      case Iop_Max32Fx4:
3304      case Iop_CmpLT32Fx4:
3305      case Iop_CmpLE32Fx4:
3306      case Iop_CmpEQ32Fx4:
3307      case Iop_CmpUN32Fx4:
3308      case Iop_CmpGT32Fx4:
3309      case Iop_CmpGE32Fx4:
3310      case Iop_Recps32Fx4:
3311      case Iop_Rsqrts32Fx4:
3312         return binary32Fx4(mce, vatom1, vatom2);
3313
3314      case Iop_Sub32Fx2:
3315      case Iop_Mul32Fx2:
3316      case Iop_Min32Fx2:
3317      case Iop_Max32Fx2:
3318      case Iop_CmpEQ32Fx2:
3319      case Iop_CmpGT32Fx2:
3320      case Iop_CmpGE32Fx2:
3321      case Iop_Add32Fx2:
3322      case Iop_Recps32Fx2:
3323      case Iop_Rsqrts32Fx2:
3324         return binary32Fx2(mce, vatom1, vatom2);
3325
3326      case Iop_Sub32F0x4:
3327      case Iop_Mul32F0x4:
3328      case Iop_Min32F0x4:
3329      case Iop_Max32F0x4:
3330      case Iop_Div32F0x4:
3331      case Iop_CmpLT32F0x4:
3332      case Iop_CmpLE32F0x4:
3333      case Iop_CmpEQ32F0x4:
3334      case Iop_CmpUN32F0x4:
3335      case Iop_Add32F0x4:
3336         return binary32F0x4(mce, vatom1, vatom2);
3337
3338      case Iop_QShlN8Sx16:
3339      case Iop_QShlN8x16:
3340      case Iop_QSalN8x16:
3341         complainIfUndefined(mce, atom2, NULL);
3342         return mkPCast8x16(mce, vatom1);
3343
3344      case Iop_QShlN16Sx8:
3345      case Iop_QShlN16x8:
3346      case Iop_QSalN16x8:
3347         complainIfUndefined(mce, atom2, NULL);
3348         return mkPCast16x8(mce, vatom1);
3349
3350      case Iop_QShlN32Sx4:
3351      case Iop_QShlN32x4:
3352      case Iop_QSalN32x4:
3353         complainIfUndefined(mce, atom2, NULL);
3354         return mkPCast32x4(mce, vatom1);
3355
3356      case Iop_QShlN64Sx2:
3357      case Iop_QShlN64x2:
3358      case Iop_QSalN64x2:
3359         complainIfUndefined(mce, atom2, NULL);
3360         return mkPCast32x4(mce, vatom1);
3361
3362      case Iop_Mull32Sx2:
3363      case Iop_Mull32Ux2:
3364      case Iop_QDMulLong32Sx2:
3365         return vectorWidenI64(mce, Iop_Widen32Sto64x2,
3366                                    mkUifU64(mce, vatom1, vatom2));
3367
3368      case Iop_Mull16Sx4:
3369      case Iop_Mull16Ux4:
3370      case Iop_QDMulLong16Sx4:
3371         return vectorWidenI64(mce, Iop_Widen16Sto32x4,
3372                                    mkUifU64(mce, vatom1, vatom2));
3373
3374      case Iop_Mull8Sx8:
3375      case Iop_Mull8Ux8:
3376      case Iop_PolynomialMull8x8:
3377         return vectorWidenI64(mce, Iop_Widen8Sto16x8,
3378                                    mkUifU64(mce, vatom1, vatom2));
3379
3380      case Iop_PwAdd32x4:
3381         return mkPCast32x4(mce,
3382               assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
3383                     mkPCast32x4(mce, vatom2))));
3384
3385      case Iop_PwAdd16x8:
3386         return mkPCast16x8(mce,
3387               assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
3388                     mkPCast16x8(mce, vatom2))));
3389
3390      case Iop_PwAdd8x16:
3391         return mkPCast8x16(mce,
3392               assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
3393                     mkPCast8x16(mce, vatom2))));
3394
3395      /* V128-bit data-steering */
3396      case Iop_SetV128lo32:
3397      case Iop_SetV128lo64:
3398      case Iop_64HLtoV128:
3399      case Iop_InterleaveLO64x2:
3400      case Iop_InterleaveLO32x4:
3401      case Iop_InterleaveLO16x8:
3402      case Iop_InterleaveLO8x16:
3403      case Iop_InterleaveHI64x2:
3404      case Iop_InterleaveHI32x4:
3405      case Iop_InterleaveHI16x8:
3406      case Iop_InterleaveHI8x16:
3407      case Iop_CatOddLanes8x16:
3408      case Iop_CatOddLanes16x8:
3409      case Iop_CatOddLanes32x4:
3410      case Iop_CatEvenLanes8x16:
3411      case Iop_CatEvenLanes16x8:
3412      case Iop_CatEvenLanes32x4:
3413      case Iop_InterleaveOddLanes8x16:
3414      case Iop_InterleaveOddLanes16x8:
3415      case Iop_InterleaveOddLanes32x4:
3416      case Iop_InterleaveEvenLanes8x16:
3417      case Iop_InterleaveEvenLanes16x8:
3418      case Iop_InterleaveEvenLanes32x4:
3419         return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
3420
3421      case Iop_GetElem8x16:
3422         complainIfUndefined(mce, atom2, NULL);
3423         return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3424      case Iop_GetElem16x8:
3425         complainIfUndefined(mce, atom2, NULL);
3426         return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3427      case Iop_GetElem32x4:
3428         complainIfUndefined(mce, atom2, NULL);
3429         return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3430      case Iop_GetElem64x2:
3431         complainIfUndefined(mce, atom2, NULL);
3432         return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3433
3434     /* Perm8x16: rearrange values in left arg using steering values
3435        from right arg.  So rearrange the vbits in the same way but
3436        pessimise wrt steering values.  Perm32x4 ditto. */
3437      case Iop_Perm8x16:
3438         return mkUifUV128(
3439                   mce,
3440                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3441                   mkPCast8x16(mce, vatom2)
3442                );
3443      case Iop_Perm32x4:
3444         return mkUifUV128(
3445                   mce,
3446                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3447                   mkPCast32x4(mce, vatom2)
3448                );
3449
3450     /* These two take the lower half of each 16-bit lane, sign/zero
3451        extend it to 32, and multiply together, producing a 32x4
3452        result (and implicitly ignoring half the operand bits).  So
3453        treat it as a bunch of independent 16x8 operations, but then
3454        do 32-bit shifts left-right to copy the lower half results
3455        (which are all 0s or all 1s due to PCasting in binary16Ix8)
3456        into the upper half of each result lane. */
3457      case Iop_MullEven16Ux8:
3458      case Iop_MullEven16Sx8: {
3459         IRAtom* at;
3460         at = binary16Ix8(mce,vatom1,vatom2);
3461         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
3462         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
3463	 return at;
3464      }
3465
3466      /* Same deal as Iop_MullEven16{S,U}x8 */
3467      case Iop_MullEven8Ux16:
3468      case Iop_MullEven8Sx16: {
3469         IRAtom* at;
3470         at = binary8Ix16(mce,vatom1,vatom2);
3471         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
3472         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
3473	 return at;
3474      }
3475
3476      /* Same deal as Iop_MullEven16{S,U}x8 */
3477      case Iop_MullEven32Ux4:
3478      case Iop_MullEven32Sx4: {
3479         IRAtom* at;
3480         at = binary32Ix4(mce,vatom1,vatom2);
3481         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
3482         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
3483         return at;
3484      }
3485
3486      /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
3487         32x4 -> 16x8 laneage, discarding the upper half of each lane.
3488         Simply apply same op to the V bits, since this really no more
3489         than a data steering operation. */
3490      case Iop_NarrowBin32to16x8:
3491      case Iop_NarrowBin16to8x16:
3492      case Iop_NarrowBin64to32x4:
3493         return assignNew('V', mce, Ity_V128,
3494                                    binop(op, vatom1, vatom2));
3495
3496      case Iop_ShrV128:
3497      case Iop_ShlV128:
3498         /* Same scheme as with all other shifts.  Note: 10 Nov 05:
3499            this is wrong now, scalar shifts are done properly lazily.
3500            Vector shifts should be fixed too. */
3501         complainIfUndefined(mce, atom2, NULL);
3502         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3503
3504      /* SHA Iops */
3505      case Iop_SHA256:
3506      case Iop_SHA512:
3507         complainIfUndefined(mce, atom2, NULL);
3508         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3509
3510      /* I128-bit data-steering */
3511      case Iop_64HLto128:
3512         return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
3513
3514      /* V256-bit SIMD */
3515
3516      case Iop_Max64Fx4:
3517      case Iop_Min64Fx4:
3518         return binary64Fx4(mce, vatom1, vatom2);
3519
3520      case Iop_Max32Fx8:
3521      case Iop_Min32Fx8:
3522         return binary32Fx8(mce, vatom1, vatom2);
3523
3524      /* V256-bit data-steering */
3525      case Iop_V128HLtoV256:
3526         return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
3527
3528      /* Scalar floating point */
3529
3530      case Iop_F32toI64S:
3531      case Iop_F32toI64U:
3532         /* I32(rm) x F32 -> I64 */
3533         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3534
3535      case Iop_I64StoF32:
3536         /* I32(rm) x I64 -> F32 */
3537         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3538
3539      case Iop_RoundF64toInt:
3540      case Iop_RoundF64toF32:
3541      case Iop_F64toI64S:
3542      case Iop_F64toI64U:
3543      case Iop_I64StoF64:
3544      case Iop_I64UtoF64:
3545      case Iop_SinF64:
3546      case Iop_CosF64:
3547      case Iop_TanF64:
3548      case Iop_2xm1F64:
3549      case Iop_SqrtF64:
3550         /* I32(rm) x I64/F64 -> I64/F64 */
3551         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3552
3553      case Iop_ShlD64:
3554      case Iop_ShrD64:
3555      case Iop_RoundD64toInt:
3556         /* I32(rm) x D64 -> D64 */
3557         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3558
3559      case Iop_ShlD128:
3560      case Iop_ShrD128:
3561      case Iop_RoundD128toInt:
3562         /* I32(rm) x D128 -> D128 */
3563         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3564
3565      case Iop_D64toI64S:
3566      case Iop_D64toI64U:
3567      case Iop_I64StoD64:
3568      case Iop_I64UtoD64:
3569         /* I32(rm) x I64/D64 -> D64/I64 */
3570         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3571
3572      case Iop_F32toD32:
3573      case Iop_F64toD32:
3574      case Iop_F128toD32:
3575      case Iop_D32toF32:
3576      case Iop_D64toF32:
3577      case Iop_D128toF32:
3578         /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
3579         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3580
3581      case Iop_F32toD64:
3582      case Iop_F64toD64:
3583      case Iop_F128toD64:
3584      case Iop_D32toF64:
3585      case Iop_D64toF64:
3586      case Iop_D128toF64:
3587         /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
3588         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3589
3590      case Iop_F32toD128:
3591      case Iop_F64toD128:
3592      case Iop_F128toD128:
3593      case Iop_D32toF128:
3594      case Iop_D64toF128:
3595      case Iop_D128toF128:
3596         /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
3597         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3598
3599      case Iop_RoundF32toInt:
3600      case Iop_SqrtF32:
3601         /* I32(rm) x I32/F32 -> I32/F32 */
3602         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3603
3604      case Iop_SqrtF128:
3605         /* I32(rm) x F128 -> F128 */
3606         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3607
3608      case Iop_I32StoF32:
3609      case Iop_I32UtoF32:
3610      case Iop_F32toI32S:
3611      case Iop_F32toI32U:
3612         /* First arg is I32 (rounding mode), second is F32/I32 (data). */
3613         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3614
3615      case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
3616      case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
3617      case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
3618      case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
3619      case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
3620         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3621
3622      case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
3623      case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
3624      case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
3625      case Iop_D128toD64:  /* IRRoundingMode(I64) x D128 -> D64 */
3626      case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64  */
3627      case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
3628         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3629
3630      case Iop_F64HLtoF128:
3631      case Iop_D64HLtoD128:
3632         return assignNew('V', mce, Ity_I128,
3633                          binop(Iop_64HLto128, vatom1, vatom2));
3634
3635      case Iop_F64toI32U:
3636      case Iop_F64toI32S:
3637      case Iop_F64toF32:
3638      case Iop_I64UtoF32:
3639      case Iop_D64toI32U:
3640      case Iop_D64toI32S:
3641         /* First arg is I32 (rounding mode), second is F64/D64 (data). */
3642         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3643
3644      case Iop_D64toD32:
3645         /* First arg is I32 (rounding mode), second is D64 (data). */
3646         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3647
3648      case Iop_F64toI16S:
3649         /* First arg is I32 (rounding mode), second is F64 (data). */
3650         return mkLazy2(mce, Ity_I16, vatom1, vatom2);
3651
3652      case Iop_InsertExpD64:
3653         /*  I64 x I64 -> D64 */
3654         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3655
3656      case Iop_InsertExpD128:
3657         /*  I64 x I128 -> D128 */
3658         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3659
3660      case Iop_CmpF32:
3661      case Iop_CmpF64:
3662      case Iop_CmpF128:
3663      case Iop_CmpD64:
3664      case Iop_CmpD128:
3665      case Iop_CmpExpD64:
3666      case Iop_CmpExpD128:
3667         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3668
3669      /* non-FP after here */
3670
3671      case Iop_DivModU64to32:
3672      case Iop_DivModS64to32:
3673         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3674
3675      case Iop_DivModU128to64:
3676      case Iop_DivModS128to64:
3677         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3678
3679      case Iop_8HLto16:
3680         return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
3681      case Iop_16HLto32:
3682         return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
3683      case Iop_32HLto64:
3684         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3685
3686      case Iop_DivModS64to64:
3687      case Iop_MullS64:
3688      case Iop_MullU64: {
3689         IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
3690         IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
3691         return assignNew('V', mce, Ity_I128,
3692                          binop(Iop_64HLto128, vHi64, vLo64));
3693      }
3694
3695      case Iop_MullS32:
3696      case Iop_MullU32: {
3697         IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
3698         IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
3699         return assignNew('V', mce, Ity_I64,
3700                          binop(Iop_32HLto64, vHi32, vLo32));
3701      }
3702
3703      case Iop_MullS16:
3704      case Iop_MullU16: {
3705         IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
3706         IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
3707         return assignNew('V', mce, Ity_I32,
3708                          binop(Iop_16HLto32, vHi16, vLo16));
3709      }
3710
3711      case Iop_MullS8:
3712      case Iop_MullU8: {
3713         IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
3714         IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
3715         return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
3716      }
3717
3718      case Iop_Sad8Ux4: /* maybe we could do better?  ftm, do mkLazy2. */
3719      case Iop_DivS32:
3720      case Iop_DivU32:
3721      case Iop_DivU32E:
3722      case Iop_DivS32E:
3723      case Iop_QAdd32S: /* could probably do better */
3724      case Iop_QSub32S: /* could probably do better */
3725         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3726
3727      case Iop_DivS64:
3728      case Iop_DivU64:
3729      case Iop_DivS64E:
3730      case Iop_DivU64E:
3731         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3732
3733      case Iop_Add32:
3734         if (mce->bogusLiterals || mce->useLLVMworkarounds)
3735            return expensiveAddSub(mce,True,Ity_I32,
3736                                   vatom1,vatom2, atom1,atom2);
3737         else
3738            goto cheap_AddSub32;
3739      case Iop_Sub32:
3740         if (mce->bogusLiterals)
3741            return expensiveAddSub(mce,False,Ity_I32,
3742                                   vatom1,vatom2, atom1,atom2);
3743         else
3744            goto cheap_AddSub32;
3745
3746      cheap_AddSub32:
3747      case Iop_Mul32:
3748         return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
3749
3750      case Iop_CmpORD32S:
3751      case Iop_CmpORD32U:
3752      case Iop_CmpORD64S:
3753      case Iop_CmpORD64U:
3754         return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
3755
3756      case Iop_Add64:
3757         if (mce->bogusLiterals || mce->useLLVMworkarounds)
3758            return expensiveAddSub(mce,True,Ity_I64,
3759                                   vatom1,vatom2, atom1,atom2);
3760         else
3761            goto cheap_AddSub64;
3762      case Iop_Sub64:
3763         if (mce->bogusLiterals)
3764            return expensiveAddSub(mce,False,Ity_I64,
3765                                   vatom1,vatom2, atom1,atom2);
3766         else
3767            goto cheap_AddSub64;
3768
3769      cheap_AddSub64:
3770      case Iop_Mul64:
3771         return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
3772
3773      case Iop_Mul16:
3774      case Iop_Add16:
3775      case Iop_Sub16:
3776         return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
3777
3778      case Iop_Mul8:
3779      case Iop_Sub8:
3780      case Iop_Add8:
3781         return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
3782
3783      case Iop_CmpEQ64:
3784      case Iop_CmpNE64:
3785         if (mce->bogusLiterals)
3786            goto expensive_cmp64;
3787         else
3788            goto cheap_cmp64;
3789
3790      expensive_cmp64:
3791      case Iop_ExpCmpNE64:
3792         return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
3793
3794      cheap_cmp64:
3795      case Iop_CmpLE64S: case Iop_CmpLE64U:
3796      case Iop_CmpLT64U: case Iop_CmpLT64S:
3797         return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
3798
3799      case Iop_CmpEQ32:
3800      case Iop_CmpNE32:
3801         if (mce->bogusLiterals)
3802            goto expensive_cmp32;
3803         else
3804            goto cheap_cmp32;
3805
3806      expensive_cmp32:
3807      case Iop_ExpCmpNE32:
3808         return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
3809
3810      cheap_cmp32:
3811      case Iop_CmpLE32S: case Iop_CmpLE32U:
3812      case Iop_CmpLT32U: case Iop_CmpLT32S:
3813         return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
3814
3815      case Iop_CmpEQ16: case Iop_CmpNE16:
3816         return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
3817
3818      case Iop_ExpCmpNE16:
3819         return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
3820
3821      case Iop_CmpEQ8: case Iop_CmpNE8:
3822         return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
3823
3824      case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
3825      case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
3826      case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
3827      case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
3828         /* Just say these all produce a defined result, regardless
3829            of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
3830         return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
3831
3832      case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
3833         return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
3834
3835      case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
3836         return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
3837
3838      case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
3839         return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
3840
3841      case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
3842         return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
3843
3844      case Iop_AndV256:
3845         uifu = mkUifUV256; difd = mkDifDV256;
3846         and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
3847      case Iop_AndV128:
3848         uifu = mkUifUV128; difd = mkDifDV128;
3849         and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
3850      case Iop_And64:
3851         uifu = mkUifU64; difd = mkDifD64;
3852         and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
3853      case Iop_And32:
3854         uifu = mkUifU32; difd = mkDifD32;
3855         and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
3856      case Iop_And16:
3857         uifu = mkUifU16; difd = mkDifD16;
3858         and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
3859      case Iop_And8:
3860         uifu = mkUifU8; difd = mkDifD8;
3861         and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
3862
3863      case Iop_OrV256:
3864         uifu = mkUifUV256; difd = mkDifDV256;
3865         and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
3866      case Iop_OrV128:
3867         uifu = mkUifUV128; difd = mkDifDV128;
3868         and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
3869      case Iop_Or64:
3870         uifu = mkUifU64; difd = mkDifD64;
3871         and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
3872      case Iop_Or32:
3873         uifu = mkUifU32; difd = mkDifD32;
3874         and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
3875      case Iop_Or16:
3876         uifu = mkUifU16; difd = mkDifD16;
3877         and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
3878      case Iop_Or8:
3879         uifu = mkUifU8; difd = mkDifD8;
3880         and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
3881
3882      do_And_Or:
3883         return
3884         assignNew(
3885            'V', mce,
3886            and_or_ty,
3887            difd(mce, uifu(mce, vatom1, vatom2),
3888                      difd(mce, improve(mce, atom1, vatom1),
3889                                improve(mce, atom2, vatom2) ) ) );
3890
3891      case Iop_Xor8:
3892         return mkUifU8(mce, vatom1, vatom2);
3893      case Iop_Xor16:
3894         return mkUifU16(mce, vatom1, vatom2);
3895      case Iop_Xor32:
3896         return mkUifU32(mce, vatom1, vatom2);
3897      case Iop_Xor64:
3898         return mkUifU64(mce, vatom1, vatom2);
3899      case Iop_XorV128:
3900         return mkUifUV128(mce, vatom1, vatom2);
3901      case Iop_XorV256:
3902         return mkUifUV256(mce, vatom1, vatom2);
3903
3904      /* V256-bit SIMD */
3905
3906      case Iop_ShrN16x16:
3907      case Iop_ShrN32x8:
3908      case Iop_ShrN64x4:
3909      case Iop_SarN16x16:
3910      case Iop_SarN32x8:
3911      case Iop_ShlN16x16:
3912      case Iop_ShlN32x8:
3913      case Iop_ShlN64x4:
3914         /* Same scheme as with all other shifts.  Note: 22 Oct 05:
3915            this is wrong now, scalar shifts are done properly lazily.
3916            Vector shifts should be fixed too. */
3917         complainIfUndefined(mce, atom2, NULL);
3918         return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
3919
3920      case Iop_QSub8Ux32:
3921      case Iop_QSub8Sx32:
3922      case Iop_Sub8x32:
3923      case Iop_Min8Ux32:
3924      case Iop_Min8Sx32:
3925      case Iop_Max8Ux32:
3926      case Iop_Max8Sx32:
3927      case Iop_CmpGT8Sx32:
3928      case Iop_CmpEQ8x32:
3929      case Iop_Avg8Ux32:
3930      case Iop_QAdd8Ux32:
3931      case Iop_QAdd8Sx32:
3932      case Iop_Add8x32:
3933         return binary8Ix32(mce, vatom1, vatom2);
3934
3935      case Iop_QSub16Ux16:
3936      case Iop_QSub16Sx16:
3937      case Iop_Sub16x16:
3938      case Iop_Mul16x16:
3939      case Iop_MulHi16Sx16:
3940      case Iop_MulHi16Ux16:
3941      case Iop_Min16Sx16:
3942      case Iop_Min16Ux16:
3943      case Iop_Max16Sx16:
3944      case Iop_Max16Ux16:
3945      case Iop_CmpGT16Sx16:
3946      case Iop_CmpEQ16x16:
3947      case Iop_Avg16Ux16:
3948      case Iop_QAdd16Ux16:
3949      case Iop_QAdd16Sx16:
3950      case Iop_Add16x16:
3951         return binary16Ix16(mce, vatom1, vatom2);
3952
3953      case Iop_Sub32x8:
3954      case Iop_CmpGT32Sx8:
3955      case Iop_CmpEQ32x8:
3956      case Iop_Add32x8:
3957      case Iop_Max32Ux8:
3958      case Iop_Max32Sx8:
3959      case Iop_Min32Ux8:
3960      case Iop_Min32Sx8:
3961      case Iop_Mul32x8:
3962         return binary32Ix8(mce, vatom1, vatom2);
3963
3964      case Iop_Sub64x4:
3965      case Iop_Add64x4:
3966      case Iop_CmpEQ64x4:
3967      case Iop_CmpGT64Sx4:
3968         return binary64Ix4(mce, vatom1, vatom2);
3969
3970     /* Perm32x8: rearrange values in left arg using steering values
3971        from right arg.  So rearrange the vbits in the same way but
3972        pessimise wrt steering values. */
3973      case Iop_Perm32x8:
3974         return mkUifUV256(
3975                   mce,
3976                   assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
3977                   mkPCast32x8(mce, vatom2)
3978                );
3979
3980      default:
3981         ppIROp(op);
3982         VG_(tool_panic)("memcheck:expr2vbits_Binop");
3983   }
3984}
3985
3986
3987static
3988IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
3989{
3990   /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
3991      selection of shadow operation implicitly duplicates the logic in
3992      do_shadow_LoadG and should be kept in sync (in the very unlikely
3993      event that the interpretation of such widening ops changes in
3994      future).  See comment in do_shadow_LoadG. */
3995   IRAtom* vatom = expr2vbits( mce, atom );
3996   tl_assert(isOriginalAtom(mce,atom));
3997   switch (op) {
3998
3999      case Iop_Sqrt64Fx2:
4000      case Iop_Abs64Fx2:
4001      case Iop_Neg64Fx2:
4002         return unary64Fx2(mce, vatom);
4003
4004      case Iop_Sqrt64F0x2:
4005         return unary64F0x2(mce, vatom);
4006
4007      case Iop_Sqrt32Fx8:
4008      case Iop_RSqrt32Fx8:
4009      case Iop_Recip32Fx8:
4010         return unary32Fx8(mce, vatom);
4011
4012      case Iop_Sqrt64Fx4:
4013         return unary64Fx4(mce, vatom);
4014
4015      case Iop_Sqrt32Fx4:
4016      case Iop_RSqrt32Fx4:
4017      case Iop_Recip32Fx4:
4018      case Iop_I32UtoFx4:
4019      case Iop_I32StoFx4:
4020      case Iop_QFtoI32Ux4_RZ:
4021      case Iop_QFtoI32Sx4_RZ:
4022      case Iop_RoundF32x4_RM:
4023      case Iop_RoundF32x4_RP:
4024      case Iop_RoundF32x4_RN:
4025      case Iop_RoundF32x4_RZ:
4026      case Iop_Recip32x4:
4027      case Iop_Abs32Fx4:
4028      case Iop_Neg32Fx4:
4029      case Iop_Rsqrte32Fx4:
4030         return unary32Fx4(mce, vatom);
4031
4032      case Iop_I32UtoFx2:
4033      case Iop_I32StoFx2:
4034      case Iop_Recip32Fx2:
4035      case Iop_Recip32x2:
4036      case Iop_Abs32Fx2:
4037      case Iop_Neg32Fx2:
4038      case Iop_Rsqrte32Fx2:
4039         return unary32Fx2(mce, vatom);
4040
4041      case Iop_Sqrt32F0x4:
4042      case Iop_RSqrt32F0x4:
4043      case Iop_Recip32F0x4:
4044         return unary32F0x4(mce, vatom);
4045
4046      case Iop_32UtoV128:
4047      case Iop_64UtoV128:
4048      case Iop_Dup8x16:
4049      case Iop_Dup16x8:
4050      case Iop_Dup32x4:
4051      case Iop_Reverse16_8x16:
4052      case Iop_Reverse32_8x16:
4053      case Iop_Reverse32_16x8:
4054      case Iop_Reverse64_8x16:
4055      case Iop_Reverse64_16x8:
4056      case Iop_Reverse64_32x4:
4057      case Iop_V256toV128_1: case Iop_V256toV128_0:
4058      case Iop_ZeroHI64ofV128:
4059      case Iop_ZeroHI96ofV128:
4060      case Iop_ZeroHI112ofV128:
4061      case Iop_ZeroHI120ofV128:
4062         return assignNew('V', mce, Ity_V128, unop(op, vatom));
4063
4064      case Iop_F128HItoF64:  /* F128 -> high half of F128 */
4065      case Iop_D128HItoD64:  /* D128 -> high half of D128 */
4066         return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
4067      case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
4068      case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
4069         return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
4070
4071      case Iop_NegF128:
4072      case Iop_AbsF128:
4073         return mkPCastTo(mce, Ity_I128, vatom);
4074
4075      case Iop_I32StoF128: /* signed I32 -> F128 */
4076      case Iop_I64StoF128: /* signed I64 -> F128 */
4077      case Iop_I32UtoF128: /* unsigned I32 -> F128 */
4078      case Iop_I64UtoF128: /* unsigned I64 -> F128 */
4079      case Iop_F32toF128:  /* F32 -> F128 */
4080      case Iop_F64toF128:  /* F64 -> F128 */
4081      case Iop_I32StoD128: /* signed I64 -> D128 */
4082      case Iop_I64StoD128: /* signed I64 -> D128 */
4083      case Iop_I32UtoD128: /* unsigned I32 -> D128 */
4084      case Iop_I64UtoD128: /* unsigned I64 -> D128 */
4085         return mkPCastTo(mce, Ity_I128, vatom);
4086
4087      case Iop_F32toF64:
4088      case Iop_I32StoF64:
4089      case Iop_I32UtoF64:
4090      case Iop_NegF64:
4091      case Iop_AbsF64:
4092      case Iop_Est5FRSqrt:
4093      case Iop_RoundF64toF64_NEAREST:
4094      case Iop_RoundF64toF64_NegINF:
4095      case Iop_RoundF64toF64_PosINF:
4096      case Iop_RoundF64toF64_ZERO:
4097      case Iop_Clz64:
4098      case Iop_D32toD64:
4099      case Iop_I32StoD64:
4100      case Iop_I32UtoD64:
4101      case Iop_ExtractExpD64:    /* D64  -> I64 */
4102      case Iop_ExtractExpD128:   /* D128 -> I64 */
4103      case Iop_ExtractSigD64:    /* D64  -> I64 */
4104      case Iop_ExtractSigD128:   /* D128 -> I64 */
4105      case Iop_DPBtoBCD:
4106      case Iop_BCDtoDPB:
4107         return mkPCastTo(mce, Ity_I64, vatom);
4108
4109      case Iop_D64toD128:
4110         return mkPCastTo(mce, Ity_I128, vatom);
4111
4112      case Iop_Clz32:
4113      case Iop_TruncF64asF32:
4114      case Iop_NegF32:
4115      case Iop_AbsF32:
4116         return mkPCastTo(mce, Ity_I32, vatom);
4117
4118      case Iop_Ctz32:
4119      case Iop_Ctz64:
4120         return expensiveCountTrailingZeroes(mce, op, atom, vatom);
4121
4122      case Iop_1Uto64:
4123      case Iop_1Sto64:
4124      case Iop_8Uto64:
4125      case Iop_8Sto64:
4126      case Iop_16Uto64:
4127      case Iop_16Sto64:
4128      case Iop_32Sto64:
4129      case Iop_32Uto64:
4130      case Iop_V128to64:
4131      case Iop_V128HIto64:
4132      case Iop_128HIto64:
4133      case Iop_128to64:
4134      case Iop_Dup8x8:
4135      case Iop_Dup16x4:
4136      case Iop_Dup32x2:
4137      case Iop_Reverse16_8x8:
4138      case Iop_Reverse32_8x8:
4139      case Iop_Reverse32_16x4:
4140      case Iop_Reverse64_8x8:
4141      case Iop_Reverse64_16x4:
4142      case Iop_Reverse64_32x2:
4143      case Iop_V256to64_0: case Iop_V256to64_1:
4144      case Iop_V256to64_2: case Iop_V256to64_3:
4145         return assignNew('V', mce, Ity_I64, unop(op, vatom));
4146
4147      case Iop_64to32:
4148      case Iop_64HIto32:
4149      case Iop_1Uto32:
4150      case Iop_1Sto32:
4151      case Iop_8Uto32:
4152      case Iop_16Uto32:
4153      case Iop_16Sto32:
4154      case Iop_8Sto32:
4155      case Iop_V128to32:
4156         return assignNew('V', mce, Ity_I32, unop(op, vatom));
4157
4158      case Iop_8Sto16:
4159      case Iop_8Uto16:
4160      case Iop_32to16:
4161      case Iop_32HIto16:
4162      case Iop_64to16:
4163      case Iop_GetMSBs8x16:
4164         return assignNew('V', mce, Ity_I16, unop(op, vatom));
4165
4166      case Iop_1Uto8:
4167      case Iop_1Sto8:
4168      case Iop_16to8:
4169      case Iop_16HIto8:
4170      case Iop_32to8:
4171      case Iop_64to8:
4172      case Iop_GetMSBs8x8:
4173         return assignNew('V', mce, Ity_I8, unop(op, vatom));
4174
4175      case Iop_32to1:
4176         return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
4177
4178      case Iop_64to1:
4179         return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
4180
4181      case Iop_ReinterpF64asI64:
4182      case Iop_ReinterpI64asF64:
4183      case Iop_ReinterpI32asF32:
4184      case Iop_ReinterpF32asI32:
4185      case Iop_ReinterpI64asD64:
4186      case Iop_ReinterpD64asI64:
4187      case Iop_NotV256:
4188      case Iop_NotV128:
4189      case Iop_Not64:
4190      case Iop_Not32:
4191      case Iop_Not16:
4192      case Iop_Not8:
4193      case Iop_Not1:
4194         return vatom;
4195
4196      case Iop_CmpNEZ8x8:
4197      case Iop_Cnt8x8:
4198      case Iop_Clz8Sx8:
4199      case Iop_Cls8Sx8:
4200      case Iop_Abs8x8:
4201         return mkPCast8x8(mce, vatom);
4202
4203      case Iop_CmpNEZ8x16:
4204      case Iop_Cnt8x16:
4205      case Iop_Clz8Sx16:
4206      case Iop_Cls8Sx16:
4207      case Iop_Abs8x16:
4208         return mkPCast8x16(mce, vatom);
4209
4210      case Iop_CmpNEZ16x4:
4211      case Iop_Clz16Sx4:
4212      case Iop_Cls16Sx4:
4213      case Iop_Abs16x4:
4214         return mkPCast16x4(mce, vatom);
4215
4216      case Iop_CmpNEZ16x8:
4217      case Iop_Clz16Sx8:
4218      case Iop_Cls16Sx8:
4219      case Iop_Abs16x8:
4220         return mkPCast16x8(mce, vatom);
4221
4222      case Iop_CmpNEZ32x2:
4223      case Iop_Clz32Sx2:
4224      case Iop_Cls32Sx2:
4225      case Iop_FtoI32Ux2_RZ:
4226      case Iop_FtoI32Sx2_RZ:
4227      case Iop_Abs32x2:
4228         return mkPCast32x2(mce, vatom);
4229
4230      case Iop_CmpNEZ32x4:
4231      case Iop_Clz32Sx4:
4232      case Iop_Cls32Sx4:
4233      case Iop_FtoI32Ux4_RZ:
4234      case Iop_FtoI32Sx4_RZ:
4235      case Iop_Abs32x4:
4236         return mkPCast32x4(mce, vatom);
4237
4238      case Iop_CmpwNEZ32:
4239         return mkPCastTo(mce, Ity_I32, vatom);
4240
4241      case Iop_CmpwNEZ64:
4242         return mkPCastTo(mce, Ity_I64, vatom);
4243
4244      case Iop_CmpNEZ64x2:
4245      case Iop_CipherSV128:
4246      case Iop_Clz64x2:
4247         return mkPCast64x2(mce, vatom);
4248
4249      case Iop_PwBitMtxXpose64x2:
4250         return assignNew('V', mce, Ity_V128, unop(op, vatom));
4251
4252      case Iop_NarrowUn16to8x8:
4253      case Iop_NarrowUn32to16x4:
4254      case Iop_NarrowUn64to32x2:
4255      case Iop_QNarrowUn16Sto8Sx8:
4256      case Iop_QNarrowUn16Sto8Ux8:
4257      case Iop_QNarrowUn16Uto8Ux8:
4258      case Iop_QNarrowUn32Sto16Sx4:
4259      case Iop_QNarrowUn32Sto16Ux4:
4260      case Iop_QNarrowUn32Uto16Ux4:
4261      case Iop_QNarrowUn64Sto32Sx2:
4262      case Iop_QNarrowUn64Sto32Ux2:
4263      case Iop_QNarrowUn64Uto32Ux2:
4264         return vectorNarrowUnV128(mce, op, vatom);
4265
4266      case Iop_Widen8Sto16x8:
4267      case Iop_Widen8Uto16x8:
4268      case Iop_Widen16Sto32x4:
4269      case Iop_Widen16Uto32x4:
4270      case Iop_Widen32Sto64x2:
4271      case Iop_Widen32Uto64x2:
4272         return vectorWidenI64(mce, op, vatom);
4273
4274      case Iop_PwAddL32Ux2:
4275      case Iop_PwAddL32Sx2:
4276         return mkPCastTo(mce, Ity_I64,
4277               assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
4278
4279      case Iop_PwAddL16Ux4:
4280      case Iop_PwAddL16Sx4:
4281         return mkPCast32x2(mce,
4282               assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
4283
4284      case Iop_PwAddL8Ux8:
4285      case Iop_PwAddL8Sx8:
4286         return mkPCast16x4(mce,
4287               assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
4288
4289      case Iop_PwAddL32Ux4:
4290      case Iop_PwAddL32Sx4:
4291         return mkPCast64x2(mce,
4292               assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
4293
4294      case Iop_PwAddL16Ux8:
4295      case Iop_PwAddL16Sx8:
4296         return mkPCast32x4(mce,
4297               assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
4298
4299      case Iop_PwAddL8Ux16:
4300      case Iop_PwAddL8Sx16:
4301         return mkPCast16x8(mce,
4302               assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
4303
4304      // TODO: is this correct?
4305      case Iop_AddLV8Ux16:
4306      case Iop_AddLV8Sx16:
4307         return assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom)));
4308
4309      case Iop_AddLV16Ux8:
4310      case Iop_AddLV16Sx8:
4311         return assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom)));
4312
4313      case Iop_AddLV32Ux4:
4314      case Iop_AddLV32Sx4:
4315         return assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom)));
4316
4317      case Iop_I64UtoF32:
4318      default:
4319         ppIROp(op);
4320         VG_(tool_panic)("memcheck:expr2vbits_Unop");
4321   }
4322}
4323
4324
4325/* Worker function -- do not call directly.  See comments on
4326   expr2vbits_Load for the meaning of |guard|.
4327
4328   Generates IR to (1) perform a definedness test of |addr|, (2)
4329   perform a validity test of |addr|, and (3) return the Vbits for the
4330   location indicated by |addr|.  All of this only happens when
4331   |guard| is NULL or |guard| evaluates to True at run time.
4332
4333   If |guard| evaluates to False at run time, the returned value is
4334   the IR-mandated 0x55..55 value, and no checks nor shadow loads are
4335   performed.
4336
4337   The definedness of |guard| itself is not checked.  That is assumed
4338   to have been done before this point, by the caller. */
4339static
4340IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4341                              IREndness end, IRType ty,
4342                              IRAtom* addr, UInt bias, IRAtom* guard )
4343{
4344   tl_assert(isOriginalAtom(mce,addr));
4345   tl_assert(end == Iend_LE || end == Iend_BE);
4346
4347   /* First, emit a definedness test for the address.  This also sets
4348      the address (shadow) to 'defined' following the test. */
4349   complainIfUndefined( mce, addr, guard );
4350
4351   /* Now cook up a call to the relevant helper function, to read the
4352      data V bits from shadow memory. */
4353   ty = shadowTypeV(ty);
4354
4355   void*        helper           = NULL;
4356   const HChar* hname            = NULL;
4357   Bool         ret_via_outparam = False;
4358
4359   if (end == Iend_LE) {
4360      switch (ty) {
4361         case Ity_V256: helper = &MC_(helperc_LOADV256le);
4362                        hname = "MC_(helperc_LOADV256le)";
4363                        ret_via_outparam = True;
4364                        break;
4365         case Ity_V128: helper = &MC_(helperc_LOADV128le);
4366                        hname = "MC_(helperc_LOADV128le)";
4367                        ret_via_outparam = True;
4368                        break;
4369         case Ity_I64:  helper = &MC_(helperc_LOADV64le);
4370                        hname = "MC_(helperc_LOADV64le)";
4371                        break;
4372         case Ity_I32:  helper = &MC_(helperc_LOADV32le);
4373                        hname = "MC_(helperc_LOADV32le)";
4374                        break;
4375         case Ity_I16:  helper = &MC_(helperc_LOADV16le);
4376                        hname = "MC_(helperc_LOADV16le)";
4377                        break;
4378         case Ity_I8:   helper = &MC_(helperc_LOADV8);
4379                        hname = "MC_(helperc_LOADV8)";
4380                        break;
4381         default:       ppIRType(ty);
4382                        VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
4383      }
4384   } else {
4385      switch (ty) {
4386         case Ity_V256: helper = &MC_(helperc_LOADV256be);
4387                        hname = "MC_(helperc_LOADV256be)";
4388                        ret_via_outparam = True;
4389                        break;
4390         case Ity_V128: helper = &MC_(helperc_LOADV128be);
4391                        hname = "MC_(helperc_LOADV128be)";
4392                        ret_via_outparam = True;
4393                        break;
4394         case Ity_I64:  helper = &MC_(helperc_LOADV64be);
4395                        hname = "MC_(helperc_LOADV64be)";
4396                        break;
4397         case Ity_I32:  helper = &MC_(helperc_LOADV32be);
4398                        hname = "MC_(helperc_LOADV32be)";
4399                        break;
4400         case Ity_I16:  helper = &MC_(helperc_LOADV16be);
4401                        hname = "MC_(helperc_LOADV16be)";
4402                        break;
4403         case Ity_I8:   helper = &MC_(helperc_LOADV8);
4404                        hname = "MC_(helperc_LOADV8)";
4405                        break;
4406         default:       ppIRType(ty);
4407                        VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
4408      }
4409   }
4410
4411   tl_assert(helper);
4412   tl_assert(hname);
4413
4414   /* Generate the actual address into addrAct. */
4415   IRAtom* addrAct;
4416   if (bias == 0) {
4417      addrAct = addr;
4418   } else {
4419      IROp    mkAdd;
4420      IRAtom* eBias;
4421      IRType  tyAddr  = mce->hWordTy;
4422      tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
4423      mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
4424      eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
4425      addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
4426   }
4427
4428   /* We need to have a place to park the V bits we're just about to
4429      read. */
4430   IRTemp datavbits = newTemp(mce, ty, VSh);
4431
4432   /* Here's the call. */
4433   IRDirty* di;
4434   if (ret_via_outparam) {
4435      di = unsafeIRDirty_1_N( datavbits,
4436                              2/*regparms*/,
4437                              hname, VG_(fnptr_to_fnentry)( helper ),
4438                              mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
4439   } else {
4440      di = unsafeIRDirty_1_N( datavbits,
4441                              1/*regparms*/,
4442                              hname, VG_(fnptr_to_fnentry)( helper ),
4443                              mkIRExprVec_1( addrAct ) );
4444   }
4445
4446   setHelperAnns( mce, di );
4447   if (guard) {
4448      di->guard = guard;
4449      /* Ideally the didn't-happen return value here would be all-ones
4450         (all-undefined), so it'd be obvious if it got used
4451         inadvertantly.  We can get by with the IR-mandated default
4452         value (0b01 repeating, 0x55 etc) as that'll still look pretty
4453         undefined if it ever leaks out. */
4454   }
4455   stmt( 'V', mce, IRStmt_Dirty(di) );
4456
4457   return mkexpr(datavbits);
4458}
4459
4460
4461/* Generate IR to do a shadow load.  The helper is expected to check
4462   the validity of the address and return the V bits for that address.
4463   This can optionally be controlled by a guard, which is assumed to
4464   be True if NULL.  In the case where the guard is False at runtime,
4465   the helper will return the didn't-do-the-call value of 0x55..55.
4466   Since that means "completely undefined result", the caller of
4467   this function will need to fix up the result somehow in that
4468   case.
4469
4470   Caller of this function is also expected to have checked the
4471   definedness of |guard| before this point.
4472*/
4473static
4474IRAtom* expr2vbits_Load ( MCEnv* mce,
4475                          IREndness end, IRType ty,
4476                          IRAtom* addr, UInt bias,
4477                          IRAtom* guard )
4478{
4479   tl_assert(end == Iend_LE || end == Iend_BE);
4480   switch (shadowTypeV(ty)) {
4481      case Ity_I8:
4482      case Ity_I16:
4483      case Ity_I32:
4484      case Ity_I64:
4485      case Ity_V128:
4486      case Ity_V256:
4487         return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
4488      default:
4489         VG_(tool_panic)("expr2vbits_Load");
4490   }
4491}
4492
4493
4494/* The most general handler for guarded loads.  Assumes the
4495   definedness of GUARD has already been checked by the caller.  A
4496   GUARD of NULL is assumed to mean "always True".  Generates code to
4497   check the definedness and validity of ADDR.
4498
4499   Generate IR to do a shadow load from ADDR and return the V bits.
4500   The loaded type is TY.  The loaded data is then (shadow) widened by
4501   using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
4502   evaluates to False at run time then the returned Vbits are simply
4503   VALT instead.  Note therefore that the argument type of VWIDEN must
4504   be TY and the result type of VWIDEN must equal the type of VALT.
4505*/
4506static
4507IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
4508                                          IREndness end, IRType ty,
4509                                          IRAtom* addr, UInt bias,
4510                                          IRAtom* guard,
4511                                          IROp vwiden, IRAtom* valt )
4512{
4513   /* Sanity check the conversion operation, and also set TYWIDE. */
4514   IRType tyWide = Ity_INVALID;
4515   switch (vwiden) {
4516      case Iop_INVALID:
4517         tyWide = ty;
4518         break;
4519      case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
4520         tyWide = Ity_I32;
4521         break;
4522      default:
4523         VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
4524   }
4525
4526   /* If the guard evaluates to True, this will hold the loaded V bits
4527      at TY.  If the guard evaluates to False, this will be all
4528      ones, meaning "all undefined", in which case we will have to
4529      replace it using an ITE below. */
4530   IRAtom* iftrue1
4531      = assignNew('V', mce, ty,
4532                  expr2vbits_Load(mce, end, ty, addr, bias, guard));
4533   /* Now (shadow-) widen the loaded V bits to the desired width.  In
4534      the guard-is-False case, the allowable widening operators will
4535      in the worst case (unsigned widening) at least leave the
4536      pre-widened part as being marked all-undefined, and in the best
4537      case (signed widening) mark the whole widened result as
4538      undefined.  Anyway, it doesn't matter really, since in this case
4539      we will replace said value with the default value |valt| using an
4540      ITE. */
4541   IRAtom* iftrue2
4542      = vwiden == Iop_INVALID
4543           ? iftrue1
4544           : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
4545   /* These are the V bits we will return if the load doesn't take
4546      place. */
4547   IRAtom* iffalse
4548      = valt;
4549   /* Prepare the cond for the ITE.  Convert a NULL cond into
4550      something that iropt knows how to fold out later. */
4551   IRAtom* cond
4552      = guard == NULL  ? mkU1(1)  : guard;
4553   /* And assemble the final result. */
4554   return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
4555}
4556
4557
4558/* A simpler handler for guarded loads, in which there is no
4559   conversion operation, and the default V bit return (when the guard
4560   evaluates to False at runtime) is "all defined".  If there is no
4561   guard expression or the guard is always TRUE this function behaves
4562   like expr2vbits_Load.  It is assumed that definedness of GUARD has
4563   already been checked at the call site. */
4564static
4565IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
4566                                         IREndness end, IRType ty,
4567                                         IRAtom* addr, UInt bias,
4568                                         IRAtom *guard )
4569{
4570   return expr2vbits_Load_guarded_General(
4571             mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
4572          );
4573}
4574
4575
4576static
4577IRAtom* expr2vbits_ITE ( MCEnv* mce,
4578                         IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
4579{
4580   IRAtom *vbitsC, *vbits0, *vbits1;
4581   IRType ty;
4582   /* Given ITE(cond, iftrue,  iffalse),  generate
4583            ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
4584      That is, steer the V bits like the originals, but trash the
4585      result if the steering value is undefined.  This gives
4586      lazy propagation. */
4587   tl_assert(isOriginalAtom(mce, cond));
4588   tl_assert(isOriginalAtom(mce, iftrue));
4589   tl_assert(isOriginalAtom(mce, iffalse));
4590
4591   vbitsC = expr2vbits(mce, cond);
4592   vbits1 = expr2vbits(mce, iftrue);
4593   vbits0 = expr2vbits(mce, iffalse);
4594   ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
4595
4596   return
4597      mkUifU(mce, ty, assignNew('V', mce, ty,
4598                                     IRExpr_ITE(cond, vbits1, vbits0)),
4599                      mkPCastTo(mce, ty, vbitsC) );
4600}
4601
4602/* --------- This is the main expression-handling function. --------- */
4603
4604static
4605IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
4606{
4607   switch (e->tag) {
4608
4609      case Iex_Get:
4610         return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
4611
4612      case Iex_GetI:
4613         return shadow_GETI( mce, e->Iex.GetI.descr,
4614                                  e->Iex.GetI.ix, e->Iex.GetI.bias );
4615
4616      case Iex_RdTmp:
4617         return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
4618
4619      case Iex_Const:
4620         return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
4621
4622      case Iex_Qop:
4623         return expr2vbits_Qop(
4624                   mce,
4625                   e->Iex.Qop.details->op,
4626                   e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
4627                   e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
4628                );
4629
4630      case Iex_Triop:
4631         return expr2vbits_Triop(
4632                   mce,
4633                   e->Iex.Triop.details->op,
4634                   e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
4635                   e->Iex.Triop.details->arg3
4636                );
4637
4638      case Iex_Binop:
4639         return expr2vbits_Binop(
4640                   mce,
4641                   e->Iex.Binop.op,
4642                   e->Iex.Binop.arg1, e->Iex.Binop.arg2
4643                );
4644
4645      case Iex_Unop:
4646         return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
4647
4648      case Iex_Load:
4649         return expr2vbits_Load( mce, e->Iex.Load.end,
4650                                      e->Iex.Load.ty,
4651                                      e->Iex.Load.addr, 0/*addr bias*/,
4652                                      NULL/* guard == "always True"*/ );
4653
4654      case Iex_CCall:
4655         return mkLazyN( mce, e->Iex.CCall.args,
4656                              e->Iex.CCall.retty,
4657                              e->Iex.CCall.cee );
4658
4659      case Iex_ITE:
4660         return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
4661                                     e->Iex.ITE.iffalse);
4662
4663      default:
4664         VG_(printf)("\n");
4665         ppIRExpr(e);
4666         VG_(printf)("\n");
4667         VG_(tool_panic)("memcheck: expr2vbits");
4668   }
4669}
4670
4671/*------------------------------------------------------------*/
4672/*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
4673/*------------------------------------------------------------*/
4674
4675/* Widen a value to the host word size. */
4676
4677static
4678IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
4679{
4680   IRType ty, tyH;
4681
4682   /* vatom is vbits-value and as such can only have a shadow type. */
4683   tl_assert(isShadowAtom(mce,vatom));
4684
4685   ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
4686   tyH = mce->hWordTy;
4687
4688   if (tyH == Ity_I32) {
4689      switch (ty) {
4690         case Ity_I32:
4691            return vatom;
4692         case Ity_I16:
4693            return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
4694         case Ity_I8:
4695            return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
4696         default:
4697            goto unhandled;
4698      }
4699   } else
4700   if (tyH == Ity_I64) {
4701      switch (ty) {
4702         case Ity_I32:
4703            return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
4704         case Ity_I16:
4705            return assignNew('V', mce, tyH, unop(Iop_32Uto64,
4706                   assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
4707         case Ity_I8:
4708            return assignNew('V', mce, tyH, unop(Iop_32Uto64,
4709                   assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
4710         default:
4711            goto unhandled;
4712      }
4713   } else {
4714      goto unhandled;
4715   }
4716  unhandled:
4717   VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
4718   VG_(tool_panic)("zwidenToHostWord");
4719}
4720
4721
4722/* Generate a shadow store.  |addr| is always the original address
4723   atom.  You can pass in either originals or V-bits for the data
4724   atom, but obviously not both.  This function generates a check for
4725   the definedness and (indirectly) the validity of |addr|, but only
4726   when |guard| evaluates to True at run time (or is NULL).
4727
4728   |guard| :: Ity_I1 controls whether the store really happens; NULL
4729   means it unconditionally does.  Note that |guard| itself is not
4730   checked for definedness; the caller of this function must do that
4731   if necessary.
4732*/
4733static
4734void do_shadow_Store ( MCEnv* mce,
4735                       IREndness end,
4736                       IRAtom* addr, UInt bias,
4737                       IRAtom* data, IRAtom* vdata,
4738                       IRAtom* guard )
4739{
4740   IROp     mkAdd;
4741   IRType   ty, tyAddr;
4742   void*    helper = NULL;
4743   const HChar* hname = NULL;
4744   IRConst* c;
4745
4746   tyAddr = mce->hWordTy;
4747   mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
4748   tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
4749   tl_assert( end == Iend_LE || end == Iend_BE );
4750
4751   if (data) {
4752      tl_assert(!vdata);
4753      tl_assert(isOriginalAtom(mce, data));
4754      tl_assert(bias == 0);
4755      vdata = expr2vbits( mce, data );
4756   } else {
4757      tl_assert(vdata);
4758   }
4759
4760   tl_assert(isOriginalAtom(mce,addr));
4761   tl_assert(isShadowAtom(mce,vdata));
4762
4763   if (guard) {
4764      tl_assert(isOriginalAtom(mce, guard));
4765      tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
4766   }
4767
4768   ty = typeOfIRExpr(mce->sb->tyenv, vdata);
4769
4770   // If we're not doing undefined value checking, pretend that this value
4771   // is "all valid".  That lets Vex's optimiser remove some of the V bit
4772   // shadow computation ops that precede it.
4773   if (MC_(clo_mc_level) == 1) {
4774      switch (ty) {
4775         case Ity_V256: // V256 weirdness -- used four times
4776                        c = IRConst_V256(V_BITS32_DEFINED); break;
4777         case Ity_V128: // V128 weirdness -- used twice
4778                        c = IRConst_V128(V_BITS16_DEFINED); break;
4779         case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
4780         case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
4781         case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
4782         case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
4783         default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
4784      }
4785      vdata = IRExpr_Const( c );
4786   }
4787
4788   /* First, emit a definedness test for the address.  This also sets
4789      the address (shadow) to 'defined' following the test.  Both of
4790      those actions are gated on |guard|. */
4791   complainIfUndefined( mce, addr, guard );
4792
4793   /* Now decide which helper function to call to write the data V
4794      bits into shadow memory. */
4795   if (end == Iend_LE) {
4796      switch (ty) {
4797         case Ity_V256: /* we'll use the helper four times */
4798         case Ity_V128: /* we'll use the helper twice */
4799         case Ity_I64: helper = &MC_(helperc_STOREV64le);
4800                       hname = "MC_(helperc_STOREV64le)";
4801                       break;
4802         case Ity_I32: helper = &MC_(helperc_STOREV32le);
4803                       hname = "MC_(helperc_STOREV32le)";
4804                       break;
4805         case Ity_I16: helper = &MC_(helperc_STOREV16le);
4806                       hname = "MC_(helperc_STOREV16le)";
4807                       break;
4808         case Ity_I8:  helper = &MC_(helperc_STOREV8);
4809                       hname = "MC_(helperc_STOREV8)";
4810                       break;
4811         default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
4812      }
4813   } else {
4814      switch (ty) {
4815         case Ity_V128: /* we'll use the helper twice */
4816         case Ity_I64: helper = &MC_(helperc_STOREV64be);
4817                       hname = "MC_(helperc_STOREV64be)";
4818                       break;
4819         case Ity_I32: helper = &MC_(helperc_STOREV32be);
4820                       hname = "MC_(helperc_STOREV32be)";
4821                       break;
4822         case Ity_I16: helper = &MC_(helperc_STOREV16be);
4823                       hname = "MC_(helperc_STOREV16be)";
4824                       break;
4825         case Ity_I8:  helper = &MC_(helperc_STOREV8);
4826                       hname = "MC_(helperc_STOREV8)";
4827                       break;
4828         /* Note, no V256 case here, because no big-endian target that
4829            we support, has 256 vectors. */
4830         default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
4831      }
4832   }
4833
4834   if (UNLIKELY(ty == Ity_V256)) {
4835
4836      /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
4837         Q3 being the most significant lane. */
4838      /* These are the offsets of the Qs in memory. */
4839      Int     offQ0, offQ1, offQ2, offQ3;
4840
4841      /* Various bits for constructing the 4 lane helper calls */
4842      IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
4843      IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
4844      IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
4845      IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
4846
4847      if (end == Iend_LE) {
4848         offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
4849      } else {
4850         offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
4851      }
4852
4853      eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
4854      addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
4855      vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
4856      diQ0    = unsafeIRDirty_0_N(
4857                   1/*regparms*/,
4858                   hname, VG_(fnptr_to_fnentry)( helper ),
4859                   mkIRExprVec_2( addrQ0, vdataQ0 )
4860                );
4861
4862      eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
4863      addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
4864      vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
4865      diQ1    = unsafeIRDirty_0_N(
4866                   1/*regparms*/,
4867                   hname, VG_(fnptr_to_fnentry)( helper ),
4868                   mkIRExprVec_2( addrQ1, vdataQ1 )
4869                );
4870
4871      eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
4872      addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
4873      vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
4874      diQ2    = unsafeIRDirty_0_N(
4875                   1/*regparms*/,
4876                   hname, VG_(fnptr_to_fnentry)( helper ),
4877                   mkIRExprVec_2( addrQ2, vdataQ2 )
4878                );
4879
4880      eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
4881      addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
4882      vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
4883      diQ3    = unsafeIRDirty_0_N(
4884                   1/*regparms*/,
4885                   hname, VG_(fnptr_to_fnentry)( helper ),
4886                   mkIRExprVec_2( addrQ3, vdataQ3 )
4887                );
4888
4889      if (guard)
4890         diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
4891
4892      setHelperAnns( mce, diQ0 );
4893      setHelperAnns( mce, diQ1 );
4894      setHelperAnns( mce, diQ2 );
4895      setHelperAnns( mce, diQ3 );
4896      stmt( 'V', mce, IRStmt_Dirty(diQ0) );
4897      stmt( 'V', mce, IRStmt_Dirty(diQ1) );
4898      stmt( 'V', mce, IRStmt_Dirty(diQ2) );
4899      stmt( 'V', mce, IRStmt_Dirty(diQ3) );
4900
4901   }
4902   else if (UNLIKELY(ty == Ity_V128)) {
4903
4904      /* V128-bit case */
4905      /* See comment in next clause re 64-bit regparms */
4906      /* also, need to be careful about endianness */
4907
4908      Int     offLo64, offHi64;
4909      IRDirty *diLo64, *diHi64;
4910      IRAtom  *addrLo64, *addrHi64;
4911      IRAtom  *vdataLo64, *vdataHi64;
4912      IRAtom  *eBiasLo64, *eBiasHi64;
4913
4914      if (end == Iend_LE) {
4915         offLo64 = 0;
4916         offHi64 = 8;
4917      } else {
4918         offLo64 = 8;
4919         offHi64 = 0;
4920      }
4921
4922      eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
4923      addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
4924      vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
4925      diLo64    = unsafeIRDirty_0_N(
4926                     1/*regparms*/,
4927                     hname, VG_(fnptr_to_fnentry)( helper ),
4928                     mkIRExprVec_2( addrLo64, vdataLo64 )
4929                  );
4930      eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
4931      addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
4932      vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
4933      diHi64    = unsafeIRDirty_0_N(
4934                     1/*regparms*/,
4935                     hname, VG_(fnptr_to_fnentry)( helper ),
4936                     mkIRExprVec_2( addrHi64, vdataHi64 )
4937                  );
4938      if (guard) diLo64->guard = guard;
4939      if (guard) diHi64->guard = guard;
4940      setHelperAnns( mce, diLo64 );
4941      setHelperAnns( mce, diHi64 );
4942      stmt( 'V', mce, IRStmt_Dirty(diLo64) );
4943      stmt( 'V', mce, IRStmt_Dirty(diHi64) );
4944
4945   } else {
4946
4947      IRDirty *di;
4948      IRAtom  *addrAct;
4949
4950      /* 8/16/32/64-bit cases */
4951      /* Generate the actual address into addrAct. */
4952      if (bias == 0) {
4953         addrAct = addr;
4954      } else {
4955         IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
4956         addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
4957      }
4958
4959      if (ty == Ity_I64) {
4960         /* We can't do this with regparm 2 on 32-bit platforms, since
4961            the back ends aren't clever enough to handle 64-bit
4962            regparm args.  Therefore be different. */
4963         di = unsafeIRDirty_0_N(
4964                 1/*regparms*/,
4965                 hname, VG_(fnptr_to_fnentry)( helper ),
4966                 mkIRExprVec_2( addrAct, vdata )
4967              );
4968      } else {
4969         di = unsafeIRDirty_0_N(
4970                 2/*regparms*/,
4971                 hname, VG_(fnptr_to_fnentry)( helper ),
4972                 mkIRExprVec_2( addrAct,
4973                                zwidenToHostWord( mce, vdata ))
4974              );
4975      }
4976      if (guard) di->guard = guard;
4977      setHelperAnns( mce, di );
4978      stmt( 'V', mce, IRStmt_Dirty(di) );
4979   }
4980
4981}
4982
4983
4984/* Do lazy pessimistic propagation through a dirty helper call, by
4985   looking at the annotations on it.  This is the most complex part of
4986   Memcheck. */
4987
4988static IRType szToITy ( Int n )
4989{
4990   switch (n) {
4991      case 1: return Ity_I8;
4992      case 2: return Ity_I16;
4993      case 4: return Ity_I32;
4994      case 8: return Ity_I64;
4995      default: VG_(tool_panic)("szToITy(memcheck)");
4996   }
4997}
4998
4999static
5000void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
5001{
5002   Int       i, k, n, toDo, gSz, gOff;
5003   IRAtom    *src, *here, *curr;
5004   IRType    tySrc, tyDst;
5005   IRTemp    dst;
5006   IREndness end;
5007
5008   /* What's the native endianness?  We need to know this. */
5009#  if defined(VG_BIGENDIAN)
5010   end = Iend_BE;
5011#  elif defined(VG_LITTLEENDIAN)
5012   end = Iend_LE;
5013#  else
5014#    error "Unknown endianness"
5015#  endif
5016
5017   /* First check the guard. */
5018   complainIfUndefined(mce, d->guard, NULL);
5019
5020   /* Now round up all inputs and PCast over them. */
5021   curr = definedOfType(Ity_I32);
5022
5023   /* Inputs: unmasked args
5024      Note: arguments are evaluated REGARDLESS of the guard expression */
5025   for (i = 0; d->args[i]; i++) {
5026      IRAtom* arg = d->args[i];
5027      if ( (d->cee->mcx_mask & (1<<i))
5028           || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
5029         /* ignore this arg */
5030      } else {
5031         here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg) );
5032         curr = mkUifU32(mce, here, curr);
5033      }
5034   }
5035
5036   /* Inputs: guest state that we read. */
5037   for (i = 0; i < d->nFxState; i++) {
5038      tl_assert(d->fxState[i].fx != Ifx_None);
5039      if (d->fxState[i].fx == Ifx_Write)
5040         continue;
5041
5042      /* Enumerate the described state segments */
5043      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5044         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5045         gSz  = d->fxState[i].size;
5046
5047         /* Ignore any sections marked as 'always defined'. */
5048         if (isAlwaysDefd(mce, gOff, gSz)) {
5049            if (0)
5050            VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
5051                        gOff, gSz);
5052            continue;
5053         }
5054
5055         /* This state element is read or modified.  So we need to
5056            consider it.  If larger than 8 bytes, deal with it in
5057            8-byte chunks. */
5058         while (True) {
5059            tl_assert(gSz >= 0);
5060            if (gSz == 0) break;
5061            n = gSz <= 8 ? gSz : 8;
5062            /* update 'curr' with UifU of the state slice
5063               gOff .. gOff+n-1 */
5064            tySrc = szToITy( n );
5065
5066            /* Observe the guard expression. If it is false use an
5067               all-bits-defined bit pattern */
5068            IRAtom *cond, *iffalse, *iftrue;
5069
5070            cond    = assignNew('V', mce, Ity_I1, d->guard);
5071            iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
5072            iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
5073            src     = assignNew('V', mce, tySrc,
5074                                IRExpr_ITE(cond, iftrue, iffalse));
5075
5076            here = mkPCastTo( mce, Ity_I32, src );
5077            curr = mkUifU32(mce, here, curr);
5078            gSz -= n;
5079            gOff += n;
5080         }
5081      }
5082   }
5083
5084   /* Inputs: memory.  First set up some info needed regardless of
5085      whether we're doing reads or writes. */
5086
5087   if (d->mFx != Ifx_None) {
5088      /* Because we may do multiple shadow loads/stores from the same
5089         base address, it's best to do a single test of its
5090         definedness right now.  Post-instrumentation optimisation
5091         should remove all but this test. */
5092      IRType tyAddr;
5093      tl_assert(d->mAddr);
5094      complainIfUndefined(mce, d->mAddr, d->guard);
5095
5096      tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
5097      tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
5098      tl_assert(tyAddr == mce->hWordTy); /* not really right */
5099   }
5100
5101   /* Deal with memory inputs (reads or modifies) */
5102   if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
5103      toDo   = d->mSize;
5104      /* chew off 32-bit chunks.  We don't care about the endianness
5105         since it's all going to be condensed down to a single bit,
5106         but nevertheless choose an endianness which is hopefully
5107         native to the platform. */
5108      while (toDo >= 4) {
5109         here = mkPCastTo(
5110                   mce, Ity_I32,
5111                   expr2vbits_Load_guarded_Simple(
5112                      mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
5113                );
5114         curr = mkUifU32(mce, here, curr);
5115         toDo -= 4;
5116      }
5117      /* chew off 16-bit chunks */
5118      while (toDo >= 2) {
5119         here = mkPCastTo(
5120                   mce, Ity_I32,
5121                   expr2vbits_Load_guarded_Simple(
5122                      mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
5123                );
5124         curr = mkUifU32(mce, here, curr);
5125         toDo -= 2;
5126      }
5127      /* chew off the remaining 8-bit chunk, if any */
5128      if (toDo == 1) {
5129         here = mkPCastTo(
5130                   mce, Ity_I32,
5131                   expr2vbits_Load_guarded_Simple(
5132                      mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
5133                );
5134         curr = mkUifU32(mce, here, curr);
5135         toDo -= 1;
5136      }
5137      tl_assert(toDo == 0);
5138   }
5139
5140   /* Whew!  So curr is a 32-bit V-value summarising pessimistically
5141      all the inputs to the helper.  Now we need to re-distribute the
5142      results to all destinations. */
5143
5144   /* Outputs: the destination temporary, if there is one. */
5145   if (d->tmp != IRTemp_INVALID) {
5146      dst   = findShadowTmpV(mce, d->tmp);
5147      tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
5148      assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
5149   }
5150
5151   /* Outputs: guest state that we write or modify. */
5152   for (i = 0; i < d->nFxState; i++) {
5153      tl_assert(d->fxState[i].fx != Ifx_None);
5154      if (d->fxState[i].fx == Ifx_Read)
5155         continue;
5156
5157      /* Enumerate the described state segments */
5158      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5159         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5160         gSz  = d->fxState[i].size;
5161
5162         /* Ignore any sections marked as 'always defined'. */
5163         if (isAlwaysDefd(mce, gOff, gSz))
5164            continue;
5165
5166         /* This state element is written or modified.  So we need to
5167            consider it.  If larger than 8 bytes, deal with it in
5168            8-byte chunks. */
5169         while (True) {
5170            tl_assert(gSz >= 0);
5171            if (gSz == 0) break;
5172            n = gSz <= 8 ? gSz : 8;
5173            /* Write suitably-casted 'curr' to the state slice
5174               gOff .. gOff+n-1 */
5175            tyDst = szToITy( n );
5176            do_shadow_PUT( mce, gOff,
5177                                NULL, /* original atom */
5178                                mkPCastTo( mce, tyDst, curr ), d->guard );
5179            gSz -= n;
5180            gOff += n;
5181         }
5182      }
5183   }
5184
5185   /* Outputs: memory that we write or modify.  Same comments about
5186      endianness as above apply. */
5187   if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
5188      toDo   = d->mSize;
5189      /* chew off 32-bit chunks */
5190      while (toDo >= 4) {
5191         do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5192                          NULL, /* original data */
5193                          mkPCastTo( mce, Ity_I32, curr ),
5194                          d->guard );
5195         toDo -= 4;
5196      }
5197      /* chew off 16-bit chunks */
5198      while (toDo >= 2) {
5199         do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5200                          NULL, /* original data */
5201                          mkPCastTo( mce, Ity_I16, curr ),
5202                          d->guard );
5203         toDo -= 2;
5204      }
5205      /* chew off the remaining 8-bit chunk, if any */
5206      if (toDo == 1) {
5207         do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5208                          NULL, /* original data */
5209                          mkPCastTo( mce, Ity_I8, curr ),
5210                          d->guard );
5211         toDo -= 1;
5212      }
5213      tl_assert(toDo == 0);
5214   }
5215
5216}
5217
5218
5219/* We have an ABI hint telling us that [base .. base+len-1] is to
5220   become undefined ("writable").  Generate code to call a helper to
5221   notify the A/V bit machinery of this fact.
5222
5223   We call
5224   void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
5225                                                    Addr nia );
5226*/
5227static
5228void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
5229{
5230   IRDirty* di;
5231   /* Minor optimisation: if not doing origin tracking, ignore the
5232      supplied nia and pass zero instead.  This is on the basis that
5233      MC_(helperc_MAKE_STACK_UNINIT) will ignore it anyway, and we can
5234      almost always generate a shorter instruction to put zero into a
5235      register than any other value. */
5236   if (MC_(clo_mc_level) < 3)
5237      nia = mkIRExpr_HWord(0);
5238
5239   di = unsafeIRDirty_0_N(
5240           0/*regparms*/,
5241           "MC_(helperc_MAKE_STACK_UNINIT)",
5242           VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT) ),
5243           mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
5244        );
5245   stmt( 'V', mce, IRStmt_Dirty(di) );
5246}
5247
5248
5249/* ------ Dealing with IRCAS (big and complex) ------ */
5250
5251/* FWDS */
5252static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
5253                             IRAtom* baseaddr, Int offset );
5254static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
5255static void    gen_store_b ( MCEnv* mce, Int szB,
5256                             IRAtom* baseaddr, Int offset, IRAtom* dataB,
5257                             IRAtom* guard );
5258
5259static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
5260static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
5261
5262
5263/* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
5264   IRExpr.Consts, else this asserts.  If they are both Consts, it
5265   doesn't do anything.  So that just leaves the RdTmp case.
5266
5267   In which case: this assigns the shadow value SHADOW to the IR
5268   shadow temporary associated with ORIG.  That is, ORIG, being an
5269   original temporary, will have a shadow temporary associated with
5270   it.  However, in the case envisaged here, there will so far have
5271   been no IR emitted to actually write a shadow value into that
5272   temporary.  What this routine does is to (emit IR to) copy the
5273   value in SHADOW into said temporary, so that after this call,
5274   IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
5275   value in SHADOW.
5276
5277   Point is to allow callers to compute "by hand" a shadow value for
5278   ORIG, and force it to be associated with ORIG.
5279
5280   How do we know that that shadow associated with ORIG has not so far
5281   been assigned to?  Well, we don't per se know that, but supposing
5282   it had.  Then this routine would create a second assignment to it,
5283   and later the IR sanity checker would barf.  But that never
5284   happens.  QED.
5285*/
5286static void bind_shadow_tmp_to_orig ( UChar how,
5287                                      MCEnv* mce,
5288                                      IRAtom* orig, IRAtom* shadow )
5289{
5290   tl_assert(isOriginalAtom(mce, orig));
5291   tl_assert(isShadowAtom(mce, shadow));
5292   switch (orig->tag) {
5293      case Iex_Const:
5294         tl_assert(shadow->tag == Iex_Const);
5295         break;
5296      case Iex_RdTmp:
5297         tl_assert(shadow->tag == Iex_RdTmp);
5298         if (how == 'V') {
5299            assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
5300                   shadow);
5301         } else {
5302            tl_assert(how == 'B');
5303            assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
5304                   shadow);
5305         }
5306         break;
5307      default:
5308         tl_assert(0);
5309   }
5310}
5311
5312
5313static
5314void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
5315{
5316   /* Scheme is (both single- and double- cases):
5317
5318      1. fetch data#,dataB (the proposed new value)
5319
5320      2. fetch expd#,expdB (what we expect to see at the address)
5321
5322      3. check definedness of address
5323
5324      4. load old#,oldB from shadow memory; this also checks
5325         addressibility of the address
5326
5327      5. the CAS itself
5328
5329      6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
5330
5331      7. if "expected == old" (as computed by (6))
5332            store data#,dataB to shadow memory
5333
5334      Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
5335      'data' but 7 stores 'data#'.  Hence it is possible for the
5336      shadow data to be incorrectly checked and/or updated:
5337
5338      * 7 is at least gated correctly, since the 'expected == old'
5339        condition is derived from outputs of 5.  However, the shadow
5340        write could happen too late: imagine after 5 we are
5341        descheduled, a different thread runs, writes a different
5342        (shadow) value at the address, and then we resume, hence
5343        overwriting the shadow value written by the other thread.
5344
5345      Because the original memory access is atomic, there's no way to
5346      make both the original and shadow accesses into a single atomic
5347      thing, hence this is unavoidable.
5348
5349      At least as Valgrind stands, I don't think it's a problem, since
5350      we're single threaded *and* we guarantee that there are no
5351      context switches during the execution of any specific superblock
5352      -- context switches can only happen at superblock boundaries.
5353
5354      If Valgrind ever becomes MT in the future, then it might be more
5355      of a problem.  A possible kludge would be to artificially
5356      associate with the location, a lock, which we must acquire and
5357      release around the transaction as a whole.  Hmm, that probably
5358      would't work properly since it only guards us against other
5359      threads doing CASs on the same location, not against other
5360      threads doing normal reads and writes.
5361
5362      ------------------------------------------------------------
5363
5364      COMMENT_ON_CasCmpEQ:
5365
5366      Note two things.  Firstly, in the sequence above, we compute
5367      "expected == old", but we don't check definedness of it.  Why
5368      not?  Also, the x86 and amd64 front ends use
5369      Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
5370      determination (expected == old ?) for themselves, and we also
5371      don't check definedness for those primops; we just say that the
5372      result is defined.  Why?  Details follow.
5373
5374      x86/amd64 contains various forms of locked insns:
5375      * lock prefix before all basic arithmetic insn;
5376        eg lock xorl %reg1,(%reg2)
5377      * atomic exchange reg-mem
5378      * compare-and-swaps
5379
5380      Rather than attempt to represent them all, which would be a
5381      royal PITA, I used a result from Maurice Herlihy
5382      (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
5383      demonstrates that compare-and-swap is a primitive more general
5384      than the other two, and so can be used to represent all of them.
5385      So the translation scheme for (eg) lock incl (%reg) is as
5386      follows:
5387
5388        again:
5389         old = * %reg
5390         new = old + 1
5391         atomically { if (* %reg == old) { * %reg = new } else { goto again } }
5392
5393      The "atomically" is the CAS bit.  The scheme is always the same:
5394      get old value from memory, compute new value, atomically stuff
5395      new value back in memory iff the old value has not changed (iow,
5396      no other thread modified it in the meantime).  If it has changed
5397      then we've been out-raced and we have to start over.
5398
5399      Now that's all very neat, but it has the bad side effect of
5400      introducing an explicit equality test into the translation.
5401      Consider the behaviour of said code on a memory location which
5402      is uninitialised.  We will wind up doing a comparison on
5403      uninitialised data, and mc duly complains.
5404
5405      What's difficult about this is, the common case is that the
5406      location is uncontended, and so we're usually comparing the same
5407      value (* %reg) with itself.  So we shouldn't complain even if it
5408      is undefined.  But mc doesn't know that.
5409
5410      My solution is to mark the == in the IR specially, so as to tell
5411      mc that it almost certainly compares a value with itself, and we
5412      should just regard the result as always defined.  Rather than
5413      add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
5414      Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
5415
5416      So there's always the question of, can this give a false
5417      negative?  eg, imagine that initially, * %reg is defined; and we
5418      read that; but then in the gap between the read and the CAS, a
5419      different thread writes an undefined (and different) value at
5420      the location.  Then the CAS in this thread will fail and we will
5421      go back to "again:", but without knowing that the trip back
5422      there was based on an undefined comparison.  No matter; at least
5423      the other thread won the race and the location is correctly
5424      marked as undefined.  What if it wrote an uninitialised version
5425      of the same value that was there originally, though?
5426
5427      etc etc.  Seems like there's a small corner case in which we
5428      might lose the fact that something's defined -- we're out-raced
5429      in between the "old = * reg" and the "atomically {", _and_ the
5430      other thread is writing in an undefined version of what's
5431      already there.  Well, that seems pretty unlikely.
5432
5433      ---
5434
5435      If we ever need to reinstate it .. code which generates a
5436      definedness test for "expected == old" was removed at r10432 of
5437      this file.
5438   */
5439   if (cas->oldHi == IRTemp_INVALID) {
5440      do_shadow_CAS_single( mce, cas );
5441   } else {
5442      do_shadow_CAS_double( mce, cas );
5443   }
5444}
5445
5446
5447static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
5448{
5449   IRAtom *vdataLo = NULL, *bdataLo = NULL;
5450   IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
5451   IRAtom *voldLo  = NULL, *boldLo  = NULL;
5452   IRAtom *expd_eq_old = NULL;
5453   IROp   opCasCmpEQ;
5454   Int    elemSzB;
5455   IRType elemTy;
5456   Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
5457
5458   /* single CAS */
5459   tl_assert(cas->oldHi == IRTemp_INVALID);
5460   tl_assert(cas->expdHi == NULL);
5461   tl_assert(cas->dataHi == NULL);
5462
5463   elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
5464   switch (elemTy) {
5465      case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
5466      case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
5467      case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
5468      case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
5469      default: tl_assert(0); /* IR defn disallows any other types */
5470   }
5471
5472   /* 1. fetch data# (the proposed new value) */
5473   tl_assert(isOriginalAtom(mce, cas->dataLo));
5474   vdataLo
5475      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
5476   tl_assert(isShadowAtom(mce, vdataLo));
5477   if (otrak) {
5478      bdataLo
5479         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
5480      tl_assert(isShadowAtom(mce, bdataLo));
5481   }
5482
5483   /* 2. fetch expected# (what we expect to see at the address) */
5484   tl_assert(isOriginalAtom(mce, cas->expdLo));
5485   vexpdLo
5486      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
5487   tl_assert(isShadowAtom(mce, vexpdLo));
5488   if (otrak) {
5489      bexpdLo
5490         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
5491      tl_assert(isShadowAtom(mce, bexpdLo));
5492   }
5493
5494   /* 3. check definedness of address */
5495   /* 4. fetch old# from shadow memory; this also checks
5496         addressibility of the address */
5497   voldLo
5498      = assignNew(
5499           'V', mce, elemTy,
5500           expr2vbits_Load(
5501              mce,
5502              cas->end, elemTy, cas->addr, 0/*Addr bias*/,
5503              NULL/*always happens*/
5504        ));
5505   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
5506   if (otrak) {
5507      boldLo
5508         = assignNew('B', mce, Ity_I32,
5509                     gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
5510      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
5511   }
5512
5513   /* 5. the CAS itself */
5514   stmt( 'C', mce, IRStmt_CAS(cas) );
5515
5516   /* 6. compute "expected == old" */
5517   /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
5518   /* Note that 'C' is kinda faking it; it is indeed a non-shadow
5519      tree, but it's not copied from the input block. */
5520   expd_eq_old
5521      = assignNew('C', mce, Ity_I1,
5522                  binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
5523
5524   /* 7. if "expected == old"
5525            store data# to shadow memory */
5526   do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
5527                    NULL/*data*/, vdataLo/*vdata*/,
5528                    expd_eq_old/*guard for store*/ );
5529   if (otrak) {
5530      gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
5531                   bdataLo/*bdata*/,
5532                   expd_eq_old/*guard for store*/ );
5533   }
5534}
5535
5536
5537static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
5538{
5539   IRAtom *vdataHi = NULL, *bdataHi = NULL;
5540   IRAtom *vdataLo = NULL, *bdataLo = NULL;
5541   IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
5542   IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
5543   IRAtom *voldHi  = NULL, *boldHi  = NULL;
5544   IRAtom *voldLo  = NULL, *boldLo  = NULL;
5545   IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
5546   IRAtom *expd_eq_old = NULL, *zero = NULL;
5547   IROp   opCasCmpEQ, opOr, opXor;
5548   Int    elemSzB, memOffsLo, memOffsHi;
5549   IRType elemTy;
5550   Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
5551
5552   /* double CAS */
5553   tl_assert(cas->oldHi != IRTemp_INVALID);
5554   tl_assert(cas->expdHi != NULL);
5555   tl_assert(cas->dataHi != NULL);
5556
5557   elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
5558   switch (elemTy) {
5559      case Ity_I8:
5560         opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
5561         elemSzB = 1; zero = mkU8(0);
5562         break;
5563      case Ity_I16:
5564         opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
5565         elemSzB = 2; zero = mkU16(0);
5566         break;
5567      case Ity_I32:
5568         opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
5569         elemSzB = 4; zero = mkU32(0);
5570         break;
5571      case Ity_I64:
5572         opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
5573         elemSzB = 8; zero = mkU64(0);
5574         break;
5575      default:
5576         tl_assert(0); /* IR defn disallows any other types */
5577   }
5578
5579   /* 1. fetch data# (the proposed new value) */
5580   tl_assert(isOriginalAtom(mce, cas->dataHi));
5581   tl_assert(isOriginalAtom(mce, cas->dataLo));
5582   vdataHi
5583      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
5584   vdataLo
5585      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
5586   tl_assert(isShadowAtom(mce, vdataHi));
5587   tl_assert(isShadowAtom(mce, vdataLo));
5588   if (otrak) {
5589      bdataHi
5590         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
5591      bdataLo
5592         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
5593      tl_assert(isShadowAtom(mce, bdataHi));
5594      tl_assert(isShadowAtom(mce, bdataLo));
5595   }
5596
5597   /* 2. fetch expected# (what we expect to see at the address) */
5598   tl_assert(isOriginalAtom(mce, cas->expdHi));
5599   tl_assert(isOriginalAtom(mce, cas->expdLo));
5600   vexpdHi
5601      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
5602   vexpdLo
5603      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
5604   tl_assert(isShadowAtom(mce, vexpdHi));
5605   tl_assert(isShadowAtom(mce, vexpdLo));
5606   if (otrak) {
5607      bexpdHi
5608         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
5609      bexpdLo
5610         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
5611      tl_assert(isShadowAtom(mce, bexpdHi));
5612      tl_assert(isShadowAtom(mce, bexpdLo));
5613   }
5614
5615   /* 3. check definedness of address */
5616   /* 4. fetch old# from shadow memory; this also checks
5617         addressibility of the address */
5618   if (cas->end == Iend_LE) {
5619      memOffsLo = 0;
5620      memOffsHi = elemSzB;
5621   } else {
5622      tl_assert(cas->end == Iend_BE);
5623      memOffsLo = elemSzB;
5624      memOffsHi = 0;
5625   }
5626   voldHi
5627      = assignNew(
5628           'V', mce, elemTy,
5629           expr2vbits_Load(
5630              mce,
5631              cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
5632              NULL/*always happens*/
5633        ));
5634   voldLo
5635      = assignNew(
5636           'V', mce, elemTy,
5637           expr2vbits_Load(
5638              mce,
5639              cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
5640              NULL/*always happens*/
5641        ));
5642   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
5643   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
5644   if (otrak) {
5645      boldHi
5646         = assignNew('B', mce, Ity_I32,
5647                     gen_load_b(mce, elemSzB, cas->addr,
5648                                memOffsHi/*addr bias*/));
5649      boldLo
5650         = assignNew('B', mce, Ity_I32,
5651                     gen_load_b(mce, elemSzB, cas->addr,
5652                                memOffsLo/*addr bias*/));
5653      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
5654      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
5655   }
5656
5657   /* 5. the CAS itself */
5658   stmt( 'C', mce, IRStmt_CAS(cas) );
5659
5660   /* 6. compute "expected == old" */
5661   /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
5662   /* Note that 'C' is kinda faking it; it is indeed a non-shadow
5663      tree, but it's not copied from the input block. */
5664   /*
5665      xHi = oldHi ^ expdHi;
5666      xLo = oldLo ^ expdLo;
5667      xHL = xHi | xLo;
5668      expd_eq_old = xHL == 0;
5669   */
5670   xHi = assignNew('C', mce, elemTy,
5671                   binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
5672   xLo = assignNew('C', mce, elemTy,
5673                   binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
5674   xHL = assignNew('C', mce, elemTy,
5675                   binop(opOr, xHi, xLo));
5676   expd_eq_old
5677      = assignNew('C', mce, Ity_I1,
5678                  binop(opCasCmpEQ, xHL, zero));
5679
5680   /* 7. if "expected == old"
5681            store data# to shadow memory */
5682   do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
5683                    NULL/*data*/, vdataHi/*vdata*/,
5684                    expd_eq_old/*guard for store*/ );
5685   do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
5686                    NULL/*data*/, vdataLo/*vdata*/,
5687                    expd_eq_old/*guard for store*/ );
5688   if (otrak) {
5689      gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
5690                   bdataHi/*bdata*/,
5691                   expd_eq_old/*guard for store*/ );
5692      gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
5693                   bdataLo/*bdata*/,
5694                   expd_eq_old/*guard for store*/ );
5695   }
5696}
5697
5698
5699/* ------ Dealing with LL/SC (not difficult) ------ */
5700
5701static void do_shadow_LLSC ( MCEnv*    mce,
5702                             IREndness stEnd,
5703                             IRTemp    stResult,
5704                             IRExpr*   stAddr,
5705                             IRExpr*   stStoredata )
5706{
5707   /* In short: treat a load-linked like a normal load followed by an
5708      assignment of the loaded (shadow) data to the result temporary.
5709      Treat a store-conditional like a normal store, and mark the
5710      result temporary as defined. */
5711   IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
5712   IRTemp resTmp = findShadowTmpV(mce, stResult);
5713
5714   tl_assert(isIRAtom(stAddr));
5715   if (stStoredata)
5716      tl_assert(isIRAtom(stStoredata));
5717
5718   if (stStoredata == NULL) {
5719      /* Load Linked */
5720      /* Just treat this as a normal load, followed by an assignment of
5721         the value to .result. */
5722      /* Stay sane */
5723      tl_assert(resTy == Ity_I64 || resTy == Ity_I32
5724                || resTy == Ity_I16 || resTy == Ity_I8);
5725      assign( 'V', mce, resTmp,
5726                   expr2vbits_Load(
5727                      mce, stEnd, resTy, stAddr, 0/*addr bias*/,
5728                      NULL/*always happens*/) );
5729   } else {
5730      /* Store Conditional */
5731      /* Stay sane */
5732      IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
5733                                   stStoredata);
5734      tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
5735                || dataTy == Ity_I16 || dataTy == Ity_I8);
5736      do_shadow_Store( mce, stEnd,
5737                            stAddr, 0/* addr bias */,
5738                            stStoredata,
5739                            NULL /* shadow data */,
5740                            NULL/*guard*/ );
5741      /* This is a store conditional, so it writes to .result a value
5742         indicating whether or not the store succeeded.  Just claim
5743         this value is always defined.  In the PowerPC interpretation
5744         of store-conditional, definedness of the success indication
5745         depends on whether the address of the store matches the
5746         reservation address.  But we can't tell that here (and
5747         anyway, we're not being PowerPC-specific).  At least we are
5748         guaranteed that the definedness of the store address, and its
5749         addressibility, will be checked as per normal.  So it seems
5750         pretty safe to just say that the success indication is always
5751         defined.
5752
5753         In schemeS, for origin tracking, we must correspondingly set
5754         a no-origin value for the origin shadow of .result.
5755      */
5756      tl_assert(resTy == Ity_I1);
5757      assign( 'V', mce, resTmp, definedOfType(resTy) );
5758   }
5759}
5760
5761
5762/* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
5763
5764static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
5765{
5766   complainIfUndefined(mce, sg->guard, NULL);
5767   /* do_shadow_Store will generate code to check the definedness and
5768      validity of sg->addr, in the case where sg->guard evaluates to
5769      True at run-time. */
5770   do_shadow_Store( mce, sg->end,
5771                    sg->addr, 0/* addr bias */,
5772                    sg->data,
5773                    NULL /* shadow data */,
5774                    sg->guard );
5775}
5776
5777static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
5778{
5779   complainIfUndefined(mce, lg->guard, NULL);
5780   /* expr2vbits_Load_guarded_General will generate code to check the
5781      definedness and validity of lg->addr, in the case where
5782      lg->guard evaluates to True at run-time. */
5783
5784   /* Look at the LoadG's built-in conversion operation, to determine
5785      the source (actual loaded data) type, and the equivalent IROp.
5786      NOTE that implicitly we are taking a widening operation to be
5787      applied to original atoms and producing one that applies to V
5788      bits.  Since signed and unsigned widening are self-shadowing,
5789      this is a straight copy of the op (modulo swapping from the
5790      IRLoadGOp form to the IROp form).  Note also therefore that this
5791      implicitly duplicates the logic to do with said widening ops in
5792      expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
5793   IROp   vwiden   = Iop_INVALID;
5794   IRType loadedTy = Ity_INVALID;
5795   switch (lg->cvt) {
5796      case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
5797      case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
5798      case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
5799      case ILGop_8Uto32:  loadedTy = Ity_I8;  vwiden = Iop_8Uto32;  break;
5800      case ILGop_8Sto32:  loadedTy = Ity_I8;  vwiden = Iop_8Sto32;  break;
5801      default: VG_(tool_panic)("do_shadow_LoadG");
5802   }
5803
5804   IRAtom* vbits_alt
5805      = expr2vbits( mce, lg->alt );
5806   IRAtom* vbits_final
5807      = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
5808                                        lg->addr, 0/*addr bias*/,
5809                                        lg->guard, vwiden, vbits_alt );
5810   /* And finally, bind the V bits to the destination temporary. */
5811   assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
5812}
5813
5814
5815/*------------------------------------------------------------*/
5816/*--- Memcheck main                                        ---*/
5817/*------------------------------------------------------------*/
5818
5819static void schemeS ( MCEnv* mce, IRStmt* st );
5820
5821static Bool isBogusAtom ( IRAtom* at )
5822{
5823   ULong n = 0;
5824   IRConst* con;
5825   tl_assert(isIRAtom(at));
5826   if (at->tag == Iex_RdTmp)
5827      return False;
5828   tl_assert(at->tag == Iex_Const);
5829   con = at->Iex.Const.con;
5830   switch (con->tag) {
5831      case Ico_U1:   return False;
5832      case Ico_U8:   n = (ULong)con->Ico.U8; break;
5833      case Ico_U16:  n = (ULong)con->Ico.U16; break;
5834      case Ico_U32:  n = (ULong)con->Ico.U32; break;
5835      case Ico_U64:  n = (ULong)con->Ico.U64; break;
5836      case Ico_F64:  return False;
5837      case Ico_F32i: return False;
5838      case Ico_F64i: return False;
5839      case Ico_V128: return False;
5840      case Ico_V256: return False;
5841      default: ppIRExpr(at); tl_assert(0);
5842   }
5843   /* VG_(printf)("%llx\n", n); */
5844   return (/*32*/    n == 0xFEFEFEFFULL
5845           /*32*/ || n == 0x80808080ULL
5846           /*32*/ || n == 0x7F7F7F7FULL
5847           /*32*/ || n == 0x7EFEFEFFULL
5848           /*32*/ || n == 0x81010100ULL
5849           /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
5850           /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
5851           /*64*/ || n == 0x0000000000008080ULL
5852           /*64*/ || n == 0x8080808080808080ULL
5853           /*64*/ || n == 0x0101010101010101ULL
5854          );
5855}
5856
5857static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
5858{
5859   Int      i;
5860   IRExpr*  e;
5861   IRDirty* d;
5862   IRCAS*   cas;
5863   switch (st->tag) {
5864      case Ist_WrTmp:
5865         e = st->Ist.WrTmp.data;
5866         switch (e->tag) {
5867            case Iex_Get:
5868            case Iex_RdTmp:
5869               return False;
5870            case Iex_Const:
5871               return isBogusAtom(e);
5872            case Iex_Unop:
5873               return isBogusAtom(e->Iex.Unop.arg)
5874                      || e->Iex.Unop.op == Iop_GetMSBs8x16;
5875            case Iex_GetI:
5876               return isBogusAtom(e->Iex.GetI.ix);
5877            case Iex_Binop:
5878               return isBogusAtom(e->Iex.Binop.arg1)
5879                      || isBogusAtom(e->Iex.Binop.arg2);
5880            case Iex_Triop:
5881               return isBogusAtom(e->Iex.Triop.details->arg1)
5882                      || isBogusAtom(e->Iex.Triop.details->arg2)
5883                      || isBogusAtom(e->Iex.Triop.details->arg3);
5884            case Iex_Qop:
5885               return isBogusAtom(e->Iex.Qop.details->arg1)
5886                      || isBogusAtom(e->Iex.Qop.details->arg2)
5887                      || isBogusAtom(e->Iex.Qop.details->arg3)
5888                      || isBogusAtom(e->Iex.Qop.details->arg4);
5889            case Iex_ITE:
5890               return isBogusAtom(e->Iex.ITE.cond)
5891                      || isBogusAtom(e->Iex.ITE.iftrue)
5892                      || isBogusAtom(e->Iex.ITE.iffalse);
5893            case Iex_Load:
5894               return isBogusAtom(e->Iex.Load.addr);
5895            case Iex_CCall:
5896               for (i = 0; e->Iex.CCall.args[i]; i++)
5897                  if (isBogusAtom(e->Iex.CCall.args[i]))
5898                     return True;
5899               return False;
5900            default:
5901               goto unhandled;
5902         }
5903      case Ist_Dirty:
5904         d = st->Ist.Dirty.details;
5905         for (i = 0; d->args[i]; i++) {
5906            IRAtom* atom = d->args[i];
5907            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(atom))) {
5908               if (isBogusAtom(atom))
5909                  return True;
5910            }
5911         }
5912         if (isBogusAtom(d->guard))
5913            return True;
5914         if (d->mAddr && isBogusAtom(d->mAddr))
5915            return True;
5916         return False;
5917      case Ist_Put:
5918         return isBogusAtom(st->Ist.Put.data);
5919      case Ist_PutI:
5920         return isBogusAtom(st->Ist.PutI.details->ix)
5921                || isBogusAtom(st->Ist.PutI.details->data);
5922      case Ist_Store:
5923         return isBogusAtom(st->Ist.Store.addr)
5924                || isBogusAtom(st->Ist.Store.data);
5925      case Ist_StoreG: {
5926         IRStoreG* sg = st->Ist.StoreG.details;
5927         return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
5928                || isBogusAtom(sg->guard);
5929      }
5930      case Ist_LoadG: {
5931         IRLoadG* lg = st->Ist.LoadG.details;
5932         return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
5933                || isBogusAtom(lg->guard);
5934      }
5935      case Ist_Exit:
5936         return isBogusAtom(st->Ist.Exit.guard);
5937      case Ist_AbiHint:
5938         return isBogusAtom(st->Ist.AbiHint.base)
5939                || isBogusAtom(st->Ist.AbiHint.nia);
5940      case Ist_NoOp:
5941      case Ist_IMark:
5942      case Ist_MBE:
5943         return False;
5944      case Ist_CAS:
5945         cas = st->Ist.CAS.details;
5946         return isBogusAtom(cas->addr)
5947                || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
5948                || isBogusAtom(cas->expdLo)
5949                || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
5950                || isBogusAtom(cas->dataLo);
5951      case Ist_LLSC:
5952         return isBogusAtom(st->Ist.LLSC.addr)
5953                || (st->Ist.LLSC.storedata
5954                       ? isBogusAtom(st->Ist.LLSC.storedata)
5955                       : False);
5956      default:
5957      unhandled:
5958         ppIRStmt(st);
5959         VG_(tool_panic)("hasBogusLiterals");
5960   }
5961}
5962
5963
5964IRSB* MC_(instrument) ( VgCallbackClosure* closure,
5965                        IRSB* sb_in,
5966                        VexGuestLayout* layout,
5967                        VexGuestExtents* vge,
5968                        VexArchInfo* archinfo_host,
5969                        IRType gWordTy, IRType hWordTy )
5970{
5971   Bool    verboze = 0||False;
5972   Bool    bogus;
5973   Int     i, j, first_stmt;
5974   IRStmt* st;
5975   MCEnv   mce;
5976   IRSB*   sb_out;
5977
5978   if (gWordTy != hWordTy) {
5979      /* We don't currently support this case. */
5980      VG_(tool_panic)("host/guest word size mismatch");
5981   }
5982
5983   /* Check we're not completely nuts */
5984   tl_assert(sizeof(UWord)  == sizeof(void*));
5985   tl_assert(sizeof(Word)   == sizeof(void*));
5986   tl_assert(sizeof(Addr)   == sizeof(void*));
5987   tl_assert(sizeof(ULong)  == 8);
5988   tl_assert(sizeof(Long)   == 8);
5989   tl_assert(sizeof(Addr64) == 8);
5990   tl_assert(sizeof(UInt)   == 4);
5991   tl_assert(sizeof(Int)    == 4);
5992
5993   tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
5994
5995   /* Set up SB */
5996   sb_out = deepCopyIRSBExceptStmts(sb_in);
5997
5998   /* Set up the running environment.  Both .sb and .tmpMap are
5999      modified as we go along.  Note that tmps are added to both
6000      .sb->tyenv and .tmpMap together, so the valid index-set for
6001      those two arrays should always be identical. */
6002   VG_(memset)(&mce, 0, sizeof(mce));
6003   mce.sb             = sb_out;
6004   mce.trace          = verboze;
6005   mce.layout         = layout;
6006   mce.hWordTy        = hWordTy;
6007   mce.bogusLiterals  = False;
6008
6009   /* Do expensive interpretation for Iop_Add32 and Iop_Add64 on
6010      Darwin.  10.7 is mostly built with LLVM, which uses these for
6011      bitfield inserts, and we get a lot of false errors if the cheap
6012      interpretation is used, alas.  Could solve this much better if
6013      we knew which of such adds came from x86/amd64 LEA instructions,
6014      since these are the only ones really needing the expensive
6015      interpretation, but that would require some way to tag them in
6016      the _toIR.c front ends, which is a lot of faffing around.  So
6017      for now just use the slow and blunt-instrument solution. */
6018   mce.useLLVMworkarounds = False;
6019#  if defined(VGO_darwin)
6020   mce.useLLVMworkarounds = True;
6021#  endif
6022
6023   mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
6024                            sizeof(TempMapEnt));
6025   for (i = 0; i < sb_in->tyenv->types_used; i++) {
6026      TempMapEnt ent;
6027      ent.kind    = Orig;
6028      ent.shadowV = IRTemp_INVALID;
6029      ent.shadowB = IRTemp_INVALID;
6030      VG_(addToXA)( mce.tmpMap, &ent );
6031   }
6032   tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
6033
6034   /* Make a preliminary inspection of the statements, to see if there
6035      are any dodgy-looking literals.  If there are, we generate
6036      extra-detailed (hence extra-expensive) instrumentation in
6037      places.  Scan the whole bb even if dodgyness is found earlier,
6038      so that the flatness assertion is applied to all stmts. */
6039
6040   bogus = False;
6041
6042   for (i = 0; i < sb_in->stmts_used; i++) {
6043
6044      st = sb_in->stmts[i];
6045      tl_assert(st);
6046      tl_assert(isFlatIRStmt(st));
6047
6048      if (!bogus) {
6049         bogus = checkForBogusLiterals(st);
6050         if (0 && bogus) {
6051            VG_(printf)("bogus: ");
6052            ppIRStmt(st);
6053            VG_(printf)("\n");
6054         }
6055      }
6056
6057   }
6058
6059   mce.bogusLiterals = bogus;
6060
6061   /* Copy verbatim any IR preamble preceding the first IMark */
6062
6063   tl_assert(mce.sb == sb_out);
6064   tl_assert(mce.sb != sb_in);
6065
6066   i = 0;
6067   while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
6068
6069      st = sb_in->stmts[i];
6070      tl_assert(st);
6071      tl_assert(isFlatIRStmt(st));
6072
6073      stmt( 'C', &mce, sb_in->stmts[i] );
6074      i++;
6075   }
6076
6077   /* Nasty problem.  IR optimisation of the pre-instrumented IR may
6078      cause the IR following the preamble to contain references to IR
6079      temporaries defined in the preamble.  Because the preamble isn't
6080      instrumented, these temporaries don't have any shadows.
6081      Nevertheless uses of them following the preamble will cause
6082      memcheck to generate references to their shadows.  End effect is
6083      to cause IR sanity check failures, due to references to
6084      non-existent shadows.  This is only evident for the complex
6085      preambles used for function wrapping on TOC-afflicted platforms
6086      (ppc64-linux).
6087
6088      The following loop therefore scans the preamble looking for
6089      assignments to temporaries.  For each one found it creates an
6090      assignment to the corresponding (V) shadow temp, marking it as
6091      'defined'.  This is the same resulting IR as if the main
6092      instrumentation loop before had been applied to the statement
6093      'tmp = CONSTANT'.
6094
6095      Similarly, if origin tracking is enabled, we must generate an
6096      assignment for the corresponding origin (B) shadow, claiming
6097      no-origin, as appropriate for a defined value.
6098   */
6099   for (j = 0; j < i; j++) {
6100      if (sb_in->stmts[j]->tag == Ist_WrTmp) {
6101         /* findShadowTmpV checks its arg is an original tmp;
6102            no need to assert that here. */
6103         IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
6104         IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
6105         IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
6106         assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
6107         if (MC_(clo_mc_level) == 3) {
6108            IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
6109            tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
6110            assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
6111         }
6112         if (0) {
6113            VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
6114            ppIRType( ty_v );
6115            VG_(printf)("\n");
6116         }
6117      }
6118   }
6119
6120   /* Iterate over the remaining stmts to generate instrumentation. */
6121
6122   tl_assert(sb_in->stmts_used > 0);
6123   tl_assert(i >= 0);
6124   tl_assert(i < sb_in->stmts_used);
6125   tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
6126
6127   for (/* use current i*/; i < sb_in->stmts_used; i++) {
6128
6129      st = sb_in->stmts[i];
6130      first_stmt = sb_out->stmts_used;
6131
6132      if (verboze) {
6133         VG_(printf)("\n");
6134         ppIRStmt(st);
6135         VG_(printf)("\n");
6136      }
6137
6138      if (MC_(clo_mc_level) == 3) {
6139         /* See comments on case Ist_CAS below. */
6140         if (st->tag != Ist_CAS)
6141            schemeS( &mce, st );
6142      }
6143
6144      /* Generate instrumentation code for each stmt ... */
6145
6146      switch (st->tag) {
6147
6148         case Ist_WrTmp:
6149            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
6150                               expr2vbits( &mce, st->Ist.WrTmp.data) );
6151            break;
6152
6153         case Ist_Put:
6154            do_shadow_PUT( &mce,
6155                           st->Ist.Put.offset,
6156                           st->Ist.Put.data,
6157                           NULL /* shadow atom */, NULL /* guard */ );
6158            break;
6159
6160         case Ist_PutI:
6161            do_shadow_PUTI( &mce, st->Ist.PutI.details);
6162            break;
6163
6164         case Ist_Store:
6165            do_shadow_Store( &mce, st->Ist.Store.end,
6166                                   st->Ist.Store.addr, 0/* addr bias */,
6167                                   st->Ist.Store.data,
6168                                   NULL /* shadow data */,
6169                                   NULL/*guard*/ );
6170            break;
6171
6172         case Ist_StoreG:
6173            do_shadow_StoreG( &mce, st->Ist.StoreG.details );
6174            break;
6175
6176         case Ist_LoadG:
6177            do_shadow_LoadG( &mce, st->Ist.LoadG.details );
6178            break;
6179
6180         case Ist_Exit:
6181            complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
6182            break;
6183
6184         case Ist_IMark:
6185            break;
6186
6187         case Ist_NoOp:
6188         case Ist_MBE:
6189            break;
6190
6191         case Ist_Dirty:
6192            do_shadow_Dirty( &mce, st->Ist.Dirty.details );
6193            break;
6194
6195         case Ist_AbiHint:
6196            do_AbiHint( &mce, st->Ist.AbiHint.base,
6197                              st->Ist.AbiHint.len,
6198                              st->Ist.AbiHint.nia );
6199            break;
6200
6201         case Ist_CAS:
6202            do_shadow_CAS( &mce, st->Ist.CAS.details );
6203            /* Note, do_shadow_CAS copies the CAS itself to the output
6204               block, because it needs to add instrumentation both
6205               before and after it.  Hence skip the copy below.  Also
6206               skip the origin-tracking stuff (call to schemeS) above,
6207               since that's all tangled up with it too; do_shadow_CAS
6208               does it all. */
6209            break;
6210
6211         case Ist_LLSC:
6212            do_shadow_LLSC( &mce,
6213                            st->Ist.LLSC.end,
6214                            st->Ist.LLSC.result,
6215                            st->Ist.LLSC.addr,
6216                            st->Ist.LLSC.storedata );
6217            break;
6218
6219         default:
6220            VG_(printf)("\n");
6221            ppIRStmt(st);
6222            VG_(printf)("\n");
6223            VG_(tool_panic)("memcheck: unhandled IRStmt");
6224
6225      } /* switch (st->tag) */
6226
6227      if (0 && verboze) {
6228         for (j = first_stmt; j < sb_out->stmts_used; j++) {
6229            VG_(printf)("   ");
6230            ppIRStmt(sb_out->stmts[j]);
6231            VG_(printf)("\n");
6232         }
6233         VG_(printf)("\n");
6234      }
6235
6236      /* ... and finally copy the stmt itself to the output.  Except,
6237         skip the copy of IRCASs; see comments on case Ist_CAS
6238         above. */
6239      if (st->tag != Ist_CAS)
6240         stmt('C', &mce, st);
6241   }
6242
6243   /* Now we need to complain if the jump target is undefined. */
6244   first_stmt = sb_out->stmts_used;
6245
6246   if (verboze) {
6247      VG_(printf)("sb_in->next = ");
6248      ppIRExpr(sb_in->next);
6249      VG_(printf)("\n\n");
6250   }
6251
6252   complainIfUndefined( &mce, sb_in->next, NULL );
6253
6254   if (0 && verboze) {
6255      for (j = first_stmt; j < sb_out->stmts_used; j++) {
6256         VG_(printf)("   ");
6257         ppIRStmt(sb_out->stmts[j]);
6258         VG_(printf)("\n");
6259      }
6260      VG_(printf)("\n");
6261   }
6262
6263   /* If this fails, there's been some serious snafu with tmp management,
6264      that should be investigated. */
6265   tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
6266   VG_(deleteXA)( mce.tmpMap );
6267
6268   tl_assert(mce.sb == sb_out);
6269   return sb_out;
6270}
6271
6272/*------------------------------------------------------------*/
6273/*--- Post-tree-build final tidying                        ---*/
6274/*------------------------------------------------------------*/
6275
6276/* This exploits the observation that Memcheck often produces
6277   repeated conditional calls of the form
6278
6279   Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
6280
6281   with the same guard expression G guarding the same helper call.
6282   The second and subsequent calls are redundant.  This usually
6283   results from instrumentation of guest code containing multiple
6284   memory references at different constant offsets from the same base
6285   register.  After optimisation of the instrumentation, you get a
6286   test for the definedness of the base register for each memory
6287   reference, which is kinda pointless.  MC_(final_tidy) therefore
6288   looks for such repeated calls and removes all but the first. */
6289
6290/* A struct for recording which (helper, guard) pairs we have already
6291   seen. */
6292typedef
6293   struct { void* entry; IRExpr* guard; }
6294   Pair;
6295
6296/* Return True if e1 and e2 definitely denote the same value (used to
6297   compare guards).  Return False if unknown; False is the safe
6298   answer.  Since guest registers and guest memory do not have the
6299   SSA property we must return False if any Gets or Loads appear in
6300   the expression. */
6301
6302static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
6303{
6304   if (e1->tag != e2->tag)
6305      return False;
6306   switch (e1->tag) {
6307      case Iex_Const:
6308         return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
6309      case Iex_Binop:
6310         return e1->Iex.Binop.op == e2->Iex.Binop.op
6311                && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
6312                && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
6313      case Iex_Unop:
6314         return e1->Iex.Unop.op == e2->Iex.Unop.op
6315                && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
6316      case Iex_RdTmp:
6317         return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
6318      case Iex_ITE:
6319         return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
6320                && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
6321                && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
6322      case Iex_Qop:
6323      case Iex_Triop:
6324      case Iex_CCall:
6325         /* be lazy.  Could define equality for these, but they never
6326            appear to be used. */
6327         return False;
6328      case Iex_Get:
6329      case Iex_GetI:
6330      case Iex_Load:
6331         /* be conservative - these may not give the same value each
6332            time */
6333         return False;
6334      case Iex_Binder:
6335         /* should never see this */
6336         /* fallthrough */
6337      default:
6338         VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
6339         ppIRExpr(e1);
6340         VG_(tool_panic)("memcheck:sameIRValue");
6341         return False;
6342   }
6343}
6344
6345/* See if 'pairs' already has an entry for (entry, guard).  Return
6346   True if so.  If not, add an entry. */
6347
6348static
6349Bool check_or_add ( XArray* /*of Pair*/ pairs, IRExpr* guard, void* entry )
6350{
6351   Pair  p;
6352   Pair* pp;
6353   Int   i, n = VG_(sizeXA)( pairs );
6354   for (i = 0; i < n; i++) {
6355      pp = VG_(indexXA)( pairs, i );
6356      if (pp->entry == entry && sameIRValue(pp->guard, guard))
6357         return True;
6358   }
6359   p.guard = guard;
6360   p.entry = entry;
6361   VG_(addToXA)( pairs, &p );
6362   return False;
6363}
6364
6365static Bool is_helperc_value_checkN_fail ( const HChar* name )
6366{
6367   return
6368      0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_no_o)")
6369      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_no_o)")
6370      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_no_o)")
6371      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_no_o)")
6372      || 0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_w_o)")
6373      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_w_o)")
6374      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_w_o)")
6375      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_w_o)");
6376}
6377
6378IRSB* MC_(final_tidy) ( IRSB* sb_in )
6379{
6380   Int i;
6381   IRStmt*   st;
6382   IRDirty*  di;
6383   IRExpr*   guard;
6384   IRCallee* cee;
6385   Bool      alreadyPresent;
6386   XArray*   pairs = VG_(newXA)( VG_(malloc), "mc.ft.1",
6387                                 VG_(free), sizeof(Pair) );
6388   /* Scan forwards through the statements.  Each time a call to one
6389      of the relevant helpers is seen, check if we have made a
6390      previous call to the same helper using the same guard
6391      expression, and if so, delete the call. */
6392   for (i = 0; i < sb_in->stmts_used; i++) {
6393      st = sb_in->stmts[i];
6394      tl_assert(st);
6395      if (st->tag != Ist_Dirty)
6396         continue;
6397      di = st->Ist.Dirty.details;
6398      guard = di->guard;
6399      tl_assert(guard);
6400      if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
6401      cee = di->cee;
6402      if (!is_helperc_value_checkN_fail( cee->name ))
6403         continue;
6404       /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
6405          guard 'guard'.  Check if we have already seen a call to this
6406          function with the same guard.  If so, delete it.  If not,
6407          add it to the set of calls we do know about. */
6408      alreadyPresent = check_or_add( pairs, guard, cee->addr );
6409      if (alreadyPresent) {
6410         sb_in->stmts[i] = IRStmt_NoOp();
6411         if (0) VG_(printf)("XX\n");
6412      }
6413   }
6414   VG_(deleteXA)( pairs );
6415   return sb_in;
6416}
6417
6418
6419/*------------------------------------------------------------*/
6420/*--- Origin tracking stuff                                ---*/
6421/*------------------------------------------------------------*/
6422
6423/* Almost identical to findShadowTmpV. */
6424static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
6425{
6426   TempMapEnt* ent;
6427   /* VG_(indexXA) range-checks 'orig', hence no need to check
6428      here. */
6429   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6430   tl_assert(ent->kind == Orig);
6431   if (ent->shadowB == IRTemp_INVALID) {
6432      IRTemp tmpB
6433        = newTemp( mce, Ity_I32, BSh );
6434      /* newTemp may cause mce->tmpMap to resize, hence previous results
6435         from VG_(indexXA) are invalid. */
6436      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6437      tl_assert(ent->kind == Orig);
6438      tl_assert(ent->shadowB == IRTemp_INVALID);
6439      ent->shadowB = tmpB;
6440   }
6441   return ent->shadowB;
6442}
6443
6444static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
6445{
6446   return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
6447}
6448
6449
6450/* Make a guarded origin load, with no special handling in the
6451   didn't-happen case.  A GUARD of NULL is assumed to mean "always
6452   True".
6453
6454   Generate IR to do a shadow origins load from BASEADDR+OFFSET and
6455   return the otag.  The loaded size is SZB.  If GUARD evaluates to
6456   False at run time then the returned otag is zero.
6457*/
6458static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
6459                                    IRAtom* baseaddr,
6460                                    Int offset, IRExpr* guard )
6461{
6462   void*    hFun;
6463   const HChar* hName;
6464   IRTemp   bTmp;
6465   IRDirty* di;
6466   IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6467   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6468   IRAtom*  ea    = baseaddr;
6469   if (offset != 0) {
6470      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6471                                   : mkU64( (Long)(Int)offset );
6472      ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
6473   }
6474   bTmp = newTemp(mce, mce->hWordTy, BSh);
6475
6476   switch (szB) {
6477      case 1: hFun  = (void*)&MC_(helperc_b_load1);
6478              hName = "MC_(helperc_b_load1)";
6479              break;
6480      case 2: hFun  = (void*)&MC_(helperc_b_load2);
6481              hName = "MC_(helperc_b_load2)";
6482              break;
6483      case 4: hFun  = (void*)&MC_(helperc_b_load4);
6484              hName = "MC_(helperc_b_load4)";
6485              break;
6486      case 8: hFun  = (void*)&MC_(helperc_b_load8);
6487              hName = "MC_(helperc_b_load8)";
6488              break;
6489      case 16: hFun  = (void*)&MC_(helperc_b_load16);
6490               hName = "MC_(helperc_b_load16)";
6491               break;
6492      case 32: hFun  = (void*)&MC_(helperc_b_load32);
6493               hName = "MC_(helperc_b_load32)";
6494               break;
6495      default:
6496         VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
6497         tl_assert(0);
6498   }
6499   di = unsafeIRDirty_1_N(
6500           bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
6501           mkIRExprVec_1( ea )
6502        );
6503   if (guard) {
6504      di->guard = guard;
6505      /* Ideally the didn't-happen return value here would be
6506         all-zeroes (unknown-origin), so it'd be harmless if it got
6507         used inadvertantly.  We slum it out with the IR-mandated
6508         default value (0b01 repeating, 0x55 etc) as that'll probably
6509         trump all legitimate otags via Max32, and it's pretty
6510         obviously bogus. */
6511   }
6512   /* no need to mess with any annotations.  This call accesses
6513      neither guest state nor guest memory. */
6514   stmt( 'B', mce, IRStmt_Dirty(di) );
6515   if (mce->hWordTy == Ity_I64) {
6516      /* 64-bit host */
6517      IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
6518      assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
6519      return mkexpr(bTmp32);
6520   } else {
6521      /* 32-bit host */
6522      return mkexpr(bTmp);
6523   }
6524}
6525
6526
6527/* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
6528   loaded size is SZB.  The load is regarded as unconditional (always
6529   happens).
6530*/
6531static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
6532                            Int offset )
6533{
6534   return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
6535}
6536
6537
6538/* The most general handler for guarded origin loads.  A GUARD of NULL
6539   is assumed to mean "always True".
6540
6541   Generate IR to do a shadow origin load from ADDR+BIAS and return
6542   the B bits.  The loaded type is TY.  If GUARD evaluates to False at
6543   run time then the returned B bits are simply BALT instead.
6544*/
6545static
6546IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
6547                                        IRType ty,
6548                                        IRAtom* addr, UInt bias,
6549                                        IRAtom* guard, IRAtom* balt )
6550{
6551   /* If the guard evaluates to True, this will hold the loaded
6552      origin.  If the guard evaluates to False, this will be zero,
6553      meaning "unknown origin", in which case we will have to replace
6554      it using an ITE below. */
6555   IRAtom* iftrue
6556      = assignNew('B', mce, Ity_I32,
6557                  gen_guarded_load_b(mce, sizeofIRType(ty),
6558                                     addr, bias, guard));
6559   /* These are the bits we will return if the load doesn't take
6560      place. */
6561   IRAtom* iffalse
6562      = balt;
6563   /* Prepare the cond for the ITE.  Convert a NULL cond into
6564      something that iropt knows how to fold out later. */
6565   IRAtom* cond
6566      = guard == NULL  ? mkU1(1)  : guard;
6567   /* And assemble the final result. */
6568   return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
6569}
6570
6571
6572/* Generate a shadow origins store.  guard :: Ity_I1 controls whether
6573   the store really happens; NULL means it unconditionally does. */
6574static void gen_store_b ( MCEnv* mce, Int szB,
6575                          IRAtom* baseaddr, Int offset, IRAtom* dataB,
6576                          IRAtom* guard )
6577{
6578   void*    hFun;
6579   const HChar* hName;
6580   IRDirty* di;
6581   IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6582   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6583   IRAtom*  ea    = baseaddr;
6584   if (guard) {
6585      tl_assert(isOriginalAtom(mce, guard));
6586      tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6587   }
6588   if (offset != 0) {
6589      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6590                                   : mkU64( (Long)(Int)offset );
6591      ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
6592   }
6593   if (mce->hWordTy == Ity_I64)
6594      dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
6595
6596   switch (szB) {
6597      case 1: hFun  = (void*)&MC_(helperc_b_store1);
6598              hName = "MC_(helperc_b_store1)";
6599              break;
6600      case 2: hFun  = (void*)&MC_(helperc_b_store2);
6601              hName = "MC_(helperc_b_store2)";
6602              break;
6603      case 4: hFun  = (void*)&MC_(helperc_b_store4);
6604              hName = "MC_(helperc_b_store4)";
6605              break;
6606      case 8: hFun  = (void*)&MC_(helperc_b_store8);
6607              hName = "MC_(helperc_b_store8)";
6608              break;
6609      case 16: hFun  = (void*)&MC_(helperc_b_store16);
6610               hName = "MC_(helperc_b_store16)";
6611               break;
6612      case 32: hFun  = (void*)&MC_(helperc_b_store32);
6613               hName = "MC_(helperc_b_store32)";
6614               break;
6615      default:
6616         tl_assert(0);
6617   }
6618   di = unsafeIRDirty_0_N( 2/*regparms*/,
6619           hName, VG_(fnptr_to_fnentry)( hFun ),
6620           mkIRExprVec_2( ea, dataB )
6621        );
6622   /* no need to mess with any annotations.  This call accesses
6623      neither guest state nor guest memory. */
6624   if (guard) di->guard = guard;
6625   stmt( 'B', mce, IRStmt_Dirty(di) );
6626}
6627
6628static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
6629   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6630   if (eTy == Ity_I64)
6631      return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
6632   if (eTy == Ity_I32)
6633      return e;
6634   tl_assert(0);
6635}
6636
6637static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
6638   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6639   tl_assert(eTy == Ity_I32);
6640   if (dstTy == Ity_I64)
6641      return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
6642   tl_assert(0);
6643}
6644
6645
6646static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
6647{
6648   tl_assert(MC_(clo_mc_level) == 3);
6649
6650   switch (e->tag) {
6651
6652      case Iex_GetI: {
6653         IRRegArray* descr_b;
6654         IRAtom      *t1, *t2, *t3, *t4;
6655         IRRegArray* descr      = e->Iex.GetI.descr;
6656         IRType equivIntTy
6657            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
6658         /* If this array is unshadowable for whatever reason, use the
6659            usual approximation. */
6660         if (equivIntTy == Ity_INVALID)
6661            return mkU32(0);
6662         tl_assert(sizeofIRType(equivIntTy) >= 4);
6663         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
6664         descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
6665                                 equivIntTy, descr->nElems );
6666         /* Do a shadow indexed get of the same size, giving t1.  Take
6667            the bottom 32 bits of it, giving t2.  Compute into t3 the
6668            origin for the index (almost certainly zero, but there's
6669            no harm in being completely general here, since iropt will
6670            remove any useless code), and fold it in, giving a final
6671            value t4. */
6672         t1 = assignNew( 'B', mce, equivIntTy,
6673                          IRExpr_GetI( descr_b, e->Iex.GetI.ix,
6674                                                e->Iex.GetI.bias ));
6675         t2 = narrowTo32( mce, t1 );
6676         t3 = schemeE( mce, e->Iex.GetI.ix );
6677         t4 = gen_maxU32( mce, t2, t3 );
6678         return t4;
6679      }
6680      case Iex_CCall: {
6681         Int i;
6682         IRAtom*  here;
6683         IRExpr** args = e->Iex.CCall.args;
6684         IRAtom*  curr = mkU32(0);
6685         for (i = 0; args[i]; i++) {
6686            tl_assert(i < 32);
6687            tl_assert(isOriginalAtom(mce, args[i]));
6688            /* Only take notice of this arg if the callee's
6689               mc-exclusion mask does not say it is to be excluded. */
6690            if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
6691               /* the arg is to be excluded from definedness checking.
6692                  Do nothing. */
6693               if (0) VG_(printf)("excluding %s(%d)\n",
6694                                  e->Iex.CCall.cee->name, i);
6695            } else {
6696               /* calculate the arg's definedness, and pessimistically
6697                  merge it in. */
6698               here = schemeE( mce, args[i] );
6699               curr = gen_maxU32( mce, curr, here );
6700            }
6701         }
6702         return curr;
6703      }
6704      case Iex_Load: {
6705         Int dszB;
6706         dszB = sizeofIRType(e->Iex.Load.ty);
6707         /* assert that the B value for the address is already
6708            available (somewhere) */
6709         tl_assert(isIRAtom(e->Iex.Load.addr));
6710         tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
6711         return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
6712      }
6713      case Iex_ITE: {
6714         IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
6715         IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
6716         IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
6717         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
6718      }
6719      case Iex_Qop: {
6720         IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
6721         IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
6722         IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
6723         IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
6724         return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
6725                                 gen_maxU32( mce, b3, b4 ) );
6726      }
6727      case Iex_Triop: {
6728         IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
6729         IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
6730         IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
6731         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
6732      }
6733      case Iex_Binop: {
6734         switch (e->Iex.Binop.op) {
6735            case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
6736            case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
6737            case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
6738            case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
6739               /* Just say these all produce a defined result,
6740                  regardless of their arguments.  See
6741                  COMMENT_ON_CasCmpEQ in this file. */
6742               return mkU32(0);
6743            default: {
6744               IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
6745               IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
6746               return gen_maxU32( mce, b1, b2 );
6747            }
6748         }
6749         tl_assert(0);
6750         /*NOTREACHED*/
6751      }
6752      case Iex_Unop: {
6753         IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
6754         return b1;
6755      }
6756      case Iex_Const:
6757         return mkU32(0);
6758      case Iex_RdTmp:
6759         return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
6760      case Iex_Get: {
6761         Int b_offset = MC_(get_otrack_shadow_offset)(
6762                           e->Iex.Get.offset,
6763                           sizeofIRType(e->Iex.Get.ty)
6764                        );
6765         tl_assert(b_offset >= -1
6766                   && b_offset <= mce->layout->total_sizeB -4);
6767         if (b_offset >= 0) {
6768            /* FIXME: this isn't an atom! */
6769            return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
6770                               Ity_I32 );
6771         }
6772         return mkU32(0);
6773      }
6774      default:
6775         VG_(printf)("mc_translate.c: schemeE: unhandled: ");
6776         ppIRExpr(e);
6777         VG_(tool_panic)("memcheck:schemeE");
6778   }
6779}
6780
6781
6782static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
6783{
6784   // This is a hacked version of do_shadow_Dirty
6785   Int       i, k, n, toDo, gSz, gOff;
6786   IRAtom    *here, *curr;
6787   IRTemp    dst;
6788
6789   /* First check the guard. */
6790   curr = schemeE( mce, d->guard );
6791
6792   /* Now round up all inputs and maxU32 over them. */
6793
6794   /* Inputs: unmasked args
6795      Note: arguments are evaluated REGARDLESS of the guard expression */
6796   for (i = 0; d->args[i]; i++) {
6797      IRAtom* arg = d->args[i];
6798      if ( (d->cee->mcx_mask & (1<<i))
6799           || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
6800         /* ignore this arg */
6801      } else {
6802         here = schemeE( mce, arg );
6803         curr = gen_maxU32( mce, curr, here );
6804      }
6805   }
6806
6807   /* Inputs: guest state that we read. */
6808   for (i = 0; i < d->nFxState; i++) {
6809      tl_assert(d->fxState[i].fx != Ifx_None);
6810      if (d->fxState[i].fx == Ifx_Write)
6811         continue;
6812
6813      /* Enumerate the described state segments */
6814      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6815         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6816         gSz  = d->fxState[i].size;
6817
6818         /* Ignore any sections marked as 'always defined'. */
6819         if (isAlwaysDefd(mce, gOff, gSz)) {
6820            if (0)
6821            VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6822                        gOff, gSz);
6823            continue;
6824         }
6825
6826         /* This state element is read or modified.  So we need to
6827            consider it.  If larger than 4 bytes, deal with it in
6828            4-byte chunks. */
6829         while (True) {
6830            Int b_offset;
6831            tl_assert(gSz >= 0);
6832            if (gSz == 0) break;
6833            n = gSz <= 4 ? gSz : 4;
6834            /* update 'curr' with maxU32 of the state slice
6835               gOff .. gOff+n-1 */
6836            b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
6837            if (b_offset != -1) {
6838               /* Observe the guard expression. If it is false use 0, i.e.
6839                  nothing is known about the origin */
6840               IRAtom *cond, *iffalse, *iftrue;
6841
6842               cond = assignNew( 'B', mce, Ity_I1, d->guard);
6843               iffalse = mkU32(0);
6844               iftrue  = assignNew( 'B', mce, Ity_I32,
6845                                    IRExpr_Get(b_offset
6846                                                 + 2*mce->layout->total_sizeB,
6847                                               Ity_I32));
6848               here = assignNew( 'B', mce, Ity_I32,
6849                                 IRExpr_ITE(cond, iftrue, iffalse));
6850               curr = gen_maxU32( mce, curr, here );
6851            }
6852            gSz -= n;
6853            gOff += n;
6854         }
6855      }
6856   }
6857
6858   /* Inputs: memory */
6859
6860   if (d->mFx != Ifx_None) {
6861      /* Because we may do multiple shadow loads/stores from the same
6862         base address, it's best to do a single test of its
6863         definedness right now.  Post-instrumentation optimisation
6864         should remove all but this test. */
6865      tl_assert(d->mAddr);
6866      here = schemeE( mce, d->mAddr );
6867      curr = gen_maxU32( mce, curr, here );
6868   }
6869
6870   /* Deal with memory inputs (reads or modifies) */
6871   if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6872      toDo   = d->mSize;
6873      /* chew off 32-bit chunks.  We don't care about the endianness
6874         since it's all going to be condensed down to a single bit,
6875         but nevertheless choose an endianness which is hopefully
6876         native to the platform. */
6877      while (toDo >= 4) {
6878         here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
6879                                    d->guard );
6880         curr = gen_maxU32( mce, curr, here );
6881         toDo -= 4;
6882      }
6883      /* handle possible 16-bit excess */
6884      while (toDo >= 2) {
6885         here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
6886                                    d->guard );
6887         curr = gen_maxU32( mce, curr, here );
6888         toDo -= 2;
6889      }
6890      /* chew off the remaining 8-bit chunk, if any */
6891      if (toDo == 1) {
6892         here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
6893                                    d->guard );
6894         curr = gen_maxU32( mce, curr, here );
6895         toDo -= 1;
6896      }
6897      tl_assert(toDo == 0);
6898   }
6899
6900   /* Whew!  So curr is a 32-bit B-value which should give an origin
6901      of some use if any of the inputs to the helper are undefined.
6902      Now we need to re-distribute the results to all destinations. */
6903
6904   /* Outputs: the destination temporary, if there is one. */
6905   if (d->tmp != IRTemp_INVALID) {
6906      dst   = findShadowTmpB(mce, d->tmp);
6907      assign( 'V', mce, dst, curr );
6908   }
6909
6910   /* Outputs: guest state that we write or modify. */
6911   for (i = 0; i < d->nFxState; i++) {
6912      tl_assert(d->fxState[i].fx != Ifx_None);
6913      if (d->fxState[i].fx == Ifx_Read)
6914         continue;
6915
6916      /* Enumerate the described state segments */
6917      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6918         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6919         gSz  = d->fxState[i].size;
6920
6921         /* Ignore any sections marked as 'always defined'. */
6922         if (isAlwaysDefd(mce, gOff, gSz))
6923            continue;
6924
6925         /* This state element is written or modified.  So we need to
6926            consider it.  If larger than 4 bytes, deal with it in
6927            4-byte chunks. */
6928         while (True) {
6929            Int b_offset;
6930            tl_assert(gSz >= 0);
6931            if (gSz == 0) break;
6932            n = gSz <= 4 ? gSz : 4;
6933            /* Write 'curr' to the state slice gOff .. gOff+n-1 */
6934            b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
6935            if (b_offset != -1) {
6936
6937               /* If the guard expression evaluates to false we simply Put
6938                  the value that is already stored in the guest state slot */
6939               IRAtom *cond, *iffalse;
6940
6941               cond    = assignNew('B', mce, Ity_I1,
6942                                   d->guard);
6943               iffalse = assignNew('B', mce, Ity_I32,
6944                                   IRExpr_Get(b_offset +
6945                                              2*mce->layout->total_sizeB,
6946                                              Ity_I32));
6947               curr = assignNew('V', mce, Ity_I32,
6948                                IRExpr_ITE(cond, curr, iffalse));
6949
6950               stmt( 'B', mce, IRStmt_Put(b_offset
6951                                          + 2*mce->layout->total_sizeB,
6952                                          curr ));
6953            }
6954            gSz -= n;
6955            gOff += n;
6956         }
6957      }
6958   }
6959
6960   /* Outputs: memory that we write or modify.  Same comments about
6961      endianness as above apply. */
6962   if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
6963      toDo   = d->mSize;
6964      /* chew off 32-bit chunks */
6965      while (toDo >= 4) {
6966         gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
6967                      d->guard );
6968         toDo -= 4;
6969      }
6970      /* handle possible 16-bit excess */
6971      while (toDo >= 2) {
6972         gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
6973                      d->guard );
6974         toDo -= 2;
6975      }
6976      /* chew off the remaining 8-bit chunk, if any */
6977      if (toDo == 1) {
6978         gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
6979                      d->guard );
6980         toDo -= 1;
6981      }
6982      tl_assert(toDo == 0);
6983   }
6984}
6985
6986
6987/* Generate IR for origin shadowing for a general guarded store. */
6988static void do_origins_Store_guarded ( MCEnv* mce,
6989                                       IREndness stEnd,
6990                                       IRExpr* stAddr,
6991                                       IRExpr* stData,
6992                                       IRExpr* guard )
6993{
6994   Int     dszB;
6995   IRAtom* dataB;
6996   /* assert that the B value for the address is already available
6997      (somewhere), since the call to schemeE will want to see it.
6998      XXXX how does this actually ensure that?? */
6999   tl_assert(isIRAtom(stAddr));
7000   tl_assert(isIRAtom(stData));
7001   dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7002   dataB = schemeE( mce, stData );
7003   gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7004}
7005
7006
7007/* Generate IR for origin shadowing for a plain store. */
7008static void do_origins_Store_plain ( MCEnv* mce,
7009                                     IREndness stEnd,
7010                                     IRExpr* stAddr,
7011                                     IRExpr* stData )
7012{
7013   do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7014                              NULL/*guard*/ );
7015}
7016
7017
7018/* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7019
7020static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7021{
7022   do_origins_Store_guarded( mce, sg->end, sg->addr,
7023                             sg->data, sg->guard );
7024}
7025
7026static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7027{
7028   IRType loadedTy = Ity_INVALID;
7029   switch (lg->cvt) {
7030      case ILGop_Ident32: loadedTy = Ity_I32; break;
7031      case ILGop_16Uto32: loadedTy = Ity_I16; break;
7032      case ILGop_16Sto32: loadedTy = Ity_I16; break;
7033      case ILGop_8Uto32:  loadedTy = Ity_I8;  break;
7034      case ILGop_8Sto32:  loadedTy = Ity_I8;  break;
7035      default: VG_(tool_panic)("schemeS.IRLoadG");
7036   }
7037   IRAtom* ori_alt
7038      = schemeE( mce,lg->alt );
7039   IRAtom* ori_final
7040      = expr2ori_Load_guarded_General(mce, loadedTy,
7041                                      lg->addr, 0/*addr bias*/,
7042                                      lg->guard, ori_alt );
7043   /* And finally, bind the origin to the destination temporary. */
7044   assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7045}
7046
7047
7048static void schemeS ( MCEnv* mce, IRStmt* st )
7049{
7050   tl_assert(MC_(clo_mc_level) == 3);
7051
7052   switch (st->tag) {
7053
7054      case Ist_AbiHint:
7055         /* The value-check instrumenter handles this - by arranging
7056            to pass the address of the next instruction to
7057            MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
7058            happen for origin tracking w.r.t. AbiHints.  So there is
7059            nothing to do here. */
7060         break;
7061
7062      case Ist_PutI: {
7063         IRPutI *puti = st->Ist.PutI.details;
7064         IRRegArray* descr_b;
7065         IRAtom      *t1, *t2, *t3, *t4;
7066         IRRegArray* descr = puti->descr;
7067         IRType equivIntTy
7068            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7069         /* If this array is unshadowable for whatever reason,
7070            generate no code. */
7071         if (equivIntTy == Ity_INVALID)
7072            break;
7073         tl_assert(sizeofIRType(equivIntTy) >= 4);
7074         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7075         descr_b
7076            = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7077                            equivIntTy, descr->nElems );
7078         /* Compute a value to Put - the conjoinment of the origin for
7079            the data to be Put-ted (obviously) and of the index value
7080            (not so obviously). */
7081         t1 = schemeE( mce, puti->data );
7082         t2 = schemeE( mce, puti->ix );
7083         t3 = gen_maxU32( mce, t1, t2 );
7084         t4 = zWidenFrom32( mce, equivIntTy, t3 );
7085         stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7086                                               puti->bias, t4) ));
7087         break;
7088      }
7089
7090      case Ist_Dirty:
7091         do_origins_Dirty( mce, st->Ist.Dirty.details );
7092         break;
7093
7094      case Ist_Store:
7095         do_origins_Store_plain( mce, st->Ist.Store.end,
7096                                      st->Ist.Store.addr,
7097                                      st->Ist.Store.data );
7098         break;
7099
7100      case Ist_StoreG:
7101         do_origins_StoreG( mce, st->Ist.StoreG.details );
7102         break;
7103
7104      case Ist_LoadG:
7105         do_origins_LoadG( mce, st->Ist.LoadG.details );
7106         break;
7107
7108      case Ist_LLSC: {
7109         /* In short: treat a load-linked like a normal load followed
7110            by an assignment of the loaded (shadow) data the result
7111            temporary.  Treat a store-conditional like a normal store,
7112            and mark the result temporary as defined. */
7113         if (st->Ist.LLSC.storedata == NULL) {
7114            /* Load Linked */
7115            IRType resTy
7116               = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7117            IRExpr* vanillaLoad
7118               = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7119            tl_assert(resTy == Ity_I64 || resTy == Ity_I32
7120                      || resTy == Ity_I16 || resTy == Ity_I8);
7121            assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7122                              schemeE(mce, vanillaLoad));
7123         } else {
7124            /* Store conditional */
7125            do_origins_Store_plain( mce, st->Ist.LLSC.end,
7126                                    st->Ist.LLSC.addr,
7127                                    st->Ist.LLSC.storedata );
7128            /* For the rationale behind this, see comments at the
7129               place where the V-shadow for .result is constructed, in
7130               do_shadow_LLSC.  In short, we regard .result as
7131               always-defined. */
7132            assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7133                              mkU32(0) );
7134         }
7135         break;
7136      }
7137
7138      case Ist_Put: {
7139         Int b_offset
7140            = MC_(get_otrack_shadow_offset)(
7141                 st->Ist.Put.offset,
7142                 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7143              );
7144         if (b_offset >= 0) {
7145            /* FIXME: this isn't an atom! */
7146            stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7147                                       schemeE( mce, st->Ist.Put.data )) );
7148         }
7149         break;
7150      }
7151
7152      case Ist_WrTmp:
7153         assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7154                           schemeE(mce, st->Ist.WrTmp.data) );
7155         break;
7156
7157      case Ist_MBE:
7158      case Ist_NoOp:
7159      case Ist_Exit:
7160      case Ist_IMark:
7161         break;
7162
7163      default:
7164         VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7165         ppIRStmt(st);
7166         VG_(tool_panic)("memcheck:schemeS");
7167   }
7168}
7169
7170
7171/*--------------------------------------------------------------------*/
7172/*--- end                                           mc_translate.c ---*/
7173/*--------------------------------------------------------------------*/
7174