1
2/*--------------------------------------------------------------------*/
3/*--- Instrument IR to perform memory checking operations.         ---*/
4/*---                                               mc_translate.c ---*/
5/*--------------------------------------------------------------------*/
6
7/*
8   This file is part of MemCheck, a heavyweight Valgrind tool for
9   detecting memory errors.
10
11   Copyright (C) 2000-2013 Julian Seward
12      jseward@acm.org
13
14   This program is free software; you can redistribute it and/or
15   modify it under the terms of the GNU General Public License as
16   published by the Free Software Foundation; either version 2 of the
17   License, or (at your option) any later version.
18
19   This program is distributed in the hope that it will be useful, but
20   WITHOUT ANY WARRANTY; without even the implied warranty of
21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22   General Public License for more details.
23
24   You should have received a copy of the GNU General Public License
25   along with this program; if not, write to the Free Software
26   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27   02111-1307, USA.
28
29   The GNU General Public License is contained in the file COPYING.
30*/
31
32#include "pub_tool_basics.h"
33#include "pub_tool_poolalloc.h"     // For mc_include.h
34#include "pub_tool_hashtable.h"     // For mc_include.h
35#include "pub_tool_libcassert.h"
36#include "pub_tool_libcprint.h"
37#include "pub_tool_tooliface.h"
38#include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
39#include "pub_tool_xarray.h"
40#include "pub_tool_mallocfree.h"
41#include "pub_tool_libcbase.h"
42
43#include "mc_include.h"
44
45
46/* FIXMEs JRS 2011-June-16.
47
48   Check the interpretation for vector narrowing and widening ops,
49   particularly the saturating ones.  I suspect they are either overly
50   pessimistic and/or wrong.
51
52   Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
53   saturating shifts): the interpretation is overly pessimistic.
54   See comments on the relevant cases below for details.
55
56   Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
57   both rounding and non-rounding variants): ditto
58*/
59
60/* This file implements the Memcheck instrumentation, and in
61   particular contains the core of its undefined value detection
62   machinery.  For a comprehensive background of the terminology,
63   algorithms and rationale used herein, read:
64
65     Using Valgrind to detect undefined value errors with
66     bit-precision
67
68     Julian Seward and Nicholas Nethercote
69
70     2005 USENIX Annual Technical Conference (General Track),
71     Anaheim, CA, USA, April 10-15, 2005.
72
73   ----
74
75   Here is as good a place as any to record exactly when V bits are and
76   should be checked, why, and what function is responsible.
77
78
79   Memcheck complains when an undefined value is used:
80
81   1. In the condition of a conditional branch.  Because it could cause
82      incorrect control flow, and thus cause incorrect externally-visible
83      behaviour.  [mc_translate.c:complainIfUndefined]
84
85   2. As an argument to a system call, or as the value that specifies
86      the system call number.  Because it could cause an incorrect
87      externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]
88
89   3. As the address in a load or store.  Because it could cause an
90      incorrect value to be used later, which could cause externally-visible
91      behaviour (eg. via incorrect control flow or an incorrect system call
92      argument)  [complainIfUndefined]
93
94   4. As the target address of a branch.  Because it could cause incorrect
95      control flow.  [complainIfUndefined]
96
97   5. As an argument to setenv, unsetenv, or putenv.  Because it could put
98      an incorrect value into the external environment.
99      [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
100
101   6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
102      [complainIfUndefined]
103
104   7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
105      VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
106      requested it.  [in memcheck.h]
107
108
109   Memcheck also complains, but should not, when an undefined value is used:
110
111   8. As the shift value in certain SIMD shift operations (but not in the
112      standard integer shift operations).  This inconsistency is due to
113      historical reasons.)  [complainIfUndefined]
114
115
116   Memcheck does not complain, but should, when an undefined value is used:
117
118   9. As an input to a client request.  Because the client request may
119      affect the visible behaviour -- see bug #144362 for an example
120      involving the malloc replacements in vg_replace_malloc.c and
121      VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
122      isn't identified.  That bug report also has some info on how to solve
123      the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124
125
126   In practice, 1 and 2 account for the vast majority of cases.
127*/
128
129/* Generation of addr-definedness, addr-validity and
130   guard-definedness checks pertaining to loads and stores (Iex_Load,
131   Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
132   loads/stores) was re-checked 11 May 2013. */
133
134/*------------------------------------------------------------*/
135/*--- Forward decls                                        ---*/
136/*------------------------------------------------------------*/
137
138struct _MCEnv;
139
140static IRType  shadowTypeV ( IRType ty );
141static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
142static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
143
144static IRExpr *i128_const_zero(void);
145
146/*------------------------------------------------------------*/
147/*--- Memcheck running state, and tmp management.          ---*/
148/*------------------------------------------------------------*/
149
150/* Carries info about a particular tmp.  The tmp's number is not
151   recorded, as this is implied by (equal to) its index in the tmpMap
152   in MCEnv.  The tmp's type is also not recorded, as this is present
153   in MCEnv.sb->tyenv.
154
155   When .kind is Orig, .shadowV and .shadowB may give the identities
156   of the temps currently holding the associated definedness (shadowV)
157   and origin (shadowB) values, or these may be IRTemp_INVALID if code
158   to compute such values has not yet been emitted.
159
160   When .kind is VSh or BSh then the tmp is holds a V- or B- value,
161   and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
162   illogical for a shadow tmp itself to be shadowed.
163*/
164typedef
165   enum { Orig=1, VSh=2, BSh=3 }
166   TempKind;
167
168typedef
169   struct {
170      TempKind kind;
171      IRTemp   shadowV;
172      IRTemp   shadowB;
173   }
174   TempMapEnt;
175
176
177/* Carries around state during memcheck instrumentation. */
178typedef
179   struct _MCEnv {
180      /* MODIFIED: the superblock being constructed.  IRStmts are
181         added. */
182      IRSB* sb;
183      Bool  trace;
184
185      /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
186         current kind and possibly shadow temps for each temp in the
187         IRSB being constructed.  Note that it does not contain the
188         type of each tmp.  If you want to know the type, look at the
189         relevant entry in sb->tyenv.  It follows that at all times
190         during the instrumentation process, the valid indices for
191         tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
192         total number of Orig, V- and B- temps allocated so far.
193
194         The reason for this strange split (types in one place, all
195         other info in another) is that we need the types to be
196         attached to sb so as to make it possible to do
197         "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
198         instrumentation process. */
199      XArray* /* of TempMapEnt */ tmpMap;
200
201      /* MODIFIED: indicates whether "bogus" literals have so far been
202         found.  Starts off False, and may change to True. */
203      Bool bogusLiterals;
204
205      /* READONLY: indicates whether we should use expensive
206         interpretations of integer adds, since unfortunately LLVM
207         uses them to do ORs in some circumstances.  Defaulted to True
208         on MacOS and False everywhere else. */
209      Bool useLLVMworkarounds;
210
211      /* READONLY: the guest layout.  This indicates which parts of
212         the guest state should be regarded as 'always defined'. */
213      const VexGuestLayout* layout;
214
215      /* READONLY: the host word type.  Needed for constructing
216         arguments of type 'HWord' to be passed to helper functions.
217         Ity_I32 or Ity_I64 only. */
218      IRType hWordTy;
219   }
220   MCEnv;
221
222/* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
223   demand), as they are encountered.  This is for two reasons.
224
225   (1) (less important reason): Many original tmps are unused due to
226   initial IR optimisation, and we do not want to spaces in tables
227   tracking them.
228
229   Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
230   table indexed [0 .. n_types-1], which gives the current shadow for
231   each original tmp, or INVALID_IRTEMP if none is so far assigned.
232   It is necessary to support making multiple assignments to a shadow
233   -- specifically, after testing a shadow for definedness, it needs
234   to be made defined.  But IR's SSA property disallows this.
235
236   (2) (more important reason): Therefore, when a shadow needs to get
237   a new value, a new temporary is created, the value is assigned to
238   that, and the tmpMap is updated to reflect the new binding.
239
240   A corollary is that if the tmpMap maps a given tmp to
241   IRTemp_INVALID and we are hoping to read that shadow tmp, it means
242   there's a read-before-write error in the original tmps.  The IR
243   sanity checker should catch all such anomalies, however.
244*/
245
246/* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
247   both the table in mce->sb and to our auxiliary mapping.  Note that
248   newTemp may cause mce->tmpMap to resize, hence previous results
249   from VG_(indexXA)(mce->tmpMap) are invalidated. */
250static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
251{
252   Word       newIx;
253   TempMapEnt ent;
254   IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
255   ent.kind    = kind;
256   ent.shadowV = IRTemp_INVALID;
257   ent.shadowB = IRTemp_INVALID;
258   newIx = VG_(addToXA)( mce->tmpMap, &ent );
259   tl_assert(newIx == (Word)tmp);
260   return tmp;
261}
262
263
264/* Find the tmp currently shadowing the given original tmp.  If none
265   so far exists, allocate one.  */
266static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
267{
268   TempMapEnt* ent;
269   /* VG_(indexXA) range-checks 'orig', hence no need to check
270      here. */
271   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
272   tl_assert(ent->kind == Orig);
273   if (ent->shadowV == IRTemp_INVALID) {
274      IRTemp tmpV
275        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
276      /* newTemp may cause mce->tmpMap to resize, hence previous results
277         from VG_(indexXA) are invalid. */
278      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
279      tl_assert(ent->kind == Orig);
280      tl_assert(ent->shadowV == IRTemp_INVALID);
281      ent->shadowV = tmpV;
282   }
283   return ent->shadowV;
284}
285
286/* Allocate a new shadow for the given original tmp.  This means any
287   previous shadow is abandoned.  This is needed because it is
288   necessary to give a new value to a shadow once it has been tested
289   for undefinedness, but unfortunately IR's SSA property disallows
290   this.  Instead we must abandon the old shadow, allocate a new one
291   and use that instead.
292
293   This is the same as findShadowTmpV, except we don't bother to see
294   if a shadow temp already existed -- we simply allocate a new one
295   regardless. */
296static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
297{
298   TempMapEnt* ent;
299   /* VG_(indexXA) range-checks 'orig', hence no need to check
300      here. */
301   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
302   tl_assert(ent->kind == Orig);
303   if (1) {
304      IRTemp tmpV
305        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
306      /* newTemp may cause mce->tmpMap to resize, hence previous results
307         from VG_(indexXA) are invalid. */
308      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
309      tl_assert(ent->kind == Orig);
310      ent->shadowV = tmpV;
311   }
312}
313
314
315/*------------------------------------------------------------*/
316/*--- IRAtoms -- a subset of IRExprs                       ---*/
317/*------------------------------------------------------------*/
318
319/* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
320   isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
321   input, most of this code deals in atoms.  Usefully, a value atom
322   always has a V-value which is also an atom: constants are shadowed
323   by constants, and temps are shadowed by the corresponding shadow
324   temporary. */
325
326typedef  IRExpr  IRAtom;
327
328/* (used for sanity checks only): is this an atom which looks
329   like it's from original code? */
330static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
331{
332   if (a1->tag == Iex_Const)
333      return True;
334   if (a1->tag == Iex_RdTmp) {
335      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
336      return ent->kind == Orig;
337   }
338   return False;
339}
340
341/* (used for sanity checks only): is this an atom which looks
342   like it's from shadow code? */
343static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
344{
345   if (a1->tag == Iex_Const)
346      return True;
347   if (a1->tag == Iex_RdTmp) {
348      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
349      return ent->kind == VSh || ent->kind == BSh;
350   }
351   return False;
352}
353
354/* (used for sanity checks only): check that both args are atoms and
355   are identically-kinded. */
356static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
357{
358   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
359      return True;
360   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
361      return True;
362   return False;
363}
364
365
366/*------------------------------------------------------------*/
367/*--- Type management                                      ---*/
368/*------------------------------------------------------------*/
369
370/* Shadow state is always accessed using integer types.  This returns
371   an integer type with the same size (as per sizeofIRType) as the
372   given type.  The only valid shadow types are Bit, I8, I16, I32,
373   I64, I128, V128, V256. */
374
375static IRType shadowTypeV ( IRType ty )
376{
377   switch (ty) {
378      case Ity_I1:
379      case Ity_I8:
380      case Ity_I16:
381      case Ity_I32:
382      case Ity_I64:
383      case Ity_I128: return ty;
384      case Ity_F16:  return Ity_I16;
385      case Ity_F32:  return Ity_I32;
386      case Ity_D32:  return Ity_I32;
387      case Ity_F64:  return Ity_I64;
388      case Ity_D64:  return Ity_I64;
389      case Ity_F128: return Ity_I128;
390      case Ity_D128: return Ity_I128;
391      case Ity_V128: return Ity_V128;
392      case Ity_V256: return Ity_V256;
393      default: ppIRType(ty);
394               VG_(tool_panic)("memcheck:shadowTypeV");
395   }
396}
397
398/* Produce a 'defined' value of the given shadow type.  Should only be
399   supplied shadow types (Bit/I8/I16/I32/UI64). */
400static IRExpr* definedOfType ( IRType ty ) {
401   switch (ty) {
402      case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
403      case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
404      case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
405      case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
406      case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
407      case Ity_I128: return i128_const_zero();
408      case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
409      case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
410      default:       VG_(tool_panic)("memcheck:definedOfType");
411   }
412}
413
414
415/*------------------------------------------------------------*/
416/*--- Constructing IR fragments                            ---*/
417/*------------------------------------------------------------*/
418
419/* add stmt to a bb */
420static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
421   if (mce->trace) {
422      VG_(printf)("  %c: ", cat);
423      ppIRStmt(st);
424      VG_(printf)("\n");
425   }
426   addStmtToIRSB(mce->sb, st);
427}
428
429/* assign value to tmp */
430static inline
431void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
432   stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
433}
434
435/* build various kinds of expressions */
436#define triop(_op, _arg1, _arg2, _arg3) \
437                                 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
438#define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
439#define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
440#define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
441#define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
442#define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
443#define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
444#define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
445#define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
446#define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
447
448/* Bind the given expression to a new temporary, and return the
449   temporary.  This effectively converts an arbitrary expression into
450   an atom.
451
452   'ty' is the type of 'e' and hence the type that the new temporary
453   needs to be.  But passing it in is redundant, since we can deduce
454   the type merely by inspecting 'e'.  So at least use that fact to
455   assert that the two types agree. */
456static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
457{
458   TempKind k;
459   IRTemp   t;
460   IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
461
462   tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
463   switch (cat) {
464      case 'V': k = VSh;  break;
465      case 'B': k = BSh;  break;
466      case 'C': k = Orig; break;
467                /* happens when we are making up new "orig"
468                   expressions, for IRCAS handling */
469      default: tl_assert(0);
470   }
471   t = newTemp(mce, ty, k);
472   assign(cat, mce, t, e);
473   return mkexpr(t);
474}
475
476
477/*------------------------------------------------------------*/
478/*--- Helper functions for 128-bit ops                     ---*/
479/*------------------------------------------------------------*/
480
481static IRExpr *i128_const_zero(void)
482{
483   IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
484   return binop(Iop_64HLto128, z64, z64);
485}
486
487/* There are no I128-bit loads and/or stores [as generated by any
488   current front ends].  So we do not need to worry about that in
489   expr2vbits_Load */
490
491
492/*------------------------------------------------------------*/
493/*--- Constructing definedness primitive ops               ---*/
494/*------------------------------------------------------------*/
495
496/* --------- Defined-if-either-defined --------- */
497
498static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
499   tl_assert(isShadowAtom(mce,a1));
500   tl_assert(isShadowAtom(mce,a2));
501   return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
502}
503
504static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
505   tl_assert(isShadowAtom(mce,a1));
506   tl_assert(isShadowAtom(mce,a2));
507   return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
508}
509
510static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
511   tl_assert(isShadowAtom(mce,a1));
512   tl_assert(isShadowAtom(mce,a2));
513   return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
514}
515
516static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
517   tl_assert(isShadowAtom(mce,a1));
518   tl_assert(isShadowAtom(mce,a2));
519   return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
520}
521
522static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
523   tl_assert(isShadowAtom(mce,a1));
524   tl_assert(isShadowAtom(mce,a2));
525   return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
526}
527
528static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
529   tl_assert(isShadowAtom(mce,a1));
530   tl_assert(isShadowAtom(mce,a2));
531   return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
532}
533
534/* --------- Undefined-if-either-undefined --------- */
535
536static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
537   tl_assert(isShadowAtom(mce,a1));
538   tl_assert(isShadowAtom(mce,a2));
539   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
540}
541
542static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
543   tl_assert(isShadowAtom(mce,a1));
544   tl_assert(isShadowAtom(mce,a2));
545   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
546}
547
548static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
549   tl_assert(isShadowAtom(mce,a1));
550   tl_assert(isShadowAtom(mce,a2));
551   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
552}
553
554static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
555   tl_assert(isShadowAtom(mce,a1));
556   tl_assert(isShadowAtom(mce,a2));
557   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
558}
559
560static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
561   IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
562   tl_assert(isShadowAtom(mce,a1));
563   tl_assert(isShadowAtom(mce,a2));
564   tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
565   tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
566   tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
567   tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
568   tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
569   tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
570
571   return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
572}
573
574static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
575   tl_assert(isShadowAtom(mce,a1));
576   tl_assert(isShadowAtom(mce,a2));
577   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
578}
579
580static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
581   tl_assert(isShadowAtom(mce,a1));
582   tl_assert(isShadowAtom(mce,a2));
583   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
584}
585
586static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
587   switch (vty) {
588      case Ity_I8:   return mkUifU8(mce, a1, a2);
589      case Ity_I16:  return mkUifU16(mce, a1, a2);
590      case Ity_I32:  return mkUifU32(mce, a1, a2);
591      case Ity_I64:  return mkUifU64(mce, a1, a2);
592      case Ity_I128: return mkUifU128(mce, a1, a2);
593      case Ity_V128: return mkUifUV128(mce, a1, a2);
594      case Ity_V256: return mkUifUV256(mce, a1, a2);
595      default:
596         VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
597         VG_(tool_panic)("memcheck:mkUifU");
598   }
599}
600
601/* --------- The Left-family of operations. --------- */
602
603static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
604   tl_assert(isShadowAtom(mce,a1));
605   return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
606}
607
608static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
609   tl_assert(isShadowAtom(mce,a1));
610   return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
611}
612
613static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
614   tl_assert(isShadowAtom(mce,a1));
615   return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
616}
617
618static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
619   tl_assert(isShadowAtom(mce,a1));
620   return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
621}
622
623/* --------- 'Improvement' functions for AND/OR. --------- */
624
625/* ImproveAND(data, vbits) = data OR vbits.  Defined (0) data 0s give
626   defined (0); all other -> undefined (1).
627*/
628static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
629{
630   tl_assert(isOriginalAtom(mce, data));
631   tl_assert(isShadowAtom(mce, vbits));
632   tl_assert(sameKindedAtoms(data, vbits));
633   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
634}
635
636static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
637{
638   tl_assert(isOriginalAtom(mce, data));
639   tl_assert(isShadowAtom(mce, vbits));
640   tl_assert(sameKindedAtoms(data, vbits));
641   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
642}
643
644static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
645{
646   tl_assert(isOriginalAtom(mce, data));
647   tl_assert(isShadowAtom(mce, vbits));
648   tl_assert(sameKindedAtoms(data, vbits));
649   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
650}
651
652static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
653{
654   tl_assert(isOriginalAtom(mce, data));
655   tl_assert(isShadowAtom(mce, vbits));
656   tl_assert(sameKindedAtoms(data, vbits));
657   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
658}
659
660static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
661{
662   tl_assert(isOriginalAtom(mce, data));
663   tl_assert(isShadowAtom(mce, vbits));
664   tl_assert(sameKindedAtoms(data, vbits));
665   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
666}
667
668static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
669{
670   tl_assert(isOriginalAtom(mce, data));
671   tl_assert(isShadowAtom(mce, vbits));
672   tl_assert(sameKindedAtoms(data, vbits));
673   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
674}
675
676/* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
677   defined (0); all other -> undefined (1).
678*/
679static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
680{
681   tl_assert(isOriginalAtom(mce, data));
682   tl_assert(isShadowAtom(mce, vbits));
683   tl_assert(sameKindedAtoms(data, vbits));
684   return assignNew(
685             'V', mce, Ity_I8,
686             binop(Iop_Or8,
687                   assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
688                   vbits) );
689}
690
691static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
692{
693   tl_assert(isOriginalAtom(mce, data));
694   tl_assert(isShadowAtom(mce, vbits));
695   tl_assert(sameKindedAtoms(data, vbits));
696   return assignNew(
697             'V', mce, Ity_I16,
698             binop(Iop_Or16,
699                   assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
700                   vbits) );
701}
702
703static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
704{
705   tl_assert(isOriginalAtom(mce, data));
706   tl_assert(isShadowAtom(mce, vbits));
707   tl_assert(sameKindedAtoms(data, vbits));
708   return assignNew(
709             'V', mce, Ity_I32,
710             binop(Iop_Or32,
711                   assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
712                   vbits) );
713}
714
715static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
716{
717   tl_assert(isOriginalAtom(mce, data));
718   tl_assert(isShadowAtom(mce, vbits));
719   tl_assert(sameKindedAtoms(data, vbits));
720   return assignNew(
721             'V', mce, Ity_I64,
722             binop(Iop_Or64,
723                   assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
724                   vbits) );
725}
726
727static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
728{
729   tl_assert(isOriginalAtom(mce, data));
730   tl_assert(isShadowAtom(mce, vbits));
731   tl_assert(sameKindedAtoms(data, vbits));
732   return assignNew(
733             'V', mce, Ity_V128,
734             binop(Iop_OrV128,
735                   assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
736                   vbits) );
737}
738
739static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
740{
741   tl_assert(isOriginalAtom(mce, data));
742   tl_assert(isShadowAtom(mce, vbits));
743   tl_assert(sameKindedAtoms(data, vbits));
744   return assignNew(
745             'V', mce, Ity_V256,
746             binop(Iop_OrV256,
747                   assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
748                   vbits) );
749}
750
751/* --------- Pessimising casts. --------- */
752
753/* The function returns an expression of type DST_TY. If any of the VBITS
754   is undefined (value == 1) the resulting expression has all bits set to
755   1. Otherwise, all bits are 0. */
756
757static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
758{
759   IRType  src_ty;
760   IRAtom* tmp1;
761
762   /* Note, dst_ty is a shadow type, not an original type. */
763   tl_assert(isShadowAtom(mce,vbits));
764   src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
765
766   /* Fast-track some common cases */
767   if (src_ty == Ity_I32 && dst_ty == Ity_I32)
768      return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
769
770   if (src_ty == Ity_I64 && dst_ty == Ity_I64)
771      return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
772
773   if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
774      /* PCast the arg, then clone it. */
775      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
776      return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
777   }
778
779   if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
780      /* PCast the arg, then clone it 4 times. */
781      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
782      tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
783      return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
784   }
785
786   if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
787      /* PCast the arg, then clone it 8 times. */
788      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
789      tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
790      tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
791      return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
792   }
793
794   if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
795      /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
796         the top half. */
797      IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
798      return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
799   }
800
801   if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
802      /* Use InterleaveHI64x2 to copy the top half of the vector into
803         the bottom half.  Then we can UifU it with the original, throw
804         away the upper half of the result, and PCast-I64-to-I64
805         the lower half. */
806      // Generates vbits[127:64] : vbits[127:64]
807      IRAtom* hi64hi64
808         = assignNew('V', mce, Ity_V128,
809                     binop(Iop_InterleaveHI64x2, vbits, vbits));
810      // Generates
811      //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
812      //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
813      IRAtom* lohi64
814         = mkUifUV128(mce, hi64hi64, vbits);
815      // Generates UifU(vbits[127:64],vbits[63:0])
816      IRAtom* lo64
817         = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
818      // Generates
819      //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
820      //   == PCast-to-I64( vbits[127:0] )
821      IRAtom* res
822         = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
823      return res;
824   }
825
826   /* Else do it the slow way .. */
827   /* First of all, collapse vbits down to a single bit. */
828   tmp1   = NULL;
829   switch (src_ty) {
830      case Ity_I1:
831         tmp1 = vbits;
832         break;
833      case Ity_I8:
834         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
835         break;
836      case Ity_I16:
837         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
838         break;
839      case Ity_I32:
840         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
841         break;
842      case Ity_I64:
843         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
844         break;
845      case Ity_I128: {
846         /* Gah.  Chop it in half, OR the halves together, and compare
847            that with zero. */
848         IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
849         IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
850         IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
851         tmp1         = assignNew('V', mce, Ity_I1,
852                                       unop(Iop_CmpNEZ64, tmp4));
853         break;
854      }
855      default:
856         ppIRType(src_ty);
857         VG_(tool_panic)("mkPCastTo(1)");
858   }
859   tl_assert(tmp1);
860   /* Now widen up to the dst type. */
861   switch (dst_ty) {
862      case Ity_I1:
863         return tmp1;
864      case Ity_I8:
865         return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
866      case Ity_I16:
867         return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
868      case Ity_I32:
869         return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
870      case Ity_I64:
871         return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
872      case Ity_V128:
873         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
874         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
875         return tmp1;
876      case Ity_I128:
877         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
878         tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
879         return tmp1;
880      case Ity_V256:
881         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
882         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
883                                                    tmp1, tmp1));
884         tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
885                                                    tmp1, tmp1));
886         return tmp1;
887      default:
888         ppIRType(dst_ty);
889         VG_(tool_panic)("mkPCastTo(2)");
890   }
891}
892
893/* This is a minor variant.  It takes an arg of some type and returns
894   a value of the same type.  The result consists entirely of Defined
895   (zero) bits except its least significant bit, which is a PCast of
896   the entire argument down to a single bit. */
897static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
898{
899   if (ty == Ity_V128) {
900      /* --- Case for V128 --- */
901      IRAtom* varg128 = varg;
902      // generates: PCast-to-I64(varg128)
903      IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
904      // Now introduce zeros (defined bits) in the top 63 places
905      // generates: Def--(63)--Def PCast-to-I1(varg128)
906      IRAtom* d63pc
907         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
908      // generates: Def--(64)--Def
909      IRAtom* d64
910         = definedOfType(Ity_I64);
911      // generates: Def--(127)--Def PCast-to-I1(varg128)
912      IRAtom* res
913         = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
914      return res;
915   }
916   if (ty == Ity_I64) {
917      /* --- Case for I64 --- */
918      // PCast to 64
919      IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
920      // Zero (Def) out the top 63 bits
921      IRAtom* res
922         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
923      return res;
924   }
925   /*NOTREACHED*/
926   tl_assert(0);
927}
928
929/* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
930/*
931   Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
932   PCasting to Ity_U1.  However, sometimes it is necessary to be more
933   accurate.  The insight is that the result is defined if two
934   corresponding bits can be found, one from each argument, so that
935   both bits are defined but are different -- that makes EQ say "No"
936   and NE say "Yes".  Hence, we compute an improvement term and DifD
937   it onto the "normal" (UifU) result.
938
939   The result is:
940
941   PCastTo<1> (
942      -- naive version
943      PCastTo<sz>( UifU<sz>(vxx, vyy) )
944
945      `DifD<sz>`
946
947      -- improvement term
948      PCastTo<sz>( PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) ) )
949   )
950
951   where
952     vec contains 0 (defined) bits where the corresponding arg bits
953     are defined but different, and 1 bits otherwise.
954
955     vec = Or<sz>( vxx,   // 0 iff bit defined
956                   vyy,   // 0 iff bit defined
957                   Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
958                 )
959
960     If any bit of vec is 0, the result is defined and so the
961     improvement term should produce 0...0, else it should produce
962     1...1.
963
964     Hence require for the improvement term:
965
966        if vec == 1...1 then 1...1 else 0...0
967     ->
968        PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) )
969
970   This was extensively re-analysed and checked on 6 July 05.
971*/
972static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
973                                    IRType  ty,
974                                    IRAtom* vxx, IRAtom* vyy,
975                                    IRAtom* xx,  IRAtom* yy )
976{
977   IRAtom *naive, *vec, *improvement_term;
978   IRAtom *improved, *final_cast, *top;
979   IROp   opDIFD, opUIFU, opXOR, opNOT, opCMP, opOR;
980
981   tl_assert(isShadowAtom(mce,vxx));
982   tl_assert(isShadowAtom(mce,vyy));
983   tl_assert(isOriginalAtom(mce,xx));
984   tl_assert(isOriginalAtom(mce,yy));
985   tl_assert(sameKindedAtoms(vxx,xx));
986   tl_assert(sameKindedAtoms(vyy,yy));
987
988   switch (ty) {
989      case Ity_I16:
990         opOR   = Iop_Or16;
991         opDIFD = Iop_And16;
992         opUIFU = Iop_Or16;
993         opNOT  = Iop_Not16;
994         opXOR  = Iop_Xor16;
995         opCMP  = Iop_CmpEQ16;
996         top    = mkU16(0xFFFF);
997         break;
998      case Ity_I32:
999         opOR   = Iop_Or32;
1000         opDIFD = Iop_And32;
1001         opUIFU = Iop_Or32;
1002         opNOT  = Iop_Not32;
1003         opXOR  = Iop_Xor32;
1004         opCMP  = Iop_CmpEQ32;
1005         top    = mkU32(0xFFFFFFFF);
1006         break;
1007      case Ity_I64:
1008         opOR   = Iop_Or64;
1009         opDIFD = Iop_And64;
1010         opUIFU = Iop_Or64;
1011         opNOT  = Iop_Not64;
1012         opXOR  = Iop_Xor64;
1013         opCMP  = Iop_CmpEQ64;
1014         top    = mkU64(0xFFFFFFFFFFFFFFFFULL);
1015         break;
1016      default:
1017         VG_(tool_panic)("expensiveCmpEQorNE");
1018   }
1019
1020   naive
1021      = mkPCastTo(mce,ty,
1022                  assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
1023
1024   vec
1025      = assignNew(
1026           'V', mce,ty,
1027           binop( opOR,
1028                  assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
1029                  assignNew(
1030                     'V', mce,ty,
1031                     unop( opNOT,
1032                           assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1033
1034   improvement_term
1035      = mkPCastTo( mce,ty,
1036                   assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
1037
1038   improved
1039      = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
1040
1041   final_cast
1042      = mkPCastTo( mce, Ity_I1, improved );
1043
1044   return final_cast;
1045}
1046
1047
1048/* --------- Semi-accurate interpretation of CmpORD. --------- */
1049
1050/* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1051
1052      CmpORD32S(x,y) = 1<<3   if  x <s y
1053                     = 1<<2   if  x >s y
1054                     = 1<<1   if  x == y
1055
1056   and similarly the unsigned variant.  The default interpretation is:
1057
1058      CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1059                                  & (7<<1)
1060
1061   The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1062   are zero and therefore defined (viz, zero).
1063
1064   Also deal with a special case better:
1065
1066      CmpORD32S(x,0)
1067
1068   Here, bit 3 (LT) of the result is a copy of the top bit of x and
1069   will be defined even if the rest of x isn't.  In which case we do:
1070
1071      CmpORD32S#(x,x#,0,{impliedly 0}#)
1072         = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
1073           | (x# >>u 31) << 3      -- LT# = x#[31]
1074
1075   Analogous handling for CmpORD64{S,U}.
1076*/
1077static Bool isZeroU32 ( IRAtom* e )
1078{
1079   return
1080      toBool( e->tag == Iex_Const
1081              && e->Iex.Const.con->tag == Ico_U32
1082              && e->Iex.Const.con->Ico.U32 == 0 );
1083}
1084
1085static Bool isZeroU64 ( IRAtom* e )
1086{
1087   return
1088      toBool( e->tag == Iex_Const
1089              && e->Iex.Const.con->tag == Ico_U64
1090              && e->Iex.Const.con->Ico.U64 == 0 );
1091}
1092
1093static IRAtom* doCmpORD ( MCEnv*  mce,
1094                          IROp    cmp_op,
1095                          IRAtom* xxhash, IRAtom* yyhash,
1096                          IRAtom* xx,     IRAtom* yy )
1097{
1098   Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1099   Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1100   IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
1101   IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
1102   IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
1103   IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
1104   IRType ty     = m64 ? Ity_I64   : Ity_I32;
1105   Int    width  = m64 ? 64        : 32;
1106
1107   Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1108
1109   IRAtom* threeLeft1 = NULL;
1110   IRAtom* sevenLeft1 = NULL;
1111
1112   tl_assert(isShadowAtom(mce,xxhash));
1113   tl_assert(isShadowAtom(mce,yyhash));
1114   tl_assert(isOriginalAtom(mce,xx));
1115   tl_assert(isOriginalAtom(mce,yy));
1116   tl_assert(sameKindedAtoms(xxhash,xx));
1117   tl_assert(sameKindedAtoms(yyhash,yy));
1118   tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1119             || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1120
1121   if (0) {
1122      ppIROp(cmp_op); VG_(printf)(" ");
1123      ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1124   }
1125
1126   if (syned && isZero(yy)) {
1127      /* fancy interpretation */
1128      /* if yy is zero, then it must be fully defined (zero#). */
1129      tl_assert(isZero(yyhash));
1130      threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
1131      return
1132         binop(
1133            opOR,
1134            assignNew(
1135               'V', mce,ty,
1136               binop(
1137                  opAND,
1138                  mkPCastTo(mce,ty, xxhash),
1139                  threeLeft1
1140               )),
1141            assignNew(
1142               'V', mce,ty,
1143               binop(
1144                  opSHL,
1145                  assignNew(
1146                     'V', mce,ty,
1147                     binop(opSHR, xxhash, mkU8(width-1))),
1148                  mkU8(3)
1149               ))
1150	 );
1151   } else {
1152      /* standard interpretation */
1153      sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1154      return
1155         binop(
1156            opAND,
1157            mkPCastTo( mce,ty,
1158                       mkUifU(mce,ty, xxhash,yyhash)),
1159            sevenLeft1
1160         );
1161   }
1162}
1163
1164
1165/*------------------------------------------------------------*/
1166/*--- Emit a test and complaint if something is undefined. ---*/
1167/*------------------------------------------------------------*/
1168
1169static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1170
1171
1172/* Set the annotations on a dirty helper to indicate that the stack
1173   pointer and instruction pointers might be read.  This is the
1174   behaviour of all 'emit-a-complaint' style functions we might
1175   call. */
1176
1177static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1178   di->nFxState = 2;
1179   di->fxState[0].fx        = Ifx_Read;
1180   di->fxState[0].offset    = mce->layout->offset_SP;
1181   di->fxState[0].size      = mce->layout->sizeof_SP;
1182   di->fxState[0].nRepeats  = 0;
1183   di->fxState[0].repeatLen = 0;
1184   di->fxState[1].fx        = Ifx_Read;
1185   di->fxState[1].offset    = mce->layout->offset_IP;
1186   di->fxState[1].size      = mce->layout->sizeof_IP;
1187   di->fxState[1].nRepeats  = 0;
1188   di->fxState[1].repeatLen = 0;
1189}
1190
1191
1192/* Check the supplied *original* |atom| for undefinedness, and emit a
1193   complaint if so.  Once that happens, mark it as defined.  This is
1194   possible because the atom is either a tmp or literal.  If it's a
1195   tmp, it will be shadowed by a tmp, and so we can set the shadow to
1196   be defined.  In fact as mentioned above, we will have to allocate a
1197   new tmp to carry the new 'defined' shadow value, and update the
1198   original->tmp mapping accordingly; we cannot simply assign a new
1199   value to an existing shadow tmp as this breaks SSAness.
1200
1201   The checks are performed, any resulting complaint emitted, and
1202   |atom|'s shadow temp set to 'defined', ONLY in the case that
1203   |guard| evaluates to True at run-time.  If it evaluates to False
1204   then no action is performed.  If |guard| is NULL (the usual case)
1205   then it is assumed to be always-true, and hence these actions are
1206   performed unconditionally.
1207
1208   This routine does not generate code to check the definedness of
1209   |guard|.  The caller is assumed to have taken care of that already.
1210*/
1211static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1212{
1213   IRAtom*  vatom;
1214   IRType   ty;
1215   Int      sz;
1216   IRDirty* di;
1217   IRAtom*  cond;
1218   IRAtom*  origin;
1219   void*    fn;
1220   const HChar* nm;
1221   IRExpr** args;
1222   Int      nargs;
1223
1224   // Don't do V bit tests if we're not reporting undefined value errors.
1225   if (MC_(clo_mc_level) == 1)
1226      return;
1227
1228   if (guard)
1229      tl_assert(isOriginalAtom(mce, guard));
1230
1231   /* Since the original expression is atomic, there's no duplicated
1232      work generated by making multiple V-expressions for it.  So we
1233      don't really care about the possibility that someone else may
1234      also create a V-interpretion for it. */
1235   tl_assert(isOriginalAtom(mce, atom));
1236   vatom = expr2vbits( mce, atom );
1237   tl_assert(isShadowAtom(mce, vatom));
1238   tl_assert(sameKindedAtoms(atom, vatom));
1239
1240   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1241
1242   /* sz is only used for constructing the error message */
1243   sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1244
1245   cond = mkPCastTo( mce, Ity_I1, vatom );
1246   /* cond will be 0 if all defined, and 1 if any not defined. */
1247
1248   /* Get the origin info for the value we are about to check.  At
1249      least, if we are doing origin tracking.  If not, use a dummy
1250      zero origin. */
1251   if (MC_(clo_mc_level) == 3) {
1252      origin = schemeE( mce, atom );
1253      if (mce->hWordTy == Ity_I64) {
1254         origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1255      }
1256   } else {
1257      origin = NULL;
1258   }
1259
1260   fn    = NULL;
1261   nm    = NULL;
1262   args  = NULL;
1263   nargs = -1;
1264
1265   switch (sz) {
1266      case 0:
1267         if (origin) {
1268            fn    = &MC_(helperc_value_check0_fail_w_o);
1269            nm    = "MC_(helperc_value_check0_fail_w_o)";
1270            args  = mkIRExprVec_1(origin);
1271            nargs = 1;
1272         } else {
1273            fn    = &MC_(helperc_value_check0_fail_no_o);
1274            nm    = "MC_(helperc_value_check0_fail_no_o)";
1275            args  = mkIRExprVec_0();
1276            nargs = 0;
1277         }
1278         break;
1279      case 1:
1280         if (origin) {
1281            fn    = &MC_(helperc_value_check1_fail_w_o);
1282            nm    = "MC_(helperc_value_check1_fail_w_o)";
1283            args  = mkIRExprVec_1(origin);
1284            nargs = 1;
1285         } else {
1286            fn    = &MC_(helperc_value_check1_fail_no_o);
1287            nm    = "MC_(helperc_value_check1_fail_no_o)";
1288            args  = mkIRExprVec_0();
1289            nargs = 0;
1290         }
1291         break;
1292      case 4:
1293         if (origin) {
1294            fn    = &MC_(helperc_value_check4_fail_w_o);
1295            nm    = "MC_(helperc_value_check4_fail_w_o)";
1296            args  = mkIRExprVec_1(origin);
1297            nargs = 1;
1298         } else {
1299            fn    = &MC_(helperc_value_check4_fail_no_o);
1300            nm    = "MC_(helperc_value_check4_fail_no_o)";
1301            args  = mkIRExprVec_0();
1302            nargs = 0;
1303         }
1304         break;
1305      case 8:
1306         if (origin) {
1307            fn    = &MC_(helperc_value_check8_fail_w_o);
1308            nm    = "MC_(helperc_value_check8_fail_w_o)";
1309            args  = mkIRExprVec_1(origin);
1310            nargs = 1;
1311         } else {
1312            fn    = &MC_(helperc_value_check8_fail_no_o);
1313            nm    = "MC_(helperc_value_check8_fail_no_o)";
1314            args  = mkIRExprVec_0();
1315            nargs = 0;
1316         }
1317         break;
1318      case 2:
1319      case 16:
1320         if (origin) {
1321            fn    = &MC_(helperc_value_checkN_fail_w_o);
1322            nm    = "MC_(helperc_value_checkN_fail_w_o)";
1323            args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1324            nargs = 2;
1325         } else {
1326            fn    = &MC_(helperc_value_checkN_fail_no_o);
1327            nm    = "MC_(helperc_value_checkN_fail_no_o)";
1328            args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1329            nargs = 1;
1330         }
1331         break;
1332      default:
1333         VG_(tool_panic)("unexpected szB");
1334   }
1335
1336   tl_assert(fn);
1337   tl_assert(nm);
1338   tl_assert(args);
1339   tl_assert(nargs >= 0 && nargs <= 2);
1340   tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1341              || (MC_(clo_mc_level) == 2 && origin == NULL) );
1342
1343   di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1344                           VG_(fnptr_to_fnentry)( fn ), args );
1345   di->guard = cond; // and cond is PCast-to-1(atom#)
1346
1347   /* If the complaint is to be issued under a guard condition, AND
1348      that into the guard condition for the helper call. */
1349   if (guard) {
1350      IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1351      IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1352      IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1353      di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
1354   }
1355
1356   setHelperAnns( mce, di );
1357   stmt( 'V', mce, IRStmt_Dirty(di));
1358
1359   /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1360      defined -- but only in the case where the guard evaluates to
1361      True at run-time.  Do the update by setting the orig->shadow
1362      mapping for tmp to reflect the fact that this shadow is getting
1363      a new value. */
1364   tl_assert(isIRAtom(vatom));
1365   /* sameKindedAtoms ... */
1366   if (vatom->tag == Iex_RdTmp) {
1367      tl_assert(atom->tag == Iex_RdTmp);
1368      if (guard == NULL) {
1369         // guard is 'always True', hence update unconditionally
1370         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1371         assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1372                          definedOfType(ty));
1373      } else {
1374         // update the temp only conditionally.  Do this by copying
1375         // its old value when the guard is False.
1376         // The old value ..
1377         IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1378         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1379         IRAtom* new_tmpV
1380            = assignNew('V', mce, shadowTypeV(ty),
1381                        IRExpr_ITE(guard, definedOfType(ty),
1382                                          mkexpr(old_tmpV)));
1383         assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1384      }
1385   }
1386}
1387
1388
1389/*------------------------------------------------------------*/
1390/*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
1391/*------------------------------------------------------------*/
1392
1393/* Examine the always-defined sections declared in layout to see if
1394   the (offset,size) section is within one.  Note, is is an error to
1395   partially fall into such a region: (offset,size) should either be
1396   completely in such a region or completely not-in such a region.
1397*/
1398static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1399{
1400   Int minoffD, maxoffD, i;
1401   Int minoff = offset;
1402   Int maxoff = minoff + size - 1;
1403   tl_assert((minoff & ~0xFFFF) == 0);
1404   tl_assert((maxoff & ~0xFFFF) == 0);
1405
1406   for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1407      minoffD = mce->layout->alwaysDefd[i].offset;
1408      maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1409      tl_assert((minoffD & ~0xFFFF) == 0);
1410      tl_assert((maxoffD & ~0xFFFF) == 0);
1411
1412      if (maxoff < minoffD || maxoffD < minoff)
1413         continue; /* no overlap */
1414      if (minoff >= minoffD && maxoff <= maxoffD)
1415         return True; /* completely contained in an always-defd section */
1416
1417      VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1418   }
1419   return False; /* could not find any containing section */
1420}
1421
1422
1423/* Generate into bb suitable actions to shadow this Put.  If the state
1424   slice is marked 'always defined', do nothing.  Otherwise, write the
1425   supplied V bits to the shadow state.  We can pass in either an
1426   original atom or a V-atom, but not both.  In the former case the
1427   relevant V-bits are then generated from the original.
1428   We assume here, that the definedness of GUARD has already been checked.
1429*/
1430static
1431void do_shadow_PUT ( MCEnv* mce,  Int offset,
1432                     IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1433{
1434   IRType ty;
1435
1436   // Don't do shadow PUTs if we're not doing undefined value checking.
1437   // Their absence lets Vex's optimiser remove all the shadow computation
1438   // that they depend on, which includes GETs of the shadow registers.
1439   if (MC_(clo_mc_level) == 1)
1440      return;
1441
1442   if (atom) {
1443      tl_assert(!vatom);
1444      tl_assert(isOriginalAtom(mce, atom));
1445      vatom = expr2vbits( mce, atom );
1446   } else {
1447      tl_assert(vatom);
1448      tl_assert(isShadowAtom(mce, vatom));
1449   }
1450
1451   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1452   tl_assert(ty != Ity_I1);
1453   tl_assert(ty != Ity_I128);
1454   if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1455      /* later: no ... */
1456      /* emit code to emit a complaint if any of the vbits are 1. */
1457      /* complainIfUndefined(mce, atom); */
1458   } else {
1459      /* Do a plain shadow Put. */
1460      if (guard) {
1461         /* If the guard expression evaluates to false we simply Put the value
1462            that is already stored in the guest state slot */
1463         IRAtom *cond, *iffalse;
1464
1465         cond    = assignNew('V', mce, Ity_I1, guard);
1466         iffalse = assignNew('V', mce, ty,
1467                             IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1468         vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1469      }
1470      stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1471   }
1472}
1473
1474
1475/* Return an expression which contains the V bits corresponding to the
1476   given GETI (passed in in pieces).
1477*/
1478static
1479void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1480{
1481   IRAtom* vatom;
1482   IRType  ty, tyS;
1483   Int     arrSize;;
1484   IRRegArray* descr = puti->descr;
1485   IRAtom*     ix    = puti->ix;
1486   Int         bias  = puti->bias;
1487   IRAtom*     atom  = puti->data;
1488
1489   // Don't do shadow PUTIs if we're not doing undefined value checking.
1490   // Their absence lets Vex's optimiser remove all the shadow computation
1491   // that they depend on, which includes GETIs of the shadow registers.
1492   if (MC_(clo_mc_level) == 1)
1493      return;
1494
1495   tl_assert(isOriginalAtom(mce,atom));
1496   vatom = expr2vbits( mce, atom );
1497   tl_assert(sameKindedAtoms(atom, vatom));
1498   ty   = descr->elemTy;
1499   tyS  = shadowTypeV(ty);
1500   arrSize = descr->nElems * sizeofIRType(ty);
1501   tl_assert(ty != Ity_I1);
1502   tl_assert(isOriginalAtom(mce,ix));
1503   complainIfUndefined(mce, ix, NULL);
1504   if (isAlwaysDefd(mce, descr->base, arrSize)) {
1505      /* later: no ... */
1506      /* emit code to emit a complaint if any of the vbits are 1. */
1507      /* complainIfUndefined(mce, atom); */
1508   } else {
1509      /* Do a cloned version of the Put that refers to the shadow
1510         area. */
1511      IRRegArray* new_descr
1512         = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1513                         tyS, descr->nElems);
1514      stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1515   }
1516}
1517
1518
1519/* Return an expression which contains the V bits corresponding to the
1520   given GET (passed in in pieces).
1521*/
1522static
1523IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1524{
1525   IRType tyS = shadowTypeV(ty);
1526   tl_assert(ty != Ity_I1);
1527   tl_assert(ty != Ity_I128);
1528   if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1529      /* Always defined, return all zeroes of the relevant type */
1530      return definedOfType(tyS);
1531   } else {
1532      /* return a cloned version of the Get that refers to the shadow
1533         area. */
1534      /* FIXME: this isn't an atom! */
1535      return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1536   }
1537}
1538
1539
1540/* Return an expression which contains the V bits corresponding to the
1541   given GETI (passed in in pieces).
1542*/
1543static
1544IRExpr* shadow_GETI ( MCEnv* mce,
1545                      IRRegArray* descr, IRAtom* ix, Int bias )
1546{
1547   IRType ty   = descr->elemTy;
1548   IRType tyS  = shadowTypeV(ty);
1549   Int arrSize = descr->nElems * sizeofIRType(ty);
1550   tl_assert(ty != Ity_I1);
1551   tl_assert(isOriginalAtom(mce,ix));
1552   complainIfUndefined(mce, ix, NULL);
1553   if (isAlwaysDefd(mce, descr->base, arrSize)) {
1554      /* Always defined, return all zeroes of the relevant type */
1555      return definedOfType(tyS);
1556   } else {
1557      /* return a cloned version of the Get that refers to the shadow
1558         area. */
1559      IRRegArray* new_descr
1560         = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1561                         tyS, descr->nElems);
1562      return IRExpr_GetI( new_descr, ix, bias );
1563   }
1564}
1565
1566
1567/*------------------------------------------------------------*/
1568/*--- Generating approximations for unknown operations,    ---*/
1569/*--- using lazy-propagate semantics                       ---*/
1570/*------------------------------------------------------------*/
1571
1572/* Lazy propagation of undefinedness from two values, resulting in the
1573   specified shadow type.
1574*/
1575static
1576IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1577{
1578   IRAtom* at;
1579   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1580   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1581   tl_assert(isShadowAtom(mce,va1));
1582   tl_assert(isShadowAtom(mce,va2));
1583
1584   /* The general case is inefficient because PCast is an expensive
1585      operation.  Here are some special cases which use PCast only
1586      once rather than twice. */
1587
1588   /* I64 x I64 -> I64 */
1589   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1590      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1591      at = mkUifU(mce, Ity_I64, va1, va2);
1592      at = mkPCastTo(mce, Ity_I64, at);
1593      return at;
1594   }
1595
1596   /* I64 x I64 -> I32 */
1597   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1598      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1599      at = mkUifU(mce, Ity_I64, va1, va2);
1600      at = mkPCastTo(mce, Ity_I32, at);
1601      return at;
1602   }
1603
1604   if (0) {
1605      VG_(printf)("mkLazy2 ");
1606      ppIRType(t1);
1607      VG_(printf)("_");
1608      ppIRType(t2);
1609      VG_(printf)("_");
1610      ppIRType(finalVty);
1611      VG_(printf)("\n");
1612   }
1613
1614   /* General case: force everything via 32-bit intermediaries. */
1615   at = mkPCastTo(mce, Ity_I32, va1);
1616   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1617   at = mkPCastTo(mce, finalVty, at);
1618   return at;
1619}
1620
1621
1622/* 3-arg version of the above. */
1623static
1624IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
1625                  IRAtom* va1, IRAtom* va2, IRAtom* va3 )
1626{
1627   IRAtom* at;
1628   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1629   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1630   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1631   tl_assert(isShadowAtom(mce,va1));
1632   tl_assert(isShadowAtom(mce,va2));
1633   tl_assert(isShadowAtom(mce,va3));
1634
1635   /* The general case is inefficient because PCast is an expensive
1636      operation.  Here are some special cases which use PCast only
1637      twice rather than three times. */
1638
1639   /* I32 x I64 x I64 -> I64 */
1640   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1641   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1642       && finalVty == Ity_I64) {
1643      if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
1644      /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1645         mode indication which is fully defined, this should get
1646         folded out later. */
1647      at = mkPCastTo(mce, Ity_I64, va1);
1648      /* Now fold in 2nd and 3rd args. */
1649      at = mkUifU(mce, Ity_I64, at, va2);
1650      at = mkUifU(mce, Ity_I64, at, va3);
1651      /* and PCast once again. */
1652      at = mkPCastTo(mce, Ity_I64, at);
1653      return at;
1654   }
1655
1656   /* I32 x I8 x I64 -> I64 */
1657   if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
1658       && finalVty == Ity_I64) {
1659      if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
1660      /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
1661       * rounding mode indication which is fully defined, this should
1662       * get folded out later.
1663      */
1664      IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1665      IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1666      at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1667      at = mkUifU(mce, Ity_I64, at, va3);
1668      /* and PCast once again. */
1669      at = mkPCastTo(mce, Ity_I64, at);
1670      return at;
1671   }
1672
1673   /* I32 x I64 x I64 -> I32 */
1674   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1675       && finalVty == Ity_I32) {
1676      if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
1677      at = mkPCastTo(mce, Ity_I64, va1);
1678      at = mkUifU(mce, Ity_I64, at, va2);
1679      at = mkUifU(mce, Ity_I64, at, va3);
1680      at = mkPCastTo(mce, Ity_I32, at);
1681      return at;
1682   }
1683
1684   /* I32 x I32 x I32 -> I32 */
1685   /* 32-bit FP idiom, as (eg) happens on ARM */
1686   if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
1687       && finalVty == Ity_I32) {
1688      if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
1689      at = va1;
1690      at = mkUifU(mce, Ity_I32, at, va2);
1691      at = mkUifU(mce, Ity_I32, at, va3);
1692      at = mkPCastTo(mce, Ity_I32, at);
1693      return at;
1694   }
1695
1696   /* I32 x I128 x I128 -> I128 */
1697   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1698   if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
1699       && finalVty == Ity_I128) {
1700      if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
1701      /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
1702         mode indication which is fully defined, this should get
1703         folded out later. */
1704      at = mkPCastTo(mce, Ity_I128, va1);
1705      /* Now fold in 2nd and 3rd args. */
1706      at = mkUifU(mce, Ity_I128, at, va2);
1707      at = mkUifU(mce, Ity_I128, at, va3);
1708      /* and PCast once again. */
1709      at = mkPCastTo(mce, Ity_I128, at);
1710      return at;
1711   }
1712
1713   /* I32 x I8 x I128 -> I128 */
1714   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1715   if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
1716       && finalVty == Ity_I128) {
1717      if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
1718      /* Use I64 as an intermediate type, which means PCasting all 3
1719         args to I64 to start with. 1st arg is typically a rounding
1720         mode indication which is fully defined, so we hope that it
1721         will get folded out later. */
1722      IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1723      IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1724      IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
1725      /* Now UifU all three together. */
1726      at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1727      at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
1728      /* and PCast once again. */
1729      at = mkPCastTo(mce, Ity_I128, at);
1730      return at;
1731   }
1732   if (1) {
1733      VG_(printf)("mkLazy3: ");
1734      ppIRType(t1);
1735      VG_(printf)(" x ");
1736      ppIRType(t2);
1737      VG_(printf)(" x ");
1738      ppIRType(t3);
1739      VG_(printf)(" -> ");
1740      ppIRType(finalVty);
1741      VG_(printf)("\n");
1742   }
1743
1744   tl_assert(0);
1745   /* General case: force everything via 32-bit intermediaries. */
1746   /*
1747   at = mkPCastTo(mce, Ity_I32, va1);
1748   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1749   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
1750   at = mkPCastTo(mce, finalVty, at);
1751   return at;
1752   */
1753}
1754
1755
1756/* 4-arg version of the above. */
1757static
1758IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
1759                  IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
1760{
1761   IRAtom* at;
1762   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1763   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1764   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1765   IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
1766   tl_assert(isShadowAtom(mce,va1));
1767   tl_assert(isShadowAtom(mce,va2));
1768   tl_assert(isShadowAtom(mce,va3));
1769   tl_assert(isShadowAtom(mce,va4));
1770
1771   /* The general case is inefficient because PCast is an expensive
1772      operation.  Here are some special cases which use PCast only
1773      twice rather than three times. */
1774
1775   /* I32 x I64 x I64 x I64 -> I64 */
1776   /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1777   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
1778       && finalVty == Ity_I64) {
1779      if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
1780      /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1781         mode indication which is fully defined, this should get
1782         folded out later. */
1783      at = mkPCastTo(mce, Ity_I64, va1);
1784      /* Now fold in 2nd, 3rd, 4th args. */
1785      at = mkUifU(mce, Ity_I64, at, va2);
1786      at = mkUifU(mce, Ity_I64, at, va3);
1787      at = mkUifU(mce, Ity_I64, at, va4);
1788      /* and PCast once again. */
1789      at = mkPCastTo(mce, Ity_I64, at);
1790      return at;
1791   }
1792   /* I32 x I32 x I32 x I32 -> I32 */
1793   /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1794   if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
1795       && finalVty == Ity_I32) {
1796      if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
1797      at = va1;
1798      /* Now fold in 2nd, 3rd, 4th args. */
1799      at = mkUifU(mce, Ity_I32, at, va2);
1800      at = mkUifU(mce, Ity_I32, at, va3);
1801      at = mkUifU(mce, Ity_I32, at, va4);
1802      at = mkPCastTo(mce, Ity_I32, at);
1803      return at;
1804   }
1805
1806   if (1) {
1807      VG_(printf)("mkLazy4: ");
1808      ppIRType(t1);
1809      VG_(printf)(" x ");
1810      ppIRType(t2);
1811      VG_(printf)(" x ");
1812      ppIRType(t3);
1813      VG_(printf)(" x ");
1814      ppIRType(t4);
1815      VG_(printf)(" -> ");
1816      ppIRType(finalVty);
1817      VG_(printf)("\n");
1818   }
1819
1820   tl_assert(0);
1821}
1822
1823
1824/* Do the lazy propagation game from a null-terminated vector of
1825   atoms.  This is presumably the arguments to a helper call, so the
1826   IRCallee info is also supplied in order that we can know which
1827   arguments should be ignored (via the .mcx_mask field).
1828*/
1829static
1830IRAtom* mkLazyN ( MCEnv* mce,
1831                  IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
1832{
1833   Int     i;
1834   IRAtom* here;
1835   IRAtom* curr;
1836   IRType  mergeTy;
1837   Bool    mergeTy64 = True;
1838
1839   /* Decide on the type of the merge intermediary.  If all relevant
1840      args are I64, then it's I64.  In all other circumstances, use
1841      I32. */
1842   for (i = 0; exprvec[i]; i++) {
1843      tl_assert(i < 32);
1844      tl_assert(isOriginalAtom(mce, exprvec[i]));
1845      if (cee->mcx_mask & (1<<i))
1846         continue;
1847      if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
1848         mergeTy64 = False;
1849   }
1850
1851   mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
1852   curr    = definedOfType(mergeTy);
1853
1854   for (i = 0; exprvec[i]; i++) {
1855      tl_assert(i < 32);
1856      tl_assert(isOriginalAtom(mce, exprvec[i]));
1857      /* Only take notice of this arg if the callee's mc-exclusion
1858         mask does not say it is to be excluded. */
1859      if (cee->mcx_mask & (1<<i)) {
1860         /* the arg is to be excluded from definedness checking.  Do
1861            nothing. */
1862         if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
1863      } else {
1864         /* calculate the arg's definedness, and pessimistically merge
1865            it in. */
1866         here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
1867         curr = mergeTy64
1868                   ? mkUifU64(mce, here, curr)
1869                   : mkUifU32(mce, here, curr);
1870      }
1871   }
1872   return mkPCastTo(mce, finalVtype, curr );
1873}
1874
1875
1876/*------------------------------------------------------------*/
1877/*--- Generating expensive sequences for exact carry-chain ---*/
1878/*--- propagation in add/sub and related operations.       ---*/
1879/*------------------------------------------------------------*/
1880
1881static
1882IRAtom* expensiveAddSub ( MCEnv*  mce,
1883                          Bool    add,
1884                          IRType  ty,
1885                          IRAtom* qaa, IRAtom* qbb,
1886                          IRAtom* aa,  IRAtom* bb )
1887{
1888   IRAtom *a_min, *b_min, *a_max, *b_max;
1889   IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
1890
1891   tl_assert(isShadowAtom(mce,qaa));
1892   tl_assert(isShadowAtom(mce,qbb));
1893   tl_assert(isOriginalAtom(mce,aa));
1894   tl_assert(isOriginalAtom(mce,bb));
1895   tl_assert(sameKindedAtoms(qaa,aa));
1896   tl_assert(sameKindedAtoms(qbb,bb));
1897
1898   switch (ty) {
1899      case Ity_I32:
1900         opAND = Iop_And32;
1901         opOR  = Iop_Or32;
1902         opXOR = Iop_Xor32;
1903         opNOT = Iop_Not32;
1904         opADD = Iop_Add32;
1905         opSUB = Iop_Sub32;
1906         break;
1907      case Ity_I64:
1908         opAND = Iop_And64;
1909         opOR  = Iop_Or64;
1910         opXOR = Iop_Xor64;
1911         opNOT = Iop_Not64;
1912         opADD = Iop_Add64;
1913         opSUB = Iop_Sub64;
1914         break;
1915      default:
1916         VG_(tool_panic)("expensiveAddSub");
1917   }
1918
1919   // a_min = aa & ~qaa
1920   a_min = assignNew('V', mce,ty,
1921                     binop(opAND, aa,
1922                                  assignNew('V', mce,ty, unop(opNOT, qaa))));
1923
1924   // b_min = bb & ~qbb
1925   b_min = assignNew('V', mce,ty,
1926                     binop(opAND, bb,
1927                                  assignNew('V', mce,ty, unop(opNOT, qbb))));
1928
1929   // a_max = aa | qaa
1930   a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
1931
1932   // b_max = bb | qbb
1933   b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
1934
1935   if (add) {
1936      // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
1937      return
1938      assignNew('V', mce,ty,
1939         binop( opOR,
1940                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
1941                assignNew('V', mce,ty,
1942                   binop( opXOR,
1943                          assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
1944                          assignNew('V', mce,ty, binop(opADD, a_max, b_max))
1945                   )
1946                )
1947         )
1948      );
1949   } else {
1950      // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max + b_min))
1951      return
1952      assignNew('V', mce,ty,
1953         binop( opOR,
1954                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
1955                assignNew('V', mce,ty,
1956                   binop( opXOR,
1957                          assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
1958                          assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
1959                   )
1960                )
1961         )
1962      );
1963   }
1964
1965}
1966
1967
1968static
1969IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
1970                                       IRAtom* atom, IRAtom* vatom )
1971{
1972   IRType ty;
1973   IROp xorOp, subOp, andOp;
1974   IRExpr *one;
1975   IRAtom *improver, *improved;
1976   tl_assert(isShadowAtom(mce,vatom));
1977   tl_assert(isOriginalAtom(mce,atom));
1978   tl_assert(sameKindedAtoms(atom,vatom));
1979
1980   switch (czop) {
1981      case Iop_Ctz32:
1982         ty = Ity_I32;
1983         xorOp = Iop_Xor32;
1984         subOp = Iop_Sub32;
1985         andOp = Iop_And32;
1986         one = mkU32(1);
1987         break;
1988      case Iop_Ctz64:
1989         ty = Ity_I64;
1990         xorOp = Iop_Xor64;
1991         subOp = Iop_Sub64;
1992         andOp = Iop_And64;
1993         one = mkU64(1);
1994         break;
1995      default:
1996         ppIROp(czop);
1997         VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
1998   }
1999
2000   // improver = atom ^ (atom - 1)
2001   //
2002   // That is, improver has its low ctz(atom) bits equal to one;
2003   // higher bits (if any) equal to zero.
2004   improver = assignNew('V', mce,ty,
2005                        binop(xorOp,
2006                              atom,
2007                              assignNew('V', mce, ty,
2008                                        binop(subOp, atom, one))));
2009
2010   // improved = vatom & improver
2011   //
2012   // That is, treat any V bits above the first ctz(atom) bits as
2013   // "defined".
2014   improved = assignNew('V', mce, ty,
2015                        binop(andOp, vatom, improver));
2016
2017   // Return pessimizing cast of improved.
2018   return mkPCastTo(mce, ty, improved);
2019}
2020
2021
2022/*------------------------------------------------------------*/
2023/*--- Scalar shifts.                                       ---*/
2024/*------------------------------------------------------------*/
2025
2026/* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
2027   idea is to shift the definedness bits by the original shift amount.
2028   This introduces 0s ("defined") in new positions for left shifts and
2029   unsigned right shifts, and copies the top definedness bit for
2030   signed right shifts.  So, conveniently, applying the original shift
2031   operator to the definedness bits for the left arg is exactly the
2032   right thing to do:
2033
2034      (qaa << bb)
2035
2036   However if the shift amount is undefined then the whole result
2037   is undefined.  Hence need:
2038
2039      (qaa << bb) `UifU` PCast(qbb)
2040
2041   If the shift amount bb is a literal than qbb will say 'all defined'
2042   and the UifU and PCast will get folded out by post-instrumentation
2043   optimisation.
2044*/
2045static IRAtom* scalarShift ( MCEnv*  mce,
2046                             IRType  ty,
2047                             IROp    original_op,
2048                             IRAtom* qaa, IRAtom* qbb,
2049                             IRAtom* aa,  IRAtom* bb )
2050{
2051   tl_assert(isShadowAtom(mce,qaa));
2052   tl_assert(isShadowAtom(mce,qbb));
2053   tl_assert(isOriginalAtom(mce,aa));
2054   tl_assert(isOriginalAtom(mce,bb));
2055   tl_assert(sameKindedAtoms(qaa,aa));
2056   tl_assert(sameKindedAtoms(qbb,bb));
2057   return
2058      assignNew(
2059         'V', mce, ty,
2060         mkUifU( mce, ty,
2061                 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2062                 mkPCastTo(mce, ty, qbb)
2063         )
2064   );
2065}
2066
2067
2068/*------------------------------------------------------------*/
2069/*--- Helpers for dealing with vector primops.             ---*/
2070/*------------------------------------------------------------*/
2071
2072/* Vector pessimisation -- pessimise within each lane individually. */
2073
2074static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2075{
2076   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2077}
2078
2079static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2080{
2081   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2082}
2083
2084static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2085{
2086   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2087}
2088
2089static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2090{
2091   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2092}
2093
2094static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2095{
2096   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2097}
2098
2099static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2100{
2101   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2102}
2103
2104static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2105{
2106   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2107}
2108
2109static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2110{
2111   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2112}
2113
2114static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2115{
2116   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2117}
2118
2119static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2120{
2121   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2122}
2123
2124static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2125{
2126   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2127}
2128
2129static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2130{
2131   return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2132}
2133
2134static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2135{
2136   return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2137}
2138
2139
2140/* Here's a simple scheme capable of handling ops derived from SSE1
2141   code and while only generating ops that can be efficiently
2142   implemented in SSE1. */
2143
2144/* All-lanes versions are straightforward:
2145
2146   binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
2147
2148   unary32Fx4(x,y)    ==> PCast32x4(x#)
2149
2150   Lowest-lane-only versions are more complex:
2151
2152   binary32F0x4(x,y)  ==> SetV128lo32(
2153                             x#,
2154                             PCast32(V128to32(UifUV128(x#,y#)))
2155                          )
2156
2157   This is perhaps not so obvious.  In particular, it's faster to
2158   do a V128-bit UifU and then take the bottom 32 bits than the more
2159   obvious scheme of taking the bottom 32 bits of each operand
2160   and doing a 32-bit UifU.  Basically since UifU is fast and
2161   chopping lanes off vector values is slow.
2162
2163   Finally:
2164
2165   unary32F0x4(x)     ==> SetV128lo32(
2166                             x#,
2167                             PCast32(V128to32(x#))
2168                          )
2169
2170   Where:
2171
2172   PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
2173   PCast32x4(v#) = CmpNEZ32x4(v#)
2174*/
2175
2176static
2177IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2178{
2179   IRAtom* at;
2180   tl_assert(isShadowAtom(mce, vatomX));
2181   tl_assert(isShadowAtom(mce, vatomY));
2182   at = mkUifUV128(mce, vatomX, vatomY);
2183   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2184   return at;
2185}
2186
2187static
2188IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2189{
2190   IRAtom* at;
2191   tl_assert(isShadowAtom(mce, vatomX));
2192   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2193   return at;
2194}
2195
2196static
2197IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2198{
2199   IRAtom* at;
2200   tl_assert(isShadowAtom(mce, vatomX));
2201   tl_assert(isShadowAtom(mce, vatomY));
2202   at = mkUifUV128(mce, vatomX, vatomY);
2203   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2204   at = mkPCastTo(mce, Ity_I32, at);
2205   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2206   return at;
2207}
2208
2209static
2210IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2211{
2212   IRAtom* at;
2213   tl_assert(isShadowAtom(mce, vatomX));
2214   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2215   at = mkPCastTo(mce, Ity_I32, at);
2216   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2217   return at;
2218}
2219
2220/* --- ... and ... 64Fx2 versions of the same ... --- */
2221
2222static
2223IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2224{
2225   IRAtom* at;
2226   tl_assert(isShadowAtom(mce, vatomX));
2227   tl_assert(isShadowAtom(mce, vatomY));
2228   at = mkUifUV128(mce, vatomX, vatomY);
2229   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2230   return at;
2231}
2232
2233static
2234IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2235{
2236   IRAtom* at;
2237   tl_assert(isShadowAtom(mce, vatomX));
2238   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2239   return at;
2240}
2241
2242static
2243IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2244{
2245   IRAtom* at;
2246   tl_assert(isShadowAtom(mce, vatomX));
2247   tl_assert(isShadowAtom(mce, vatomY));
2248   at = mkUifUV128(mce, vatomX, vatomY);
2249   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2250   at = mkPCastTo(mce, Ity_I64, at);
2251   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2252   return at;
2253}
2254
2255static
2256IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2257{
2258   IRAtom* at;
2259   tl_assert(isShadowAtom(mce, vatomX));
2260   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2261   at = mkPCastTo(mce, Ity_I64, at);
2262   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2263   return at;
2264}
2265
2266/* --- --- ... and ... 32Fx2 versions of the same --- --- */
2267
2268static
2269IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2270{
2271   IRAtom* at;
2272   tl_assert(isShadowAtom(mce, vatomX));
2273   tl_assert(isShadowAtom(mce, vatomY));
2274   at = mkUifU64(mce, vatomX, vatomY);
2275   at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2276   return at;
2277}
2278
2279static
2280IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2281{
2282   IRAtom* at;
2283   tl_assert(isShadowAtom(mce, vatomX));
2284   at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2285   return at;
2286}
2287
2288/* --- ... and ... 64Fx4 versions of the same ... --- */
2289
2290static
2291IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2292{
2293   IRAtom* at;
2294   tl_assert(isShadowAtom(mce, vatomX));
2295   tl_assert(isShadowAtom(mce, vatomY));
2296   at = mkUifUV256(mce, vatomX, vatomY);
2297   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2298   return at;
2299}
2300
2301static
2302IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2303{
2304   IRAtom* at;
2305   tl_assert(isShadowAtom(mce, vatomX));
2306   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2307   return at;
2308}
2309
2310/* --- ... and ... 32Fx8 versions of the same ... --- */
2311
2312static
2313IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2314{
2315   IRAtom* at;
2316   tl_assert(isShadowAtom(mce, vatomX));
2317   tl_assert(isShadowAtom(mce, vatomY));
2318   at = mkUifUV256(mce, vatomX, vatomY);
2319   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2320   return at;
2321}
2322
2323static
2324IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2325{
2326   IRAtom* at;
2327   tl_assert(isShadowAtom(mce, vatomX));
2328   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2329   return at;
2330}
2331
2332/* --- 64Fx2 binary FP ops, with rounding mode --- */
2333
2334static
2335IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2336                                       IRAtom* vatomX, IRAtom* vatomY )
2337{
2338   /* This is the same as binary64Fx2, except that we subsequently
2339      pessimise vRM (definedness of the rounding mode), widen to 128
2340      bits and UifU it into the result.  As with the scalar cases, if
2341      the RM is a constant then it is defined and so this extra bit
2342      will get constant-folded out later. */
2343   // "do" the vector args
2344   IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2345   // PCast the RM, and widen it to 128 bits
2346   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2347   // Roll it into the result
2348   t1 = mkUifUV128(mce, t1, t2);
2349   return t1;
2350}
2351
2352/* --- ... and ... 32Fx4 versions of the same --- */
2353
2354static
2355IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2356                                       IRAtom* vatomX, IRAtom* vatomY )
2357{
2358   IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2359   // PCast the RM, and widen it to 128 bits
2360   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2361   // Roll it into the result
2362   t1 = mkUifUV128(mce, t1, t2);
2363   return t1;
2364}
2365
2366/* --- ... and ... 64Fx4 versions of the same --- */
2367
2368static
2369IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2370                                       IRAtom* vatomX, IRAtom* vatomY )
2371{
2372   IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2373   // PCast the RM, and widen it to 256 bits
2374   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2375   // Roll it into the result
2376   t1 = mkUifUV256(mce, t1, t2);
2377   return t1;
2378}
2379
2380/* --- ... and ... 32Fx8 versions of the same --- */
2381
2382static
2383IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2384                                       IRAtom* vatomX, IRAtom* vatomY )
2385{
2386   IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2387   // PCast the RM, and widen it to 256 bits
2388   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2389   // Roll it into the result
2390   t1 = mkUifUV256(mce, t1, t2);
2391   return t1;
2392}
2393
2394/* --- 64Fx2 unary FP ops, with rounding mode --- */
2395
2396static
2397IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2398{
2399   /* Same scheme as binary64Fx2_w_rm. */
2400   // "do" the vector arg
2401   IRAtom* t1 = unary64Fx2(mce, vatomX);
2402   // PCast the RM, and widen it to 128 bits
2403   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2404   // Roll it into the result
2405   t1 = mkUifUV128(mce, t1, t2);
2406   return t1;
2407}
2408
2409/* --- ... and ... 32Fx4 versions of the same --- */
2410
2411static
2412IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2413{
2414   /* Same scheme as unary32Fx4_w_rm. */
2415   IRAtom* t1 = unary32Fx4(mce, vatomX);
2416   // PCast the RM, and widen it to 128 bits
2417   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2418   // Roll it into the result
2419   t1 = mkUifUV128(mce, t1, t2);
2420   return t1;
2421}
2422
2423
2424/* --- --- Vector saturated narrowing --- --- */
2425
2426/* We used to do something very clever here, but on closer inspection
2427   (2011-Jun-15), and in particular bug #279698, it turns out to be
2428   wrong.  Part of the problem came from the fact that for a long
2429   time, the IR primops to do with saturated narrowing were
2430   underspecified and managed to confuse multiple cases which needed
2431   to be separate: the op names had a signedness qualifier, but in
2432   fact the source and destination signednesses needed to be specified
2433   independently, so the op names really need two independent
2434   signedness specifiers.
2435
2436   As of 2011-Jun-15 (ish) the underspecification was sorted out
2437   properly.  The incorrect instrumentation remained, though.  That
2438   has now (2011-Oct-22) been fixed.
2439
2440   What we now do is simple:
2441
2442   Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2443   number of lanes, X is the source lane width and signedness, and Y
2444   is the destination lane width and signedness.  In all cases the
2445   destination lane width is half the source lane width, so the names
2446   have a bit of redundancy, but are at least easy to read.
2447
2448   For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2449   to unsigned 16s.
2450
2451   Let Vanilla(OP) be a function that takes OP, one of these
2452   saturating narrowing ops, and produces the same "shaped" narrowing
2453   op which is not saturating, but merely dumps the most significant
2454   bits.  "same shape" means that the lane numbers and widths are the
2455   same as with OP.
2456
2457   For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2458                  = Iop_NarrowBin32to16x8,
2459   that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2460   dumping the top half of each lane.
2461
2462   So, with that in place, the scheme is simple, and it is simple to
2463   pessimise each lane individually and then apply Vanilla(OP) so as
2464   to get the result in the right "shape".  If the original OP is
2465   QNarrowBinXtoYxZ then we produce
2466
2467   Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2468
2469   or for the case when OP is unary (Iop_QNarrowUn*)
2470
2471   Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2472*/
2473static
2474IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
2475{
2476   switch (qnarrowOp) {
2477      /* Binary: (128, 128) -> 128 */
2478      case Iop_QNarrowBin16Sto8Ux16:
2479      case Iop_QNarrowBin16Sto8Sx16:
2480      case Iop_QNarrowBin16Uto8Ux16:
2481      case Iop_QNarrowBin64Sto32Sx4:
2482      case Iop_QNarrowBin64Uto32Ux4:
2483         return Iop_NarrowBin16to8x16;
2484      case Iop_QNarrowBin32Sto16Ux8:
2485      case Iop_QNarrowBin32Sto16Sx8:
2486      case Iop_QNarrowBin32Uto16Ux8:
2487         return Iop_NarrowBin32to16x8;
2488      /* Binary: (64, 64) -> 64 */
2489      case Iop_QNarrowBin32Sto16Sx4:
2490         return Iop_NarrowBin32to16x4;
2491      case Iop_QNarrowBin16Sto8Ux8:
2492      case Iop_QNarrowBin16Sto8Sx8:
2493         return Iop_NarrowBin16to8x8;
2494      /* Unary: 128 -> 64 */
2495      case Iop_QNarrowUn64Uto32Ux2:
2496      case Iop_QNarrowUn64Sto32Sx2:
2497      case Iop_QNarrowUn64Sto32Ux2:
2498         return Iop_NarrowUn64to32x2;
2499      case Iop_QNarrowUn32Uto16Ux4:
2500      case Iop_QNarrowUn32Sto16Sx4:
2501      case Iop_QNarrowUn32Sto16Ux4:
2502         return Iop_NarrowUn32to16x4;
2503      case Iop_QNarrowUn16Uto8Ux8:
2504      case Iop_QNarrowUn16Sto8Sx8:
2505      case Iop_QNarrowUn16Sto8Ux8:
2506         return Iop_NarrowUn16to8x8;
2507      default:
2508         ppIROp(qnarrowOp);
2509         VG_(tool_panic)("vanillaNarrowOpOfShape");
2510   }
2511}
2512
2513static
2514IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
2515                              IRAtom* vatom1, IRAtom* vatom2)
2516{
2517   IRAtom *at1, *at2, *at3;
2518   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2519   switch (narrow_op) {
2520      case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
2521      case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
2522      case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
2523      case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
2524      case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
2525      case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
2526      case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
2527      case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
2528      default: VG_(tool_panic)("vectorNarrowBinV128");
2529   }
2530   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2531   tl_assert(isShadowAtom(mce,vatom1));
2532   tl_assert(isShadowAtom(mce,vatom2));
2533   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2534   at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
2535   at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
2536   return at3;
2537}
2538
2539static
2540IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
2541                            IRAtom* vatom1, IRAtom* vatom2)
2542{
2543   IRAtom *at1, *at2, *at3;
2544   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2545   switch (narrow_op) {
2546      case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
2547      case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
2548      case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
2549      default: VG_(tool_panic)("vectorNarrowBin64");
2550   }
2551   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2552   tl_assert(isShadowAtom(mce,vatom1));
2553   tl_assert(isShadowAtom(mce,vatom2));
2554   at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
2555   at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
2556   at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
2557   return at3;
2558}
2559
2560static
2561IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
2562                             IRAtom* vatom1)
2563{
2564   IRAtom *at1, *at2;
2565   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2566   tl_assert(isShadowAtom(mce,vatom1));
2567   /* For vanilla narrowing (non-saturating), we can just apply
2568      the op directly to the V bits. */
2569   switch (narrow_op) {
2570      case Iop_NarrowUn16to8x8:
2571      case Iop_NarrowUn32to16x4:
2572      case Iop_NarrowUn64to32x2:
2573         at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
2574         return at1;
2575      default:
2576         break; /* Do Plan B */
2577   }
2578   /* Plan B: for ops that involve a saturation operation on the args,
2579      we must PCast before the vanilla narrow. */
2580   switch (narrow_op) {
2581      case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
2582      case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
2583      case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
2584      case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
2585      case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
2586      case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
2587      case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
2588      case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
2589      case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
2590      default: VG_(tool_panic)("vectorNarrowUnV128");
2591   }
2592   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2593   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2594   at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
2595   return at2;
2596}
2597
2598static
2599IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
2600                         IRAtom* vatom1)
2601{
2602   IRAtom *at1, *at2;
2603   IRAtom* (*pcast)( MCEnv*, IRAtom* );
2604   switch (longen_op) {
2605      case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
2606      case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
2607      case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
2608      case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
2609      case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
2610      case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
2611      default: VG_(tool_panic)("vectorWidenI64");
2612   }
2613   tl_assert(isShadowAtom(mce,vatom1));
2614   at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
2615   at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
2616   return at2;
2617}
2618
2619
2620/* --- --- Vector integer arithmetic --- --- */
2621
2622/* Simple ... UifU the args and per-lane pessimise the results. */
2623
2624/* --- V256-bit versions --- */
2625
2626static
2627IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2628{
2629   IRAtom* at;
2630   at = mkUifUV256(mce, vatom1, vatom2);
2631   at = mkPCast8x32(mce, at);
2632   return at;
2633}
2634
2635static
2636IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2637{
2638   IRAtom* at;
2639   at = mkUifUV256(mce, vatom1, vatom2);
2640   at = mkPCast16x16(mce, at);
2641   return at;
2642}
2643
2644static
2645IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2646{
2647   IRAtom* at;
2648   at = mkUifUV256(mce, vatom1, vatom2);
2649   at = mkPCast32x8(mce, at);
2650   return at;
2651}
2652
2653static
2654IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2655{
2656   IRAtom* at;
2657   at = mkUifUV256(mce, vatom1, vatom2);
2658   at = mkPCast64x4(mce, at);
2659   return at;
2660}
2661
2662/* --- V128-bit versions --- */
2663
2664static
2665IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2666{
2667   IRAtom* at;
2668   at = mkUifUV128(mce, vatom1, vatom2);
2669   at = mkPCast8x16(mce, at);
2670   return at;
2671}
2672
2673static
2674IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2675{
2676   IRAtom* at;
2677   at = mkUifUV128(mce, vatom1, vatom2);
2678   at = mkPCast16x8(mce, at);
2679   return at;
2680}
2681
2682static
2683IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2684{
2685   IRAtom* at;
2686   at = mkUifUV128(mce, vatom1, vatom2);
2687   at = mkPCast32x4(mce, at);
2688   return at;
2689}
2690
2691static
2692IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2693{
2694   IRAtom* at;
2695   at = mkUifUV128(mce, vatom1, vatom2);
2696   at = mkPCast64x2(mce, at);
2697   return at;
2698}
2699
2700/* --- 64-bit versions --- */
2701
2702static
2703IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2704{
2705   IRAtom* at;
2706   at = mkUifU64(mce, vatom1, vatom2);
2707   at = mkPCast8x8(mce, at);
2708   return at;
2709}
2710
2711static
2712IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2713{
2714   IRAtom* at;
2715   at = mkUifU64(mce, vatom1, vatom2);
2716   at = mkPCast16x4(mce, at);
2717   return at;
2718}
2719
2720static
2721IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2722{
2723   IRAtom* at;
2724   at = mkUifU64(mce, vatom1, vatom2);
2725   at = mkPCast32x2(mce, at);
2726   return at;
2727}
2728
2729static
2730IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2731{
2732   IRAtom* at;
2733   at = mkUifU64(mce, vatom1, vatom2);
2734   at = mkPCastTo(mce, Ity_I64, at);
2735   return at;
2736}
2737
2738/* --- 32-bit versions --- */
2739
2740static
2741IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2742{
2743   IRAtom* at;
2744   at = mkUifU32(mce, vatom1, vatom2);
2745   at = mkPCast8x4(mce, at);
2746   return at;
2747}
2748
2749static
2750IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2751{
2752   IRAtom* at;
2753   at = mkUifU32(mce, vatom1, vatom2);
2754   at = mkPCast16x2(mce, at);
2755   return at;
2756}
2757
2758
2759/*------------------------------------------------------------*/
2760/*--- Generate shadow values from all kinds of IRExprs.    ---*/
2761/*------------------------------------------------------------*/
2762
2763static
2764IRAtom* expr2vbits_Qop ( MCEnv* mce,
2765                         IROp op,
2766                         IRAtom* atom1, IRAtom* atom2,
2767                         IRAtom* atom3, IRAtom* atom4 )
2768{
2769   IRAtom* vatom1 = expr2vbits( mce, atom1 );
2770   IRAtom* vatom2 = expr2vbits( mce, atom2 );
2771   IRAtom* vatom3 = expr2vbits( mce, atom3 );
2772   IRAtom* vatom4 = expr2vbits( mce, atom4 );
2773
2774   tl_assert(isOriginalAtom(mce,atom1));
2775   tl_assert(isOriginalAtom(mce,atom2));
2776   tl_assert(isOriginalAtom(mce,atom3));
2777   tl_assert(isOriginalAtom(mce,atom4));
2778   tl_assert(isShadowAtom(mce,vatom1));
2779   tl_assert(isShadowAtom(mce,vatom2));
2780   tl_assert(isShadowAtom(mce,vatom3));
2781   tl_assert(isShadowAtom(mce,vatom4));
2782   tl_assert(sameKindedAtoms(atom1,vatom1));
2783   tl_assert(sameKindedAtoms(atom2,vatom2));
2784   tl_assert(sameKindedAtoms(atom3,vatom3));
2785   tl_assert(sameKindedAtoms(atom4,vatom4));
2786   switch (op) {
2787      case Iop_MAddF64:
2788      case Iop_MAddF64r32:
2789      case Iop_MSubF64:
2790      case Iop_MSubF64r32:
2791         /* I32(rm) x F64 x F64 x F64 -> F64 */
2792         return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
2793
2794      case Iop_MAddF32:
2795      case Iop_MSubF32:
2796         /* I32(rm) x F32 x F32 x F32 -> F32 */
2797         return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
2798
2799      /* V256-bit data-steering */
2800      case Iop_64x4toV256:
2801         return assignNew('V', mce, Ity_V256,
2802                          IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
2803
2804      default:
2805         ppIROp(op);
2806         VG_(tool_panic)("memcheck:expr2vbits_Qop");
2807   }
2808}
2809
2810
2811static
2812IRAtom* expr2vbits_Triop ( MCEnv* mce,
2813                           IROp op,
2814                           IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
2815{
2816   IRAtom* vatom1 = expr2vbits( mce, atom1 );
2817   IRAtom* vatom2 = expr2vbits( mce, atom2 );
2818   IRAtom* vatom3 = expr2vbits( mce, atom3 );
2819
2820   tl_assert(isOriginalAtom(mce,atom1));
2821   tl_assert(isOriginalAtom(mce,atom2));
2822   tl_assert(isOriginalAtom(mce,atom3));
2823   tl_assert(isShadowAtom(mce,vatom1));
2824   tl_assert(isShadowAtom(mce,vatom2));
2825   tl_assert(isShadowAtom(mce,vatom3));
2826   tl_assert(sameKindedAtoms(atom1,vatom1));
2827   tl_assert(sameKindedAtoms(atom2,vatom2));
2828   tl_assert(sameKindedAtoms(atom3,vatom3));
2829   switch (op) {
2830      case Iop_AddF128:
2831      case Iop_AddD128:
2832      case Iop_SubF128:
2833      case Iop_SubD128:
2834      case Iop_MulF128:
2835      case Iop_MulD128:
2836      case Iop_DivF128:
2837      case Iop_DivD128:
2838      case Iop_QuantizeD128:
2839         /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
2840         return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
2841      case Iop_AddF64:
2842      case Iop_AddD64:
2843      case Iop_AddF64r32:
2844      case Iop_SubF64:
2845      case Iop_SubD64:
2846      case Iop_SubF64r32:
2847      case Iop_MulF64:
2848      case Iop_MulD64:
2849      case Iop_MulF64r32:
2850      case Iop_DivF64:
2851      case Iop_DivD64:
2852      case Iop_DivF64r32:
2853      case Iop_ScaleF64:
2854      case Iop_Yl2xF64:
2855      case Iop_Yl2xp1F64:
2856      case Iop_AtanF64:
2857      case Iop_PRemF64:
2858      case Iop_PRem1F64:
2859      case Iop_QuantizeD64:
2860         /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
2861         return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
2862      case Iop_PRemC3210F64:
2863      case Iop_PRem1C3210F64:
2864         /* I32(rm) x F64 x F64 -> I32 */
2865         return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
2866      case Iop_AddF32:
2867      case Iop_SubF32:
2868      case Iop_MulF32:
2869      case Iop_DivF32:
2870         /* I32(rm) x F32 x F32 -> I32 */
2871         return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
2872      case Iop_SignificanceRoundD64:
2873         /* IRRoundingMode(I32) x I8 x D64 -> D64 */
2874         return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
2875      case Iop_SignificanceRoundD128:
2876         /* IRRoundingMode(I32) x I8 x D128 -> D128 */
2877         return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
2878      case Iop_SliceV128:
2879         /* (V128, V128, I8) -> V128 */
2880         complainIfUndefined(mce, atom3, NULL);
2881         return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
2882      case Iop_Slice64:
2883         /* (I64, I64, I8) -> I64 */
2884         complainIfUndefined(mce, atom3, NULL);
2885         return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
2886      case Iop_SetElem8x8:
2887      case Iop_SetElem16x4:
2888      case Iop_SetElem32x2:
2889         complainIfUndefined(mce, atom2, NULL);
2890         return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
2891      /* BCDIops */
2892      case Iop_BCDAdd:
2893      case Iop_BCDSub:
2894         complainIfUndefined(mce, atom3, NULL);
2895         return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
2896
2897      /* Vector FP with rounding mode as the first arg */
2898      case Iop_Add64Fx2:
2899      case Iop_Sub64Fx2:
2900      case Iop_Mul64Fx2:
2901      case Iop_Div64Fx2:
2902         return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
2903
2904      case Iop_Add32Fx4:
2905      case Iop_Sub32Fx4:
2906      case Iop_Mul32Fx4:
2907      case Iop_Div32Fx4:
2908        return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
2909
2910      case Iop_Add64Fx4:
2911      case Iop_Sub64Fx4:
2912      case Iop_Mul64Fx4:
2913      case Iop_Div64Fx4:
2914         return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
2915
2916      case Iop_Add32Fx8:
2917      case Iop_Sub32Fx8:
2918      case Iop_Mul32Fx8:
2919      case Iop_Div32Fx8:
2920         return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
2921
2922      default:
2923         ppIROp(op);
2924         VG_(tool_panic)("memcheck:expr2vbits_Triop");
2925   }
2926}
2927
2928
2929static
2930IRAtom* expr2vbits_Binop ( MCEnv* mce,
2931                           IROp op,
2932                           IRAtom* atom1, IRAtom* atom2 )
2933{
2934   IRType  and_or_ty;
2935   IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
2936   IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
2937   IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
2938
2939   IRAtom* vatom1 = expr2vbits( mce, atom1 );
2940   IRAtom* vatom2 = expr2vbits( mce, atom2 );
2941
2942   tl_assert(isOriginalAtom(mce,atom1));
2943   tl_assert(isOriginalAtom(mce,atom2));
2944   tl_assert(isShadowAtom(mce,vatom1));
2945   tl_assert(isShadowAtom(mce,vatom2));
2946   tl_assert(sameKindedAtoms(atom1,vatom1));
2947   tl_assert(sameKindedAtoms(atom2,vatom2));
2948   switch (op) {
2949
2950      /* 32-bit SIMD */
2951
2952      case Iop_Add16x2:
2953      case Iop_HAdd16Ux2:
2954      case Iop_HAdd16Sx2:
2955      case Iop_Sub16x2:
2956      case Iop_HSub16Ux2:
2957      case Iop_HSub16Sx2:
2958      case Iop_QAdd16Sx2:
2959      case Iop_QSub16Sx2:
2960      case Iop_QSub16Ux2:
2961      case Iop_QAdd16Ux2:
2962         return binary16Ix2(mce, vatom1, vatom2);
2963
2964      case Iop_Add8x4:
2965      case Iop_HAdd8Ux4:
2966      case Iop_HAdd8Sx4:
2967      case Iop_Sub8x4:
2968      case Iop_HSub8Ux4:
2969      case Iop_HSub8Sx4:
2970      case Iop_QSub8Ux4:
2971      case Iop_QAdd8Ux4:
2972      case Iop_QSub8Sx4:
2973      case Iop_QAdd8Sx4:
2974         return binary8Ix4(mce, vatom1, vatom2);
2975
2976      /* 64-bit SIMD */
2977
2978      case Iop_ShrN8x8:
2979      case Iop_ShrN16x4:
2980      case Iop_ShrN32x2:
2981      case Iop_SarN8x8:
2982      case Iop_SarN16x4:
2983      case Iop_SarN32x2:
2984      case Iop_ShlN16x4:
2985      case Iop_ShlN32x2:
2986      case Iop_ShlN8x8:
2987         /* Same scheme as with all other shifts. */
2988         complainIfUndefined(mce, atom2, NULL);
2989         return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
2990
2991      case Iop_QNarrowBin32Sto16Sx4:
2992      case Iop_QNarrowBin16Sto8Sx8:
2993      case Iop_QNarrowBin16Sto8Ux8:
2994         return vectorNarrowBin64(mce, op, vatom1, vatom2);
2995
2996      case Iop_Min8Ux8:
2997      case Iop_Min8Sx8:
2998      case Iop_Max8Ux8:
2999      case Iop_Max8Sx8:
3000      case Iop_Avg8Ux8:
3001      case Iop_QSub8Sx8:
3002      case Iop_QSub8Ux8:
3003      case Iop_Sub8x8:
3004      case Iop_CmpGT8Sx8:
3005      case Iop_CmpGT8Ux8:
3006      case Iop_CmpEQ8x8:
3007      case Iop_QAdd8Sx8:
3008      case Iop_QAdd8Ux8:
3009      case Iop_QSal8x8:
3010      case Iop_QShl8x8:
3011      case Iop_Add8x8:
3012      case Iop_Mul8x8:
3013      case Iop_PolynomialMul8x8:
3014         return binary8Ix8(mce, vatom1, vatom2);
3015
3016      case Iop_Min16Sx4:
3017      case Iop_Min16Ux4:
3018      case Iop_Max16Sx4:
3019      case Iop_Max16Ux4:
3020      case Iop_Avg16Ux4:
3021      case Iop_QSub16Ux4:
3022      case Iop_QSub16Sx4:
3023      case Iop_Sub16x4:
3024      case Iop_Mul16x4:
3025      case Iop_MulHi16Sx4:
3026      case Iop_MulHi16Ux4:
3027      case Iop_CmpGT16Sx4:
3028      case Iop_CmpGT16Ux4:
3029      case Iop_CmpEQ16x4:
3030      case Iop_QAdd16Sx4:
3031      case Iop_QAdd16Ux4:
3032      case Iop_QSal16x4:
3033      case Iop_QShl16x4:
3034      case Iop_Add16x4:
3035      case Iop_QDMulHi16Sx4:
3036      case Iop_QRDMulHi16Sx4:
3037         return binary16Ix4(mce, vatom1, vatom2);
3038
3039      case Iop_Sub32x2:
3040      case Iop_Mul32x2:
3041      case Iop_Max32Sx2:
3042      case Iop_Max32Ux2:
3043      case Iop_Min32Sx2:
3044      case Iop_Min32Ux2:
3045      case Iop_CmpGT32Sx2:
3046      case Iop_CmpGT32Ux2:
3047      case Iop_CmpEQ32x2:
3048      case Iop_Add32x2:
3049      case Iop_QAdd32Ux2:
3050      case Iop_QAdd32Sx2:
3051      case Iop_QSub32Ux2:
3052      case Iop_QSub32Sx2:
3053      case Iop_QSal32x2:
3054      case Iop_QShl32x2:
3055      case Iop_QDMulHi32Sx2:
3056      case Iop_QRDMulHi32Sx2:
3057         return binary32Ix2(mce, vatom1, vatom2);
3058
3059      case Iop_QSub64Ux1:
3060      case Iop_QSub64Sx1:
3061      case Iop_QAdd64Ux1:
3062      case Iop_QAdd64Sx1:
3063      case Iop_QSal64x1:
3064      case Iop_QShl64x1:
3065      case Iop_Sal64x1:
3066         return binary64Ix1(mce, vatom1, vatom2);
3067
3068      case Iop_QShlNsatSU8x8:
3069      case Iop_QShlNsatUU8x8:
3070      case Iop_QShlNsatSS8x8:
3071         complainIfUndefined(mce, atom2, NULL);
3072         return mkPCast8x8(mce, vatom1);
3073
3074      case Iop_QShlNsatSU16x4:
3075      case Iop_QShlNsatUU16x4:
3076      case Iop_QShlNsatSS16x4:
3077         complainIfUndefined(mce, atom2, NULL);
3078         return mkPCast16x4(mce, vatom1);
3079
3080      case Iop_QShlNsatSU32x2:
3081      case Iop_QShlNsatUU32x2:
3082      case Iop_QShlNsatSS32x2:
3083         complainIfUndefined(mce, atom2, NULL);
3084         return mkPCast32x2(mce, vatom1);
3085
3086      case Iop_QShlNsatSU64x1:
3087      case Iop_QShlNsatUU64x1:
3088      case Iop_QShlNsatSS64x1:
3089         complainIfUndefined(mce, atom2, NULL);
3090         return mkPCast32x2(mce, vatom1);
3091
3092      case Iop_PwMax32Sx2:
3093      case Iop_PwMax32Ux2:
3094      case Iop_PwMin32Sx2:
3095      case Iop_PwMin32Ux2:
3096      case Iop_PwMax32Fx2:
3097      case Iop_PwMin32Fx2:
3098         return assignNew('V', mce, Ity_I64,
3099                          binop(Iop_PwMax32Ux2,
3100                                mkPCast32x2(mce, vatom1),
3101                                mkPCast32x2(mce, vatom2)));
3102
3103      case Iop_PwMax16Sx4:
3104      case Iop_PwMax16Ux4:
3105      case Iop_PwMin16Sx4:
3106      case Iop_PwMin16Ux4:
3107         return assignNew('V', mce, Ity_I64,
3108                          binop(Iop_PwMax16Ux4,
3109                                mkPCast16x4(mce, vatom1),
3110                                mkPCast16x4(mce, vatom2)));
3111
3112      case Iop_PwMax8Sx8:
3113      case Iop_PwMax8Ux8:
3114      case Iop_PwMin8Sx8:
3115      case Iop_PwMin8Ux8:
3116         return assignNew('V', mce, Ity_I64,
3117                          binop(Iop_PwMax8Ux8,
3118                                mkPCast8x8(mce, vatom1),
3119                                mkPCast8x8(mce, vatom2)));
3120
3121      case Iop_PwAdd32x2:
3122      case Iop_PwAdd32Fx2:
3123         return mkPCast32x2(mce,
3124               assignNew('V', mce, Ity_I64,
3125                         binop(Iop_PwAdd32x2,
3126                               mkPCast32x2(mce, vatom1),
3127                               mkPCast32x2(mce, vatom2))));
3128
3129      case Iop_PwAdd16x4:
3130         return mkPCast16x4(mce,
3131               assignNew('V', mce, Ity_I64,
3132                         binop(op, mkPCast16x4(mce, vatom1),
3133                                   mkPCast16x4(mce, vatom2))));
3134
3135      case Iop_PwAdd8x8:
3136         return mkPCast8x8(mce,
3137               assignNew('V', mce, Ity_I64,
3138                         binop(op, mkPCast8x8(mce, vatom1),
3139                                   mkPCast8x8(mce, vatom2))));
3140
3141      case Iop_Shl8x8:
3142      case Iop_Shr8x8:
3143      case Iop_Sar8x8:
3144      case Iop_Sal8x8:
3145         return mkUifU64(mce,
3146                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3147                   mkPCast8x8(mce,vatom2)
3148                );
3149
3150      case Iop_Shl16x4:
3151      case Iop_Shr16x4:
3152      case Iop_Sar16x4:
3153      case Iop_Sal16x4:
3154         return mkUifU64(mce,
3155                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3156                   mkPCast16x4(mce,vatom2)
3157                );
3158
3159      case Iop_Shl32x2:
3160      case Iop_Shr32x2:
3161      case Iop_Sar32x2:
3162      case Iop_Sal32x2:
3163         return mkUifU64(mce,
3164                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3165                   mkPCast32x2(mce,vatom2)
3166                );
3167
3168      /* 64-bit data-steering */
3169      case Iop_InterleaveLO32x2:
3170      case Iop_InterleaveLO16x4:
3171      case Iop_InterleaveLO8x8:
3172      case Iop_InterleaveHI32x2:
3173      case Iop_InterleaveHI16x4:
3174      case Iop_InterleaveHI8x8:
3175      case Iop_CatOddLanes8x8:
3176      case Iop_CatEvenLanes8x8:
3177      case Iop_CatOddLanes16x4:
3178      case Iop_CatEvenLanes16x4:
3179      case Iop_InterleaveOddLanes8x8:
3180      case Iop_InterleaveEvenLanes8x8:
3181      case Iop_InterleaveOddLanes16x4:
3182      case Iop_InterleaveEvenLanes16x4:
3183         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3184
3185      case Iop_GetElem8x8:
3186         complainIfUndefined(mce, atom2, NULL);
3187         return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3188      case Iop_GetElem16x4:
3189         complainIfUndefined(mce, atom2, NULL);
3190         return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3191      case Iop_GetElem32x2:
3192         complainIfUndefined(mce, atom2, NULL);
3193         return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3194
3195      /* Perm8x8: rearrange values in left arg using steering values
3196        from right arg.  So rearrange the vbits in the same way but
3197        pessimise wrt steering values. */
3198      case Iop_Perm8x8:
3199         return mkUifU64(
3200                   mce,
3201                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3202                   mkPCast8x8(mce, vatom2)
3203                );
3204
3205      /* V128-bit SIMD */
3206
3207      case Iop_Sqrt32Fx4:
3208         return unary32Fx4_w_rm(mce, vatom1, vatom2);
3209      case Iop_Sqrt64Fx2:
3210         return unary64Fx2_w_rm(mce, vatom1, vatom2);
3211
3212      case Iop_ShrN8x16:
3213      case Iop_ShrN16x8:
3214      case Iop_ShrN32x4:
3215      case Iop_ShrN64x2:
3216      case Iop_SarN8x16:
3217      case Iop_SarN16x8:
3218      case Iop_SarN32x4:
3219      case Iop_SarN64x2:
3220      case Iop_ShlN8x16:
3221      case Iop_ShlN16x8:
3222      case Iop_ShlN32x4:
3223      case Iop_ShlN64x2:
3224         /* Same scheme as with all other shifts.  Note: 22 Oct 05:
3225            this is wrong now, scalar shifts are done properly lazily.
3226            Vector shifts should be fixed too. */
3227         complainIfUndefined(mce, atom2, NULL);
3228         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3229
3230      /* V x V shifts/rotates are done using the standard lazy scheme. */
3231      /* For the non-rounding variants of bi-di vector x vector
3232         shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3233         But note that this is overly pessimistic, because in fact only
3234         the bottom 8 bits of each lane of the second argument are taken
3235         into account when shifting.  So really we ought to ignore
3236         undefinedness in bits 8 and above of each lane in the
3237         second argument. */
3238      case Iop_Shl8x16:
3239      case Iop_Shr8x16:
3240      case Iop_Sar8x16:
3241      case Iop_Sal8x16:
3242      case Iop_Rol8x16:
3243      case Iop_Sh8Sx16:
3244      case Iop_Sh8Ux16:
3245         return mkUifUV128(mce,
3246                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3247                   mkPCast8x16(mce,vatom2)
3248                );
3249
3250      case Iop_Shl16x8:
3251      case Iop_Shr16x8:
3252      case Iop_Sar16x8:
3253      case Iop_Sal16x8:
3254      case Iop_Rol16x8:
3255      case Iop_Sh16Sx8:
3256      case Iop_Sh16Ux8:
3257         return mkUifUV128(mce,
3258                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3259                   mkPCast16x8(mce,vatom2)
3260                );
3261
3262      case Iop_Shl32x4:
3263      case Iop_Shr32x4:
3264      case Iop_Sar32x4:
3265      case Iop_Sal32x4:
3266      case Iop_Rol32x4:
3267      case Iop_Sh32Sx4:
3268      case Iop_Sh32Ux4:
3269         return mkUifUV128(mce,
3270                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3271                   mkPCast32x4(mce,vatom2)
3272                );
3273
3274      case Iop_Shl64x2:
3275      case Iop_Shr64x2:
3276      case Iop_Sar64x2:
3277      case Iop_Sal64x2:
3278      case Iop_Rol64x2:
3279      case Iop_Sh64Sx2:
3280      case Iop_Sh64Ux2:
3281         return mkUifUV128(mce,
3282                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3283                   mkPCast64x2(mce,vatom2)
3284                );
3285
3286      /* For the rounding variants of bi-di vector x vector shifts, the
3287         rounding adjustment can cause undefinedness to propagate through
3288         the entire lane, in the worst case.  Too complex to handle
3289         properly .. just UifU the arguments and then PCast them.
3290         Suboptimal but safe. */
3291      case Iop_Rsh8Sx16:
3292      case Iop_Rsh8Ux16:
3293         return binary8Ix16(mce, vatom1, vatom2);
3294      case Iop_Rsh16Sx8:
3295      case Iop_Rsh16Ux8:
3296         return binary16Ix8(mce, vatom1, vatom2);
3297      case Iop_Rsh32Sx4:
3298      case Iop_Rsh32Ux4:
3299         return binary32Ix4(mce, vatom1, vatom2);
3300      case Iop_Rsh64Sx2:
3301      case Iop_Rsh64Ux2:
3302         return binary64Ix2(mce, vatom1, vatom2);
3303
3304      case Iop_F32ToFixed32Ux4_RZ:
3305      case Iop_F32ToFixed32Sx4_RZ:
3306      case Iop_Fixed32UToF32x4_RN:
3307      case Iop_Fixed32SToF32x4_RN:
3308         complainIfUndefined(mce, atom2, NULL);
3309         return mkPCast32x4(mce, vatom1);
3310
3311      case Iop_F32ToFixed32Ux2_RZ:
3312      case Iop_F32ToFixed32Sx2_RZ:
3313      case Iop_Fixed32UToF32x2_RN:
3314      case Iop_Fixed32SToF32x2_RN:
3315         complainIfUndefined(mce, atom2, NULL);
3316         return mkPCast32x2(mce, vatom1);
3317
3318      case Iop_QSub8Ux16:
3319      case Iop_QSub8Sx16:
3320      case Iop_Sub8x16:
3321      case Iop_Min8Ux16:
3322      case Iop_Min8Sx16:
3323      case Iop_Max8Ux16:
3324      case Iop_Max8Sx16:
3325      case Iop_CmpGT8Sx16:
3326      case Iop_CmpGT8Ux16:
3327      case Iop_CmpEQ8x16:
3328      case Iop_Avg8Ux16:
3329      case Iop_Avg8Sx16:
3330      case Iop_QAdd8Ux16:
3331      case Iop_QAdd8Sx16:
3332      case Iop_QAddExtUSsatSS8x16:
3333      case Iop_QAddExtSUsatUU8x16:
3334      case Iop_QSal8x16:
3335      case Iop_QShl8x16:
3336      case Iop_Add8x16:
3337      case Iop_Mul8x16:
3338      case Iop_PolynomialMul8x16:
3339      case Iop_PolynomialMulAdd8x16:
3340         return binary8Ix16(mce, vatom1, vatom2);
3341
3342      case Iop_QSub16Ux8:
3343      case Iop_QSub16Sx8:
3344      case Iop_Sub16x8:
3345      case Iop_Mul16x8:
3346      case Iop_MulHi16Sx8:
3347      case Iop_MulHi16Ux8:
3348      case Iop_Min16Sx8:
3349      case Iop_Min16Ux8:
3350      case Iop_Max16Sx8:
3351      case Iop_Max16Ux8:
3352      case Iop_CmpGT16Sx8:
3353      case Iop_CmpGT16Ux8:
3354      case Iop_CmpEQ16x8:
3355      case Iop_Avg16Ux8:
3356      case Iop_Avg16Sx8:
3357      case Iop_QAdd16Ux8:
3358      case Iop_QAdd16Sx8:
3359      case Iop_QAddExtUSsatSS16x8:
3360      case Iop_QAddExtSUsatUU16x8:
3361      case Iop_QSal16x8:
3362      case Iop_QShl16x8:
3363      case Iop_Add16x8:
3364      case Iop_QDMulHi16Sx8:
3365      case Iop_QRDMulHi16Sx8:
3366      case Iop_PolynomialMulAdd16x8:
3367         return binary16Ix8(mce, vatom1, vatom2);
3368
3369      case Iop_Sub32x4:
3370      case Iop_CmpGT32Sx4:
3371      case Iop_CmpGT32Ux4:
3372      case Iop_CmpEQ32x4:
3373      case Iop_QAdd32Sx4:
3374      case Iop_QAdd32Ux4:
3375      case Iop_QSub32Sx4:
3376      case Iop_QSub32Ux4:
3377      case Iop_QAddExtUSsatSS32x4:
3378      case Iop_QAddExtSUsatUU32x4:
3379      case Iop_QSal32x4:
3380      case Iop_QShl32x4:
3381      case Iop_Avg32Ux4:
3382      case Iop_Avg32Sx4:
3383      case Iop_Add32x4:
3384      case Iop_Max32Ux4:
3385      case Iop_Max32Sx4:
3386      case Iop_Min32Ux4:
3387      case Iop_Min32Sx4:
3388      case Iop_Mul32x4:
3389      case Iop_QDMulHi32Sx4:
3390      case Iop_QRDMulHi32Sx4:
3391      case Iop_PolynomialMulAdd32x4:
3392         return binary32Ix4(mce, vatom1, vatom2);
3393
3394      case Iop_Sub64x2:
3395      case Iop_Add64x2:
3396      case Iop_Max64Sx2:
3397      case Iop_Max64Ux2:
3398      case Iop_Min64Sx2:
3399      case Iop_Min64Ux2:
3400      case Iop_CmpEQ64x2:
3401      case Iop_CmpGT64Sx2:
3402      case Iop_CmpGT64Ux2:
3403      case Iop_QSal64x2:
3404      case Iop_QShl64x2:
3405      case Iop_QAdd64Ux2:
3406      case Iop_QAdd64Sx2:
3407      case Iop_QSub64Ux2:
3408      case Iop_QSub64Sx2:
3409      case Iop_QAddExtUSsatSS64x2:
3410      case Iop_QAddExtSUsatUU64x2:
3411      case Iop_PolynomialMulAdd64x2:
3412      case Iop_CipherV128:
3413      case Iop_CipherLV128:
3414      case Iop_NCipherV128:
3415      case Iop_NCipherLV128:
3416        return binary64Ix2(mce, vatom1, vatom2);
3417
3418      case Iop_QNarrowBin64Sto32Sx4:
3419      case Iop_QNarrowBin64Uto32Ux4:
3420      case Iop_QNarrowBin32Sto16Sx8:
3421      case Iop_QNarrowBin32Uto16Ux8:
3422      case Iop_QNarrowBin32Sto16Ux8:
3423      case Iop_QNarrowBin16Sto8Sx16:
3424      case Iop_QNarrowBin16Uto8Ux16:
3425      case Iop_QNarrowBin16Sto8Ux16:
3426         return vectorNarrowBinV128(mce, op, vatom1, vatom2);
3427
3428      case Iop_Min64Fx2:
3429      case Iop_Max64Fx2:
3430      case Iop_CmpLT64Fx2:
3431      case Iop_CmpLE64Fx2:
3432      case Iop_CmpEQ64Fx2:
3433      case Iop_CmpUN64Fx2:
3434      case Iop_RecipStep64Fx2:
3435      case Iop_RSqrtStep64Fx2:
3436         return binary64Fx2(mce, vatom1, vatom2);
3437
3438      case Iop_Sub64F0x2:
3439      case Iop_Mul64F0x2:
3440      case Iop_Min64F0x2:
3441      case Iop_Max64F0x2:
3442      case Iop_Div64F0x2:
3443      case Iop_CmpLT64F0x2:
3444      case Iop_CmpLE64F0x2:
3445      case Iop_CmpEQ64F0x2:
3446      case Iop_CmpUN64F0x2:
3447      case Iop_Add64F0x2:
3448         return binary64F0x2(mce, vatom1, vatom2);
3449
3450      case Iop_Min32Fx4:
3451      case Iop_Max32Fx4:
3452      case Iop_CmpLT32Fx4:
3453      case Iop_CmpLE32Fx4:
3454      case Iop_CmpEQ32Fx4:
3455      case Iop_CmpUN32Fx4:
3456      case Iop_CmpGT32Fx4:
3457      case Iop_CmpGE32Fx4:
3458      case Iop_RecipStep32Fx4:
3459      case Iop_RSqrtStep32Fx4:
3460         return binary32Fx4(mce, vatom1, vatom2);
3461
3462      case Iop_Sub32Fx2:
3463      case Iop_Mul32Fx2:
3464      case Iop_Min32Fx2:
3465      case Iop_Max32Fx2:
3466      case Iop_CmpEQ32Fx2:
3467      case Iop_CmpGT32Fx2:
3468      case Iop_CmpGE32Fx2:
3469      case Iop_Add32Fx2:
3470      case Iop_RecipStep32Fx2:
3471      case Iop_RSqrtStep32Fx2:
3472         return binary32Fx2(mce, vatom1, vatom2);
3473
3474      case Iop_Sub32F0x4:
3475      case Iop_Mul32F0x4:
3476      case Iop_Min32F0x4:
3477      case Iop_Max32F0x4:
3478      case Iop_Div32F0x4:
3479      case Iop_CmpLT32F0x4:
3480      case Iop_CmpLE32F0x4:
3481      case Iop_CmpEQ32F0x4:
3482      case Iop_CmpUN32F0x4:
3483      case Iop_Add32F0x4:
3484         return binary32F0x4(mce, vatom1, vatom2);
3485
3486      case Iop_QShlNsatSU8x16:
3487      case Iop_QShlNsatUU8x16:
3488      case Iop_QShlNsatSS8x16:
3489         complainIfUndefined(mce, atom2, NULL);
3490         return mkPCast8x16(mce, vatom1);
3491
3492      case Iop_QShlNsatSU16x8:
3493      case Iop_QShlNsatUU16x8:
3494      case Iop_QShlNsatSS16x8:
3495         complainIfUndefined(mce, atom2, NULL);
3496         return mkPCast16x8(mce, vatom1);
3497
3498      case Iop_QShlNsatSU32x4:
3499      case Iop_QShlNsatUU32x4:
3500      case Iop_QShlNsatSS32x4:
3501         complainIfUndefined(mce, atom2, NULL);
3502         return mkPCast32x4(mce, vatom1);
3503
3504      case Iop_QShlNsatSU64x2:
3505      case Iop_QShlNsatUU64x2:
3506      case Iop_QShlNsatSS64x2:
3507         complainIfUndefined(mce, atom2, NULL);
3508         return mkPCast32x4(mce, vatom1);
3509
3510      /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
3511         To make this simpler, do the following:
3512         * complain if the shift amount (the I8) is undefined
3513         * pcast each lane at the wide width
3514         * truncate each lane to half width
3515         * pcast the resulting 64-bit value to a single bit and use
3516           that as the least significant bit of the upper half of the
3517           result. */
3518      case Iop_QandQShrNnarrow64Uto32Ux2:
3519      case Iop_QandQSarNnarrow64Sto32Sx2:
3520      case Iop_QandQSarNnarrow64Sto32Ux2:
3521      case Iop_QandQRShrNnarrow64Uto32Ux2:
3522      case Iop_QandQRSarNnarrow64Sto32Sx2:
3523      case Iop_QandQRSarNnarrow64Sto32Ux2:
3524      case Iop_QandQShrNnarrow32Uto16Ux4:
3525      case Iop_QandQSarNnarrow32Sto16Sx4:
3526      case Iop_QandQSarNnarrow32Sto16Ux4:
3527      case Iop_QandQRShrNnarrow32Uto16Ux4:
3528      case Iop_QandQRSarNnarrow32Sto16Sx4:
3529      case Iop_QandQRSarNnarrow32Sto16Ux4:
3530      case Iop_QandQShrNnarrow16Uto8Ux8:
3531      case Iop_QandQSarNnarrow16Sto8Sx8:
3532      case Iop_QandQSarNnarrow16Sto8Ux8:
3533      case Iop_QandQRShrNnarrow16Uto8Ux8:
3534      case Iop_QandQRSarNnarrow16Sto8Sx8:
3535      case Iop_QandQRSarNnarrow16Sto8Ux8:
3536      {
3537         IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
3538         IROp opNarrow = Iop_INVALID;
3539         switch (op) {
3540            case Iop_QandQShrNnarrow64Uto32Ux2:
3541            case Iop_QandQSarNnarrow64Sto32Sx2:
3542            case Iop_QandQSarNnarrow64Sto32Ux2:
3543            case Iop_QandQRShrNnarrow64Uto32Ux2:
3544            case Iop_QandQRSarNnarrow64Sto32Sx2:
3545            case Iop_QandQRSarNnarrow64Sto32Ux2:
3546               fnPessim = mkPCast64x2;
3547               opNarrow = Iop_NarrowUn64to32x2;
3548               break;
3549            case Iop_QandQShrNnarrow32Uto16Ux4:
3550            case Iop_QandQSarNnarrow32Sto16Sx4:
3551            case Iop_QandQSarNnarrow32Sto16Ux4:
3552            case Iop_QandQRShrNnarrow32Uto16Ux4:
3553            case Iop_QandQRSarNnarrow32Sto16Sx4:
3554            case Iop_QandQRSarNnarrow32Sto16Ux4:
3555               fnPessim = mkPCast32x4;
3556               opNarrow = Iop_NarrowUn32to16x4;
3557               break;
3558            case Iop_QandQShrNnarrow16Uto8Ux8:
3559            case Iop_QandQSarNnarrow16Sto8Sx8:
3560            case Iop_QandQSarNnarrow16Sto8Ux8:
3561            case Iop_QandQRShrNnarrow16Uto8Ux8:
3562            case Iop_QandQRSarNnarrow16Sto8Sx8:
3563            case Iop_QandQRSarNnarrow16Sto8Ux8:
3564               fnPessim = mkPCast16x8;
3565               opNarrow = Iop_NarrowUn16to8x8;
3566               break;
3567            default:
3568               tl_assert(0);
3569         }
3570         complainIfUndefined(mce, atom2, NULL);
3571         // Pessimised shift result
3572         IRAtom* shV
3573            = fnPessim(mce, vatom1);
3574         // Narrowed, pessimised shift result
3575         IRAtom* shVnarrowed
3576            = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
3577         // Generates: Def--(63)--Def PCast-to-I1(narrowed)
3578         IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
3579         // and assemble the result
3580         return assignNew('V', mce, Ity_V128,
3581                          binop(Iop_64HLtoV128, qV, shVnarrowed));
3582      }
3583
3584      case Iop_Mull32Sx2:
3585      case Iop_Mull32Ux2:
3586      case Iop_QDMull32Sx2:
3587         return vectorWidenI64(mce, Iop_Widen32Sto64x2,
3588                                    mkUifU64(mce, vatom1, vatom2));
3589
3590      case Iop_Mull16Sx4:
3591      case Iop_Mull16Ux4:
3592      case Iop_QDMull16Sx4:
3593         return vectorWidenI64(mce, Iop_Widen16Sto32x4,
3594                                    mkUifU64(mce, vatom1, vatom2));
3595
3596      case Iop_Mull8Sx8:
3597      case Iop_Mull8Ux8:
3598      case Iop_PolynomialMull8x8:
3599         return vectorWidenI64(mce, Iop_Widen8Sto16x8,
3600                                    mkUifU64(mce, vatom1, vatom2));
3601
3602      case Iop_PwAdd32x4:
3603         return mkPCast32x4(mce,
3604               assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
3605                     mkPCast32x4(mce, vatom2))));
3606
3607      case Iop_PwAdd16x8:
3608         return mkPCast16x8(mce,
3609               assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
3610                     mkPCast16x8(mce, vatom2))));
3611
3612      case Iop_PwAdd8x16:
3613         return mkPCast8x16(mce,
3614               assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
3615                     mkPCast8x16(mce, vatom2))));
3616
3617      /* V128-bit data-steering */
3618      case Iop_SetV128lo32:
3619      case Iop_SetV128lo64:
3620      case Iop_64HLtoV128:
3621      case Iop_InterleaveLO64x2:
3622      case Iop_InterleaveLO32x4:
3623      case Iop_InterleaveLO16x8:
3624      case Iop_InterleaveLO8x16:
3625      case Iop_InterleaveHI64x2:
3626      case Iop_InterleaveHI32x4:
3627      case Iop_InterleaveHI16x8:
3628      case Iop_InterleaveHI8x16:
3629      case Iop_CatOddLanes8x16:
3630      case Iop_CatOddLanes16x8:
3631      case Iop_CatOddLanes32x4:
3632      case Iop_CatEvenLanes8x16:
3633      case Iop_CatEvenLanes16x8:
3634      case Iop_CatEvenLanes32x4:
3635      case Iop_InterleaveOddLanes8x16:
3636      case Iop_InterleaveOddLanes16x8:
3637      case Iop_InterleaveOddLanes32x4:
3638      case Iop_InterleaveEvenLanes8x16:
3639      case Iop_InterleaveEvenLanes16x8:
3640      case Iop_InterleaveEvenLanes32x4:
3641         return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
3642
3643      case Iop_GetElem8x16:
3644         complainIfUndefined(mce, atom2, NULL);
3645         return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3646      case Iop_GetElem16x8:
3647         complainIfUndefined(mce, atom2, NULL);
3648         return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3649      case Iop_GetElem32x4:
3650         complainIfUndefined(mce, atom2, NULL);
3651         return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3652      case Iop_GetElem64x2:
3653         complainIfUndefined(mce, atom2, NULL);
3654         return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3655
3656     /* Perm8x16: rearrange values in left arg using steering values
3657        from right arg.  So rearrange the vbits in the same way but
3658        pessimise wrt steering values.  Perm32x4 ditto. */
3659      case Iop_Perm8x16:
3660         return mkUifUV128(
3661                   mce,
3662                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3663                   mkPCast8x16(mce, vatom2)
3664                );
3665      case Iop_Perm32x4:
3666         return mkUifUV128(
3667                   mce,
3668                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3669                   mkPCast32x4(mce, vatom2)
3670                );
3671
3672     /* These two take the lower half of each 16-bit lane, sign/zero
3673        extend it to 32, and multiply together, producing a 32x4
3674        result (and implicitly ignoring half the operand bits).  So
3675        treat it as a bunch of independent 16x8 operations, but then
3676        do 32-bit shifts left-right to copy the lower half results
3677        (which are all 0s or all 1s due to PCasting in binary16Ix8)
3678        into the upper half of each result lane. */
3679      case Iop_MullEven16Ux8:
3680      case Iop_MullEven16Sx8: {
3681         IRAtom* at;
3682         at = binary16Ix8(mce,vatom1,vatom2);
3683         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
3684         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
3685	 return at;
3686      }
3687
3688      /* Same deal as Iop_MullEven16{S,U}x8 */
3689      case Iop_MullEven8Ux16:
3690      case Iop_MullEven8Sx16: {
3691         IRAtom* at;
3692         at = binary8Ix16(mce,vatom1,vatom2);
3693         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
3694         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
3695	 return at;
3696      }
3697
3698      /* Same deal as Iop_MullEven16{S,U}x8 */
3699      case Iop_MullEven32Ux4:
3700      case Iop_MullEven32Sx4: {
3701         IRAtom* at;
3702         at = binary32Ix4(mce,vatom1,vatom2);
3703         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
3704         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
3705         return at;
3706      }
3707
3708      /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
3709         32x4 -> 16x8 laneage, discarding the upper half of each lane.
3710         Simply apply same op to the V bits, since this really no more
3711         than a data steering operation. */
3712      case Iop_NarrowBin32to16x8:
3713      case Iop_NarrowBin16to8x16:
3714      case Iop_NarrowBin64to32x4:
3715         return assignNew('V', mce, Ity_V128,
3716                                    binop(op, vatom1, vatom2));
3717
3718      case Iop_ShrV128:
3719      case Iop_ShlV128:
3720         /* Same scheme as with all other shifts.  Note: 10 Nov 05:
3721            this is wrong now, scalar shifts are done properly lazily.
3722            Vector shifts should be fixed too. */
3723         complainIfUndefined(mce, atom2, NULL);
3724         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3725
3726      /* SHA Iops */
3727      case Iop_SHA256:
3728      case Iop_SHA512:
3729         complainIfUndefined(mce, atom2, NULL);
3730         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3731
3732      /* I128-bit data-steering */
3733      case Iop_64HLto128:
3734         return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
3735
3736      /* V256-bit SIMD */
3737
3738      case Iop_Max64Fx4:
3739      case Iop_Min64Fx4:
3740         return binary64Fx4(mce, vatom1, vatom2);
3741
3742      case Iop_Max32Fx8:
3743      case Iop_Min32Fx8:
3744         return binary32Fx8(mce, vatom1, vatom2);
3745
3746      /* V256-bit data-steering */
3747      case Iop_V128HLtoV256:
3748         return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
3749
3750      /* Scalar floating point */
3751
3752      case Iop_F32toI64S:
3753      case Iop_F32toI64U:
3754         /* I32(rm) x F32 -> I64 */
3755         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3756
3757      case Iop_I64StoF32:
3758         /* I32(rm) x I64 -> F32 */
3759         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3760
3761      case Iop_RoundF64toInt:
3762      case Iop_RoundF64toF32:
3763      case Iop_F64toI64S:
3764      case Iop_F64toI64U:
3765      case Iop_I64StoF64:
3766      case Iop_I64UtoF64:
3767      case Iop_SinF64:
3768      case Iop_CosF64:
3769      case Iop_TanF64:
3770      case Iop_2xm1F64:
3771      case Iop_SqrtF64:
3772      case Iop_RecpExpF64:
3773         /* I32(rm) x I64/F64 -> I64/F64 */
3774         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3775
3776      case Iop_ShlD64:
3777      case Iop_ShrD64:
3778      case Iop_RoundD64toInt:
3779         /* I32(rm) x D64 -> D64 */
3780         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3781
3782      case Iop_ShlD128:
3783      case Iop_ShrD128:
3784      case Iop_RoundD128toInt:
3785         /* I32(rm) x D128 -> D128 */
3786         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3787
3788      case Iop_D64toI64S:
3789      case Iop_D64toI64U:
3790      case Iop_I64StoD64:
3791      case Iop_I64UtoD64:
3792         /* I32(rm) x I64/D64 -> D64/I64 */
3793         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3794
3795      case Iop_F32toD32:
3796      case Iop_F64toD32:
3797      case Iop_F128toD32:
3798      case Iop_D32toF32:
3799      case Iop_D64toF32:
3800      case Iop_D128toF32:
3801         /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
3802         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3803
3804      case Iop_F32toD64:
3805      case Iop_F64toD64:
3806      case Iop_F128toD64:
3807      case Iop_D32toF64:
3808      case Iop_D64toF64:
3809      case Iop_D128toF64:
3810         /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
3811         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3812
3813      case Iop_F32toD128:
3814      case Iop_F64toD128:
3815      case Iop_F128toD128:
3816      case Iop_D32toF128:
3817      case Iop_D64toF128:
3818      case Iop_D128toF128:
3819         /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
3820         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3821
3822      case Iop_RoundF32toInt:
3823      case Iop_SqrtF32:
3824      case Iop_RecpExpF32:
3825         /* I32(rm) x I32/F32 -> I32/F32 */
3826         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3827
3828      case Iop_SqrtF128:
3829         /* I32(rm) x F128 -> F128 */
3830         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3831
3832      case Iop_I32StoF32:
3833      case Iop_I32UtoF32:
3834      case Iop_F32toI32S:
3835      case Iop_F32toI32U:
3836         /* First arg is I32 (rounding mode), second is F32/I32 (data). */
3837         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3838
3839      case Iop_F64toF16:
3840      case Iop_F32toF16:
3841         /* First arg is I32 (rounding mode), second is F64/F32 (data). */
3842         return mkLazy2(mce, Ity_I16, vatom1, vatom2);
3843
3844      case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
3845      case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
3846      case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
3847      case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
3848      case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
3849         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3850
3851      case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
3852      case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
3853      case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
3854      case Iop_D128toD64:  /* IRRoundingMode(I64) x D128 -> D64 */
3855      case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64  */
3856      case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
3857         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3858
3859      case Iop_F64HLtoF128:
3860      case Iop_D64HLtoD128:
3861         return assignNew('V', mce, Ity_I128,
3862                          binop(Iop_64HLto128, vatom1, vatom2));
3863
3864      case Iop_F64toI32U:
3865      case Iop_F64toI32S:
3866      case Iop_F64toF32:
3867      case Iop_I64UtoF32:
3868      case Iop_D64toI32U:
3869      case Iop_D64toI32S:
3870         /* First arg is I32 (rounding mode), second is F64/D64 (data). */
3871         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3872
3873      case Iop_D64toD32:
3874         /* First arg is I32 (rounding mode), second is D64 (data). */
3875         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3876
3877      case Iop_F64toI16S:
3878         /* First arg is I32 (rounding mode), second is F64 (data). */
3879         return mkLazy2(mce, Ity_I16, vatom1, vatom2);
3880
3881      case Iop_InsertExpD64:
3882         /*  I64 x I64 -> D64 */
3883         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3884
3885      case Iop_InsertExpD128:
3886         /*  I64 x I128 -> D128 */
3887         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3888
3889      case Iop_CmpF32:
3890      case Iop_CmpF64:
3891      case Iop_CmpF128:
3892      case Iop_CmpD64:
3893      case Iop_CmpD128:
3894      case Iop_CmpExpD64:
3895      case Iop_CmpExpD128:
3896         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3897
3898      /* non-FP after here */
3899
3900      case Iop_DivModU64to32:
3901      case Iop_DivModS64to32:
3902         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3903
3904      case Iop_DivModU128to64:
3905      case Iop_DivModS128to64:
3906         return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3907
3908      case Iop_8HLto16:
3909         return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
3910      case Iop_16HLto32:
3911         return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
3912      case Iop_32HLto64:
3913         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3914
3915      case Iop_DivModS64to64:
3916      case Iop_MullS64:
3917      case Iop_MullU64: {
3918         IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
3919         IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
3920         return assignNew('V', mce, Ity_I128,
3921                          binop(Iop_64HLto128, vHi64, vLo64));
3922      }
3923
3924      case Iop_MullS32:
3925      case Iop_MullU32: {
3926         IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
3927         IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
3928         return assignNew('V', mce, Ity_I64,
3929                          binop(Iop_32HLto64, vHi32, vLo32));
3930      }
3931
3932      case Iop_MullS16:
3933      case Iop_MullU16: {
3934         IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
3935         IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
3936         return assignNew('V', mce, Ity_I32,
3937                          binop(Iop_16HLto32, vHi16, vLo16));
3938      }
3939
3940      case Iop_MullS8:
3941      case Iop_MullU8: {
3942         IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
3943         IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
3944         return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
3945      }
3946
3947      case Iop_Sad8Ux4: /* maybe we could do better?  ftm, do mkLazy2. */
3948      case Iop_DivS32:
3949      case Iop_DivU32:
3950      case Iop_DivU32E:
3951      case Iop_DivS32E:
3952      case Iop_QAdd32S: /* could probably do better */
3953      case Iop_QSub32S: /* could probably do better */
3954         return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3955
3956      case Iop_DivS64:
3957      case Iop_DivU64:
3958      case Iop_DivS64E:
3959      case Iop_DivU64E:
3960         return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3961
3962      case Iop_Add32:
3963         if (mce->bogusLiterals || mce->useLLVMworkarounds)
3964            return expensiveAddSub(mce,True,Ity_I32,
3965                                   vatom1,vatom2, atom1,atom2);
3966         else
3967            goto cheap_AddSub32;
3968      case Iop_Sub32:
3969         if (mce->bogusLiterals)
3970            return expensiveAddSub(mce,False,Ity_I32,
3971                                   vatom1,vatom2, atom1,atom2);
3972         else
3973            goto cheap_AddSub32;
3974
3975      cheap_AddSub32:
3976      case Iop_Mul32:
3977         return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
3978
3979      case Iop_CmpORD32S:
3980      case Iop_CmpORD32U:
3981      case Iop_CmpORD64S:
3982      case Iop_CmpORD64U:
3983         return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
3984
3985      case Iop_Add64:
3986         if (mce->bogusLiterals || mce->useLLVMworkarounds)
3987            return expensiveAddSub(mce,True,Ity_I64,
3988                                   vatom1,vatom2, atom1,atom2);
3989         else
3990            goto cheap_AddSub64;
3991      case Iop_Sub64:
3992         if (mce->bogusLiterals)
3993            return expensiveAddSub(mce,False,Ity_I64,
3994                                   vatom1,vatom2, atom1,atom2);
3995         else
3996            goto cheap_AddSub64;
3997
3998      cheap_AddSub64:
3999      case Iop_Mul64:
4000         return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4001
4002      case Iop_Mul16:
4003      case Iop_Add16:
4004      case Iop_Sub16:
4005         return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4006
4007      case Iop_Mul8:
4008      case Iop_Sub8:
4009      case Iop_Add8:
4010         return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4011
4012      case Iop_CmpEQ64:
4013      case Iop_CmpNE64:
4014         if (mce->bogusLiterals)
4015            goto expensive_cmp64;
4016         else
4017            goto cheap_cmp64;
4018
4019      expensive_cmp64:
4020      case Iop_ExpCmpNE64:
4021         return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4022
4023      cheap_cmp64:
4024      case Iop_CmpLE64S: case Iop_CmpLE64U:
4025      case Iop_CmpLT64U: case Iop_CmpLT64S:
4026         return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4027
4028      case Iop_CmpEQ32:
4029      case Iop_CmpNE32:
4030         if (mce->bogusLiterals)
4031            goto expensive_cmp32;
4032         else
4033            goto cheap_cmp32;
4034
4035      expensive_cmp32:
4036      case Iop_ExpCmpNE32:
4037         return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4038
4039      cheap_cmp32:
4040      case Iop_CmpLE32S: case Iop_CmpLE32U:
4041      case Iop_CmpLT32U: case Iop_CmpLT32S:
4042         return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4043
4044      case Iop_CmpEQ16: case Iop_CmpNE16:
4045         return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4046
4047      case Iop_ExpCmpNE16:
4048         return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4049
4050      case Iop_CmpEQ8: case Iop_CmpNE8:
4051         return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4052
4053      case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
4054      case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4055      case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4056      case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4057         /* Just say these all produce a defined result, regardless
4058            of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
4059         return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4060
4061      case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4062         return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4063
4064      case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4065         return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4066
4067      case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4068         return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4069
4070      case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4071         return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4072
4073      case Iop_AndV256:
4074         uifu = mkUifUV256; difd = mkDifDV256;
4075         and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4076      case Iop_AndV128:
4077         uifu = mkUifUV128; difd = mkDifDV128;
4078         and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4079      case Iop_And64:
4080         uifu = mkUifU64; difd = mkDifD64;
4081         and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4082      case Iop_And32:
4083         uifu = mkUifU32; difd = mkDifD32;
4084         and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4085      case Iop_And16:
4086         uifu = mkUifU16; difd = mkDifD16;
4087         and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4088      case Iop_And8:
4089         uifu = mkUifU8; difd = mkDifD8;
4090         and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4091
4092      case Iop_OrV256:
4093         uifu = mkUifUV256; difd = mkDifDV256;
4094         and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4095      case Iop_OrV128:
4096         uifu = mkUifUV128; difd = mkDifDV128;
4097         and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4098      case Iop_Or64:
4099         uifu = mkUifU64; difd = mkDifD64;
4100         and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4101      case Iop_Or32:
4102         uifu = mkUifU32; difd = mkDifD32;
4103         and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4104      case Iop_Or16:
4105         uifu = mkUifU16; difd = mkDifD16;
4106         and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4107      case Iop_Or8:
4108         uifu = mkUifU8; difd = mkDifD8;
4109         and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4110
4111      do_And_Or:
4112         return
4113         assignNew(
4114            'V', mce,
4115            and_or_ty,
4116            difd(mce, uifu(mce, vatom1, vatom2),
4117                      difd(mce, improve(mce, atom1, vatom1),
4118                                improve(mce, atom2, vatom2) ) ) );
4119
4120      case Iop_Xor8:
4121         return mkUifU8(mce, vatom1, vatom2);
4122      case Iop_Xor16:
4123         return mkUifU16(mce, vatom1, vatom2);
4124      case Iop_Xor32:
4125         return mkUifU32(mce, vatom1, vatom2);
4126      case Iop_Xor64:
4127         return mkUifU64(mce, vatom1, vatom2);
4128      case Iop_XorV128:
4129         return mkUifUV128(mce, vatom1, vatom2);
4130      case Iop_XorV256:
4131         return mkUifUV256(mce, vatom1, vatom2);
4132
4133      /* V256-bit SIMD */
4134
4135      case Iop_ShrN16x16:
4136      case Iop_ShrN32x8:
4137      case Iop_ShrN64x4:
4138      case Iop_SarN16x16:
4139      case Iop_SarN32x8:
4140      case Iop_ShlN16x16:
4141      case Iop_ShlN32x8:
4142      case Iop_ShlN64x4:
4143         /* Same scheme as with all other shifts.  Note: 22 Oct 05:
4144            this is wrong now, scalar shifts are done properly lazily.
4145            Vector shifts should be fixed too. */
4146         complainIfUndefined(mce, atom2, NULL);
4147         return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4148
4149      case Iop_QSub8Ux32:
4150      case Iop_QSub8Sx32:
4151      case Iop_Sub8x32:
4152      case Iop_Min8Ux32:
4153      case Iop_Min8Sx32:
4154      case Iop_Max8Ux32:
4155      case Iop_Max8Sx32:
4156      case Iop_CmpGT8Sx32:
4157      case Iop_CmpEQ8x32:
4158      case Iop_Avg8Ux32:
4159      case Iop_QAdd8Ux32:
4160      case Iop_QAdd8Sx32:
4161      case Iop_Add8x32:
4162         return binary8Ix32(mce, vatom1, vatom2);
4163
4164      case Iop_QSub16Ux16:
4165      case Iop_QSub16Sx16:
4166      case Iop_Sub16x16:
4167      case Iop_Mul16x16:
4168      case Iop_MulHi16Sx16:
4169      case Iop_MulHi16Ux16:
4170      case Iop_Min16Sx16:
4171      case Iop_Min16Ux16:
4172      case Iop_Max16Sx16:
4173      case Iop_Max16Ux16:
4174      case Iop_CmpGT16Sx16:
4175      case Iop_CmpEQ16x16:
4176      case Iop_Avg16Ux16:
4177      case Iop_QAdd16Ux16:
4178      case Iop_QAdd16Sx16:
4179      case Iop_Add16x16:
4180         return binary16Ix16(mce, vatom1, vatom2);
4181
4182      case Iop_Sub32x8:
4183      case Iop_CmpGT32Sx8:
4184      case Iop_CmpEQ32x8:
4185      case Iop_Add32x8:
4186      case Iop_Max32Ux8:
4187      case Iop_Max32Sx8:
4188      case Iop_Min32Ux8:
4189      case Iop_Min32Sx8:
4190      case Iop_Mul32x8:
4191         return binary32Ix8(mce, vatom1, vatom2);
4192
4193      case Iop_Sub64x4:
4194      case Iop_Add64x4:
4195      case Iop_CmpEQ64x4:
4196      case Iop_CmpGT64Sx4:
4197         return binary64Ix4(mce, vatom1, vatom2);
4198
4199     /* Perm32x8: rearrange values in left arg using steering values
4200        from right arg.  So rearrange the vbits in the same way but
4201        pessimise wrt steering values. */
4202      case Iop_Perm32x8:
4203         return mkUifUV256(
4204                   mce,
4205                   assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
4206                   mkPCast32x8(mce, vatom2)
4207                );
4208
4209      /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4210         Handle the shifted results in the same way that other
4211         binary Q ops are handled, eg QSub: UifU the two args,
4212         then pessimise -- which is binaryNIxM.  But for the upper
4213         V128, we require to generate just 1 bit which is the
4214         pessimised shift result, with 127 defined zeroes above it.
4215
4216         Note that this overly pessimistic in that in fact only the
4217         bottom 8 bits of each lane of the second arg determine the shift
4218         amount.  Really we ought to ignore any undefinedness in the
4219         rest of the lanes of the second arg. */
4220      case Iop_QandSQsh64x2:  case Iop_QandUQsh64x2:
4221      case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
4222      case Iop_QandSQsh32x4:  case Iop_QandUQsh32x4:
4223      case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
4224      case Iop_QandSQsh16x8:  case Iop_QandUQsh16x8:
4225      case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
4226      case Iop_QandSQsh8x16:  case Iop_QandUQsh8x16:
4227      case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
4228      {
4229         // The function to generate the pessimised shift result
4230         IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
4231         switch (op) {
4232            case Iop_QandSQsh64x2:
4233            case Iop_QandUQsh64x2:
4234            case Iop_QandSQRsh64x2:
4235            case Iop_QandUQRsh64x2:
4236               binaryNIxM = binary64Ix2;
4237               break;
4238            case Iop_QandSQsh32x4:
4239            case Iop_QandUQsh32x4:
4240            case Iop_QandSQRsh32x4:
4241            case Iop_QandUQRsh32x4:
4242               binaryNIxM = binary32Ix4;
4243               break;
4244            case Iop_QandSQsh16x8:
4245            case Iop_QandUQsh16x8:
4246            case Iop_QandSQRsh16x8:
4247            case Iop_QandUQRsh16x8:
4248               binaryNIxM = binary16Ix8;
4249               break;
4250            case Iop_QandSQsh8x16:
4251            case Iop_QandUQsh8x16:
4252            case Iop_QandSQRsh8x16:
4253            case Iop_QandUQRsh8x16:
4254               binaryNIxM = binary8Ix16;
4255               break;
4256            default:
4257               tl_assert(0);
4258         }
4259         tl_assert(binaryNIxM);
4260         // Pessimised shift result, shV[127:0]
4261         IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
4262         // Generates: Def--(127)--Def PCast-to-I1(shV)
4263         IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
4264         // and assemble the result
4265         return assignNew('V', mce, Ity_V256,
4266                          binop(Iop_V128HLtoV256, qV, shV));
4267      }
4268
4269      default:
4270         ppIROp(op);
4271         VG_(tool_panic)("memcheck:expr2vbits_Binop");
4272   }
4273}
4274
4275
4276static
4277IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
4278{
4279   /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
4280      selection of shadow operation implicitly duplicates the logic in
4281      do_shadow_LoadG and should be kept in sync (in the very unlikely
4282      event that the interpretation of such widening ops changes in
4283      future).  See comment in do_shadow_LoadG. */
4284   IRAtom* vatom = expr2vbits( mce, atom );
4285   tl_assert(isOriginalAtom(mce,atom));
4286   switch (op) {
4287
4288      case Iop_Abs64Fx2:
4289      case Iop_Neg64Fx2:
4290      case Iop_RSqrtEst64Fx2:
4291      case Iop_RecipEst64Fx2:
4292         return unary64Fx2(mce, vatom);
4293
4294      case Iop_Sqrt64F0x2:
4295         return unary64F0x2(mce, vatom);
4296
4297      case Iop_Sqrt32Fx8:
4298      case Iop_RSqrtEst32Fx8:
4299      case Iop_RecipEst32Fx8:
4300         return unary32Fx8(mce, vatom);
4301
4302      case Iop_Sqrt64Fx4:
4303         return unary64Fx4(mce, vatom);
4304
4305      case Iop_RecipEst32Fx4:
4306      case Iop_I32UtoFx4:
4307      case Iop_I32StoFx4:
4308      case Iop_QFtoI32Ux4_RZ:
4309      case Iop_QFtoI32Sx4_RZ:
4310      case Iop_RoundF32x4_RM:
4311      case Iop_RoundF32x4_RP:
4312      case Iop_RoundF32x4_RN:
4313      case Iop_RoundF32x4_RZ:
4314      case Iop_RecipEst32Ux4:
4315      case Iop_Abs32Fx4:
4316      case Iop_Neg32Fx4:
4317      case Iop_RSqrtEst32Fx4:
4318         return unary32Fx4(mce, vatom);
4319
4320      case Iop_I32UtoFx2:
4321      case Iop_I32StoFx2:
4322      case Iop_RecipEst32Fx2:
4323      case Iop_RecipEst32Ux2:
4324      case Iop_Abs32Fx2:
4325      case Iop_Neg32Fx2:
4326      case Iop_RSqrtEst32Fx2:
4327         return unary32Fx2(mce, vatom);
4328
4329      case Iop_Sqrt32F0x4:
4330      case Iop_RSqrtEst32F0x4:
4331      case Iop_RecipEst32F0x4:
4332         return unary32F0x4(mce, vatom);
4333
4334      case Iop_32UtoV128:
4335      case Iop_64UtoV128:
4336      case Iop_Dup8x16:
4337      case Iop_Dup16x8:
4338      case Iop_Dup32x4:
4339      case Iop_Reverse1sIn8_x16:
4340      case Iop_Reverse8sIn16_x8:
4341      case Iop_Reverse8sIn32_x4:
4342      case Iop_Reverse16sIn32_x4:
4343      case Iop_Reverse8sIn64_x2:
4344      case Iop_Reverse16sIn64_x2:
4345      case Iop_Reverse32sIn64_x2:
4346      case Iop_V256toV128_1: case Iop_V256toV128_0:
4347      case Iop_ZeroHI64ofV128:
4348      case Iop_ZeroHI96ofV128:
4349      case Iop_ZeroHI112ofV128:
4350      case Iop_ZeroHI120ofV128:
4351         return assignNew('V', mce, Ity_V128, unop(op, vatom));
4352
4353      case Iop_F128HItoF64:  /* F128 -> high half of F128 */
4354      case Iop_D128HItoD64:  /* D128 -> high half of D128 */
4355         return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
4356      case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
4357      case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
4358         return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
4359
4360      case Iop_NegF128:
4361      case Iop_AbsF128:
4362         return mkPCastTo(mce, Ity_I128, vatom);
4363
4364      case Iop_I32StoF128: /* signed I32 -> F128 */
4365      case Iop_I64StoF128: /* signed I64 -> F128 */
4366      case Iop_I32UtoF128: /* unsigned I32 -> F128 */
4367      case Iop_I64UtoF128: /* unsigned I64 -> F128 */
4368      case Iop_F32toF128:  /* F32 -> F128 */
4369      case Iop_F64toF128:  /* F64 -> F128 */
4370      case Iop_I32StoD128: /* signed I64 -> D128 */
4371      case Iop_I64StoD128: /* signed I64 -> D128 */
4372      case Iop_I32UtoD128: /* unsigned I32 -> D128 */
4373      case Iop_I64UtoD128: /* unsigned I64 -> D128 */
4374         return mkPCastTo(mce, Ity_I128, vatom);
4375
4376      case Iop_F16toF64:
4377      case Iop_F32toF64:
4378      case Iop_I32StoF64:
4379      case Iop_I32UtoF64:
4380      case Iop_NegF64:
4381      case Iop_AbsF64:
4382      case Iop_RSqrtEst5GoodF64:
4383      case Iop_RoundF64toF64_NEAREST:
4384      case Iop_RoundF64toF64_NegINF:
4385      case Iop_RoundF64toF64_PosINF:
4386      case Iop_RoundF64toF64_ZERO:
4387      case Iop_Clz64:
4388      case Iop_D32toD64:
4389      case Iop_I32StoD64:
4390      case Iop_I32UtoD64:
4391      case Iop_ExtractExpD64:    /* D64  -> I64 */
4392      case Iop_ExtractExpD128:   /* D128 -> I64 */
4393      case Iop_ExtractSigD64:    /* D64  -> I64 */
4394      case Iop_ExtractSigD128:   /* D128 -> I64 */
4395      case Iop_DPBtoBCD:
4396      case Iop_BCDtoDPB:
4397         return mkPCastTo(mce, Ity_I64, vatom);
4398
4399      case Iop_D64toD128:
4400         return mkPCastTo(mce, Ity_I128, vatom);
4401
4402      case Iop_Clz32:
4403      case Iop_TruncF64asF32:
4404      case Iop_NegF32:
4405      case Iop_AbsF32:
4406      case Iop_F16toF32:
4407         return mkPCastTo(mce, Ity_I32, vatom);
4408
4409      case Iop_Ctz32:
4410      case Iop_Ctz64:
4411         return expensiveCountTrailingZeroes(mce, op, atom, vatom);
4412
4413      case Iop_1Uto64:
4414      case Iop_1Sto64:
4415      case Iop_8Uto64:
4416      case Iop_8Sto64:
4417      case Iop_16Uto64:
4418      case Iop_16Sto64:
4419      case Iop_32Sto64:
4420      case Iop_32Uto64:
4421      case Iop_V128to64:
4422      case Iop_V128HIto64:
4423      case Iop_128HIto64:
4424      case Iop_128to64:
4425      case Iop_Dup8x8:
4426      case Iop_Dup16x4:
4427      case Iop_Dup32x2:
4428      case Iop_Reverse8sIn16_x4:
4429      case Iop_Reverse8sIn32_x2:
4430      case Iop_Reverse16sIn32_x2:
4431      case Iop_Reverse8sIn64_x1:
4432      case Iop_Reverse16sIn64_x1:
4433      case Iop_Reverse32sIn64_x1:
4434      case Iop_V256to64_0: case Iop_V256to64_1:
4435      case Iop_V256to64_2: case Iop_V256to64_3:
4436         return assignNew('V', mce, Ity_I64, unop(op, vatom));
4437
4438      case Iop_64to32:
4439      case Iop_64HIto32:
4440      case Iop_1Uto32:
4441      case Iop_1Sto32:
4442      case Iop_8Uto32:
4443      case Iop_16Uto32:
4444      case Iop_16Sto32:
4445      case Iop_8Sto32:
4446      case Iop_V128to32:
4447         return assignNew('V', mce, Ity_I32, unop(op, vatom));
4448
4449      case Iop_8Sto16:
4450      case Iop_8Uto16:
4451      case Iop_32to16:
4452      case Iop_32HIto16:
4453      case Iop_64to16:
4454      case Iop_GetMSBs8x16:
4455         return assignNew('V', mce, Ity_I16, unop(op, vatom));
4456
4457      case Iop_1Uto8:
4458      case Iop_1Sto8:
4459      case Iop_16to8:
4460      case Iop_16HIto8:
4461      case Iop_32to8:
4462      case Iop_64to8:
4463      case Iop_GetMSBs8x8:
4464         return assignNew('V', mce, Ity_I8, unop(op, vatom));
4465
4466      case Iop_32to1:
4467         return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
4468
4469      case Iop_64to1:
4470         return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
4471
4472      case Iop_ReinterpF64asI64:
4473      case Iop_ReinterpI64asF64:
4474      case Iop_ReinterpI32asF32:
4475      case Iop_ReinterpF32asI32:
4476      case Iop_ReinterpI64asD64:
4477      case Iop_ReinterpD64asI64:
4478      case Iop_NotV256:
4479      case Iop_NotV128:
4480      case Iop_Not64:
4481      case Iop_Not32:
4482      case Iop_Not16:
4483      case Iop_Not8:
4484      case Iop_Not1:
4485         return vatom;
4486
4487      case Iop_CmpNEZ8x8:
4488      case Iop_Cnt8x8:
4489      case Iop_Clz8x8:
4490      case Iop_Cls8x8:
4491      case Iop_Abs8x8:
4492         return mkPCast8x8(mce, vatom);
4493
4494      case Iop_CmpNEZ8x16:
4495      case Iop_Cnt8x16:
4496      case Iop_Clz8x16:
4497      case Iop_Cls8x16:
4498      case Iop_Abs8x16:
4499         return mkPCast8x16(mce, vatom);
4500
4501      case Iop_CmpNEZ16x4:
4502      case Iop_Clz16x4:
4503      case Iop_Cls16x4:
4504      case Iop_Abs16x4:
4505         return mkPCast16x4(mce, vatom);
4506
4507      case Iop_CmpNEZ16x8:
4508      case Iop_Clz16x8:
4509      case Iop_Cls16x8:
4510      case Iop_Abs16x8:
4511         return mkPCast16x8(mce, vatom);
4512
4513      case Iop_CmpNEZ32x2:
4514      case Iop_Clz32x2:
4515      case Iop_Cls32x2:
4516      case Iop_FtoI32Ux2_RZ:
4517      case Iop_FtoI32Sx2_RZ:
4518      case Iop_Abs32x2:
4519         return mkPCast32x2(mce, vatom);
4520
4521      case Iop_CmpNEZ32x4:
4522      case Iop_Clz32x4:
4523      case Iop_Cls32x4:
4524      case Iop_FtoI32Ux4_RZ:
4525      case Iop_FtoI32Sx4_RZ:
4526      case Iop_Abs32x4:
4527      case Iop_RSqrtEst32Ux4:
4528         return mkPCast32x4(mce, vatom);
4529
4530      case Iop_CmpwNEZ32:
4531         return mkPCastTo(mce, Ity_I32, vatom);
4532
4533      case Iop_CmpwNEZ64:
4534         return mkPCastTo(mce, Ity_I64, vatom);
4535
4536      case Iop_CmpNEZ64x2:
4537      case Iop_CipherSV128:
4538      case Iop_Clz64x2:
4539      case Iop_Abs64x2:
4540         return mkPCast64x2(mce, vatom);
4541
4542      case Iop_PwBitMtxXpose64x2:
4543         return assignNew('V', mce, Ity_V128, unop(op, vatom));
4544
4545      case Iop_NarrowUn16to8x8:
4546      case Iop_NarrowUn32to16x4:
4547      case Iop_NarrowUn64to32x2:
4548      case Iop_QNarrowUn16Sto8Sx8:
4549      case Iop_QNarrowUn16Sto8Ux8:
4550      case Iop_QNarrowUn16Uto8Ux8:
4551      case Iop_QNarrowUn32Sto16Sx4:
4552      case Iop_QNarrowUn32Sto16Ux4:
4553      case Iop_QNarrowUn32Uto16Ux4:
4554      case Iop_QNarrowUn64Sto32Sx2:
4555      case Iop_QNarrowUn64Sto32Ux2:
4556      case Iop_QNarrowUn64Uto32Ux2:
4557         return vectorNarrowUnV128(mce, op, vatom);
4558
4559      case Iop_Widen8Sto16x8:
4560      case Iop_Widen8Uto16x8:
4561      case Iop_Widen16Sto32x4:
4562      case Iop_Widen16Uto32x4:
4563      case Iop_Widen32Sto64x2:
4564      case Iop_Widen32Uto64x2:
4565         return vectorWidenI64(mce, op, vatom);
4566
4567      case Iop_PwAddL32Ux2:
4568      case Iop_PwAddL32Sx2:
4569         return mkPCastTo(mce, Ity_I64,
4570               assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
4571
4572      case Iop_PwAddL16Ux4:
4573      case Iop_PwAddL16Sx4:
4574         return mkPCast32x2(mce,
4575               assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
4576
4577      case Iop_PwAddL8Ux8:
4578      case Iop_PwAddL8Sx8:
4579         return mkPCast16x4(mce,
4580               assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
4581
4582      case Iop_PwAddL32Ux4:
4583      case Iop_PwAddL32Sx4:
4584         return mkPCast64x2(mce,
4585               assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
4586
4587      case Iop_PwAddL16Ux8:
4588      case Iop_PwAddL16Sx8:
4589         return mkPCast32x4(mce,
4590               assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
4591
4592      case Iop_PwAddL8Ux16:
4593      case Iop_PwAddL8Sx16:
4594         return mkPCast16x8(mce,
4595               assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
4596
4597      case Iop_I64UtoF32:
4598      default:
4599         ppIROp(op);
4600         VG_(tool_panic)("memcheck:expr2vbits_Unop");
4601   }
4602}
4603
4604
4605/* Worker function -- do not call directly.  See comments on
4606   expr2vbits_Load for the meaning of |guard|.
4607
4608   Generates IR to (1) perform a definedness test of |addr|, (2)
4609   perform a validity test of |addr|, and (3) return the Vbits for the
4610   location indicated by |addr|.  All of this only happens when
4611   |guard| is NULL or |guard| evaluates to True at run time.
4612
4613   If |guard| evaluates to False at run time, the returned value is
4614   the IR-mandated 0x55..55 value, and no checks nor shadow loads are
4615   performed.
4616
4617   The definedness of |guard| itself is not checked.  That is assumed
4618   to have been done before this point, by the caller. */
4619static
4620IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4621                              IREndness end, IRType ty,
4622                              IRAtom* addr, UInt bias, IRAtom* guard )
4623{
4624   tl_assert(isOriginalAtom(mce,addr));
4625   tl_assert(end == Iend_LE || end == Iend_BE);
4626
4627   /* First, emit a definedness test for the address.  This also sets
4628      the address (shadow) to 'defined' following the test. */
4629   complainIfUndefined( mce, addr, guard );
4630
4631   /* Now cook up a call to the relevant helper function, to read the
4632      data V bits from shadow memory. */
4633   ty = shadowTypeV(ty);
4634
4635   void*        helper           = NULL;
4636   const HChar* hname            = NULL;
4637   Bool         ret_via_outparam = False;
4638
4639   if (end == Iend_LE) {
4640      switch (ty) {
4641         case Ity_V256: helper = &MC_(helperc_LOADV256le);
4642                        hname = "MC_(helperc_LOADV256le)";
4643                        ret_via_outparam = True;
4644                        break;
4645         case Ity_V128: helper = &MC_(helperc_LOADV128le);
4646                        hname = "MC_(helperc_LOADV128le)";
4647                        ret_via_outparam = True;
4648                        break;
4649         case Ity_I64:  helper = &MC_(helperc_LOADV64le);
4650                        hname = "MC_(helperc_LOADV64le)";
4651                        break;
4652         case Ity_I32:  helper = &MC_(helperc_LOADV32le);
4653                        hname = "MC_(helperc_LOADV32le)";
4654                        break;
4655         case Ity_I16:  helper = &MC_(helperc_LOADV16le);
4656                        hname = "MC_(helperc_LOADV16le)";
4657                        break;
4658         case Ity_I8:   helper = &MC_(helperc_LOADV8);
4659                        hname = "MC_(helperc_LOADV8)";
4660                        break;
4661         default:       ppIRType(ty);
4662                        VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
4663      }
4664   } else {
4665      switch (ty) {
4666         case Ity_V256: helper = &MC_(helperc_LOADV256be);
4667                        hname = "MC_(helperc_LOADV256be)";
4668                        ret_via_outparam = True;
4669                        break;
4670         case Ity_V128: helper = &MC_(helperc_LOADV128be);
4671                        hname = "MC_(helperc_LOADV128be)";
4672                        ret_via_outparam = True;
4673                        break;
4674         case Ity_I64:  helper = &MC_(helperc_LOADV64be);
4675                        hname = "MC_(helperc_LOADV64be)";
4676                        break;
4677         case Ity_I32:  helper = &MC_(helperc_LOADV32be);
4678                        hname = "MC_(helperc_LOADV32be)";
4679                        break;
4680         case Ity_I16:  helper = &MC_(helperc_LOADV16be);
4681                        hname = "MC_(helperc_LOADV16be)";
4682                        break;
4683         case Ity_I8:   helper = &MC_(helperc_LOADV8);
4684                        hname = "MC_(helperc_LOADV8)";
4685                        break;
4686         default:       ppIRType(ty);
4687                        VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
4688      }
4689   }
4690
4691   tl_assert(helper);
4692   tl_assert(hname);
4693
4694   /* Generate the actual address into addrAct. */
4695   IRAtom* addrAct;
4696   if (bias == 0) {
4697      addrAct = addr;
4698   } else {
4699      IROp    mkAdd;
4700      IRAtom* eBias;
4701      IRType  tyAddr  = mce->hWordTy;
4702      tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
4703      mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
4704      eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
4705      addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
4706   }
4707
4708   /* We need to have a place to park the V bits we're just about to
4709      read. */
4710   IRTemp datavbits = newTemp(mce, ty, VSh);
4711
4712   /* Here's the call. */
4713   IRDirty* di;
4714   if (ret_via_outparam) {
4715      di = unsafeIRDirty_1_N( datavbits,
4716                              2/*regparms*/,
4717                              hname, VG_(fnptr_to_fnentry)( helper ),
4718                              mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
4719   } else {
4720      di = unsafeIRDirty_1_N( datavbits,
4721                              1/*regparms*/,
4722                              hname, VG_(fnptr_to_fnentry)( helper ),
4723                              mkIRExprVec_1( addrAct ) );
4724   }
4725
4726   setHelperAnns( mce, di );
4727   if (guard) {
4728      di->guard = guard;
4729      /* Ideally the didn't-happen return value here would be all-ones
4730         (all-undefined), so it'd be obvious if it got used
4731         inadvertantly.  We can get by with the IR-mandated default
4732         value (0b01 repeating, 0x55 etc) as that'll still look pretty
4733         undefined if it ever leaks out. */
4734   }
4735   stmt( 'V', mce, IRStmt_Dirty(di) );
4736
4737   return mkexpr(datavbits);
4738}
4739
4740
4741/* Generate IR to do a shadow load.  The helper is expected to check
4742   the validity of the address and return the V bits for that address.
4743   This can optionally be controlled by a guard, which is assumed to
4744   be True if NULL.  In the case where the guard is False at runtime,
4745   the helper will return the didn't-do-the-call value of 0x55..55.
4746   Since that means "completely undefined result", the caller of
4747   this function will need to fix up the result somehow in that
4748   case.
4749
4750   Caller of this function is also expected to have checked the
4751   definedness of |guard| before this point.
4752*/
4753static
4754IRAtom* expr2vbits_Load ( MCEnv* mce,
4755                          IREndness end, IRType ty,
4756                          IRAtom* addr, UInt bias,
4757                          IRAtom* guard )
4758{
4759   tl_assert(end == Iend_LE || end == Iend_BE);
4760   switch (shadowTypeV(ty)) {
4761      case Ity_I8:
4762      case Ity_I16:
4763      case Ity_I32:
4764      case Ity_I64:
4765      case Ity_V128:
4766      case Ity_V256:
4767         return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
4768      default:
4769         VG_(tool_panic)("expr2vbits_Load");
4770   }
4771}
4772
4773
4774/* The most general handler for guarded loads.  Assumes the
4775   definedness of GUARD has already been checked by the caller.  A
4776   GUARD of NULL is assumed to mean "always True".  Generates code to
4777   check the definedness and validity of ADDR.
4778
4779   Generate IR to do a shadow load from ADDR and return the V bits.
4780   The loaded type is TY.  The loaded data is then (shadow) widened by
4781   using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
4782   evaluates to False at run time then the returned Vbits are simply
4783   VALT instead.  Note therefore that the argument type of VWIDEN must
4784   be TY and the result type of VWIDEN must equal the type of VALT.
4785*/
4786static
4787IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
4788                                          IREndness end, IRType ty,
4789                                          IRAtom* addr, UInt bias,
4790                                          IRAtom* guard,
4791                                          IROp vwiden, IRAtom* valt )
4792{
4793   /* Sanity check the conversion operation, and also set TYWIDE. */
4794   IRType tyWide = Ity_INVALID;
4795   switch (vwiden) {
4796      case Iop_INVALID:
4797         tyWide = ty;
4798         break;
4799      case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
4800         tyWide = Ity_I32;
4801         break;
4802      default:
4803         VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
4804   }
4805
4806   /* If the guard evaluates to True, this will hold the loaded V bits
4807      at TY.  If the guard evaluates to False, this will be all
4808      ones, meaning "all undefined", in which case we will have to
4809      replace it using an ITE below. */
4810   IRAtom* iftrue1
4811      = assignNew('V', mce, ty,
4812                  expr2vbits_Load(mce, end, ty, addr, bias, guard));
4813   /* Now (shadow-) widen the loaded V bits to the desired width.  In
4814      the guard-is-False case, the allowable widening operators will
4815      in the worst case (unsigned widening) at least leave the
4816      pre-widened part as being marked all-undefined, and in the best
4817      case (signed widening) mark the whole widened result as
4818      undefined.  Anyway, it doesn't matter really, since in this case
4819      we will replace said value with the default value |valt| using an
4820      ITE. */
4821   IRAtom* iftrue2
4822      = vwiden == Iop_INVALID
4823           ? iftrue1
4824           : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
4825   /* These are the V bits we will return if the load doesn't take
4826      place. */
4827   IRAtom* iffalse
4828      = valt;
4829   /* Prepare the cond for the ITE.  Convert a NULL cond into
4830      something that iropt knows how to fold out later. */
4831   IRAtom* cond
4832      = guard == NULL  ? mkU1(1)  : guard;
4833   /* And assemble the final result. */
4834   return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
4835}
4836
4837
4838/* A simpler handler for guarded loads, in which there is no
4839   conversion operation, and the default V bit return (when the guard
4840   evaluates to False at runtime) is "all defined".  If there is no
4841   guard expression or the guard is always TRUE this function behaves
4842   like expr2vbits_Load.  It is assumed that definedness of GUARD has
4843   already been checked at the call site. */
4844static
4845IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
4846                                         IREndness end, IRType ty,
4847                                         IRAtom* addr, UInt bias,
4848                                         IRAtom *guard )
4849{
4850   return expr2vbits_Load_guarded_General(
4851             mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
4852          );
4853}
4854
4855
4856static
4857IRAtom* expr2vbits_ITE ( MCEnv* mce,
4858                         IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
4859{
4860   IRAtom *vbitsC, *vbits0, *vbits1;
4861   IRType ty;
4862   /* Given ITE(cond, iftrue,  iffalse),  generate
4863            ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
4864      That is, steer the V bits like the originals, but trash the
4865      result if the steering value is undefined.  This gives
4866      lazy propagation. */
4867   tl_assert(isOriginalAtom(mce, cond));
4868   tl_assert(isOriginalAtom(mce, iftrue));
4869   tl_assert(isOriginalAtom(mce, iffalse));
4870
4871   vbitsC = expr2vbits(mce, cond);
4872   vbits1 = expr2vbits(mce, iftrue);
4873   vbits0 = expr2vbits(mce, iffalse);
4874   ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
4875
4876   return
4877      mkUifU(mce, ty, assignNew('V', mce, ty,
4878                                     IRExpr_ITE(cond, vbits1, vbits0)),
4879                      mkPCastTo(mce, ty, vbitsC) );
4880}
4881
4882/* --------- This is the main expression-handling function. --------- */
4883
4884static
4885IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
4886{
4887   switch (e->tag) {
4888
4889      case Iex_Get:
4890         return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
4891
4892      case Iex_GetI:
4893         return shadow_GETI( mce, e->Iex.GetI.descr,
4894                                  e->Iex.GetI.ix, e->Iex.GetI.bias );
4895
4896      case Iex_RdTmp:
4897         return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
4898
4899      case Iex_Const:
4900         return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
4901
4902      case Iex_Qop:
4903         return expr2vbits_Qop(
4904                   mce,
4905                   e->Iex.Qop.details->op,
4906                   e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
4907                   e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
4908                );
4909
4910      case Iex_Triop:
4911         return expr2vbits_Triop(
4912                   mce,
4913                   e->Iex.Triop.details->op,
4914                   e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
4915                   e->Iex.Triop.details->arg3
4916                );
4917
4918      case Iex_Binop:
4919         return expr2vbits_Binop(
4920                   mce,
4921                   e->Iex.Binop.op,
4922                   e->Iex.Binop.arg1, e->Iex.Binop.arg2
4923                );
4924
4925      case Iex_Unop:
4926         return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
4927
4928      case Iex_Load:
4929         return expr2vbits_Load( mce, e->Iex.Load.end,
4930                                      e->Iex.Load.ty,
4931                                      e->Iex.Load.addr, 0/*addr bias*/,
4932                                      NULL/* guard == "always True"*/ );
4933
4934      case Iex_CCall:
4935         return mkLazyN( mce, e->Iex.CCall.args,
4936                              e->Iex.CCall.retty,
4937                              e->Iex.CCall.cee );
4938
4939      case Iex_ITE:
4940         return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
4941                                     e->Iex.ITE.iffalse);
4942
4943      default:
4944         VG_(printf)("\n");
4945         ppIRExpr(e);
4946         VG_(printf)("\n");
4947         VG_(tool_panic)("memcheck: expr2vbits");
4948   }
4949}
4950
4951/*------------------------------------------------------------*/
4952/*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
4953/*------------------------------------------------------------*/
4954
4955/* Widen a value to the host word size. */
4956
4957static
4958IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
4959{
4960   IRType ty, tyH;
4961
4962   /* vatom is vbits-value and as such can only have a shadow type. */
4963   tl_assert(isShadowAtom(mce,vatom));
4964
4965   ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
4966   tyH = mce->hWordTy;
4967
4968   if (tyH == Ity_I32) {
4969      switch (ty) {
4970         case Ity_I32:
4971            return vatom;
4972         case Ity_I16:
4973            return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
4974         case Ity_I8:
4975            return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
4976         default:
4977            goto unhandled;
4978      }
4979   } else
4980   if (tyH == Ity_I64) {
4981      switch (ty) {
4982         case Ity_I32:
4983            return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
4984         case Ity_I16:
4985            return assignNew('V', mce, tyH, unop(Iop_32Uto64,
4986                   assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
4987         case Ity_I8:
4988            return assignNew('V', mce, tyH, unop(Iop_32Uto64,
4989                   assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
4990         default:
4991            goto unhandled;
4992      }
4993   } else {
4994      goto unhandled;
4995   }
4996  unhandled:
4997   VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
4998   VG_(tool_panic)("zwidenToHostWord");
4999}
5000
5001
5002/* Generate a shadow store.  |addr| is always the original address
5003   atom.  You can pass in either originals or V-bits for the data
5004   atom, but obviously not both.  This function generates a check for
5005   the definedness and (indirectly) the validity of |addr|, but only
5006   when |guard| evaluates to True at run time (or is NULL).
5007
5008   |guard| :: Ity_I1 controls whether the store really happens; NULL
5009   means it unconditionally does.  Note that |guard| itself is not
5010   checked for definedness; the caller of this function must do that
5011   if necessary.
5012*/
5013static
5014void do_shadow_Store ( MCEnv* mce,
5015                       IREndness end,
5016                       IRAtom* addr, UInt bias,
5017                       IRAtom* data, IRAtom* vdata,
5018                       IRAtom* guard )
5019{
5020   IROp     mkAdd;
5021   IRType   ty, tyAddr;
5022   void*    helper = NULL;
5023   const HChar* hname = NULL;
5024   IRConst* c;
5025
5026   tyAddr = mce->hWordTy;
5027   mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5028   tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5029   tl_assert( end == Iend_LE || end == Iend_BE );
5030
5031   if (data) {
5032      tl_assert(!vdata);
5033      tl_assert(isOriginalAtom(mce, data));
5034      tl_assert(bias == 0);
5035      vdata = expr2vbits( mce, data );
5036   } else {
5037      tl_assert(vdata);
5038   }
5039
5040   tl_assert(isOriginalAtom(mce,addr));
5041   tl_assert(isShadowAtom(mce,vdata));
5042
5043   if (guard) {
5044      tl_assert(isOriginalAtom(mce, guard));
5045      tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
5046   }
5047
5048   ty = typeOfIRExpr(mce->sb->tyenv, vdata);
5049
5050   // If we're not doing undefined value checking, pretend that this value
5051   // is "all valid".  That lets Vex's optimiser remove some of the V bit
5052   // shadow computation ops that precede it.
5053   if (MC_(clo_mc_level) == 1) {
5054      switch (ty) {
5055         case Ity_V256: // V256 weirdness -- used four times
5056                        c = IRConst_V256(V_BITS32_DEFINED); break;
5057         case Ity_V128: // V128 weirdness -- used twice
5058                        c = IRConst_V128(V_BITS16_DEFINED); break;
5059         case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
5060         case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
5061         case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
5062         case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
5063         default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5064      }
5065      vdata = IRExpr_Const( c );
5066   }
5067
5068   /* First, emit a definedness test for the address.  This also sets
5069      the address (shadow) to 'defined' following the test.  Both of
5070      those actions are gated on |guard|. */
5071   complainIfUndefined( mce, addr, guard );
5072
5073   /* Now decide which helper function to call to write the data V
5074      bits into shadow memory. */
5075   if (end == Iend_LE) {
5076      switch (ty) {
5077         case Ity_V256: /* we'll use the helper four times */
5078         case Ity_V128: /* we'll use the helper twice */
5079         case Ity_I64: helper = &MC_(helperc_STOREV64le);
5080                       hname = "MC_(helperc_STOREV64le)";
5081                       break;
5082         case Ity_I32: helper = &MC_(helperc_STOREV32le);
5083                       hname = "MC_(helperc_STOREV32le)";
5084                       break;
5085         case Ity_I16: helper = &MC_(helperc_STOREV16le);
5086                       hname = "MC_(helperc_STOREV16le)";
5087                       break;
5088         case Ity_I8:  helper = &MC_(helperc_STOREV8);
5089                       hname = "MC_(helperc_STOREV8)";
5090                       break;
5091         default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5092      }
5093   } else {
5094      switch (ty) {
5095         case Ity_V128: /* we'll use the helper twice */
5096         case Ity_I64: helper = &MC_(helperc_STOREV64be);
5097                       hname = "MC_(helperc_STOREV64be)";
5098                       break;
5099         case Ity_I32: helper = &MC_(helperc_STOREV32be);
5100                       hname = "MC_(helperc_STOREV32be)";
5101                       break;
5102         case Ity_I16: helper = &MC_(helperc_STOREV16be);
5103                       hname = "MC_(helperc_STOREV16be)";
5104                       break;
5105         case Ity_I8:  helper = &MC_(helperc_STOREV8);
5106                       hname = "MC_(helperc_STOREV8)";
5107                       break;
5108         /* Note, no V256 case here, because no big-endian target that
5109            we support, has 256 vectors. */
5110         default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
5111      }
5112   }
5113
5114   if (UNLIKELY(ty == Ity_V256)) {
5115
5116      /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
5117         Q3 being the most significant lane. */
5118      /* These are the offsets of the Qs in memory. */
5119      Int     offQ0, offQ1, offQ2, offQ3;
5120
5121      /* Various bits for constructing the 4 lane helper calls */
5122      IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
5123      IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
5124      IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
5125      IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
5126
5127      if (end == Iend_LE) {
5128         offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
5129      } else {
5130         offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
5131      }
5132
5133      eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
5134      addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
5135      vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
5136      diQ0    = unsafeIRDirty_0_N(
5137                   1/*regparms*/,
5138                   hname, VG_(fnptr_to_fnentry)( helper ),
5139                   mkIRExprVec_2( addrQ0, vdataQ0 )
5140                );
5141
5142      eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
5143      addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
5144      vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
5145      diQ1    = unsafeIRDirty_0_N(
5146                   1/*regparms*/,
5147                   hname, VG_(fnptr_to_fnentry)( helper ),
5148                   mkIRExprVec_2( addrQ1, vdataQ1 )
5149                );
5150
5151      eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
5152      addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
5153      vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
5154      diQ2    = unsafeIRDirty_0_N(
5155                   1/*regparms*/,
5156                   hname, VG_(fnptr_to_fnentry)( helper ),
5157                   mkIRExprVec_2( addrQ2, vdataQ2 )
5158                );
5159
5160      eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
5161      addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
5162      vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
5163      diQ3    = unsafeIRDirty_0_N(
5164                   1/*regparms*/,
5165                   hname, VG_(fnptr_to_fnentry)( helper ),
5166                   mkIRExprVec_2( addrQ3, vdataQ3 )
5167                );
5168
5169      if (guard)
5170         diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
5171
5172      setHelperAnns( mce, diQ0 );
5173      setHelperAnns( mce, diQ1 );
5174      setHelperAnns( mce, diQ2 );
5175      setHelperAnns( mce, diQ3 );
5176      stmt( 'V', mce, IRStmt_Dirty(diQ0) );
5177      stmt( 'V', mce, IRStmt_Dirty(diQ1) );
5178      stmt( 'V', mce, IRStmt_Dirty(diQ2) );
5179      stmt( 'V', mce, IRStmt_Dirty(diQ3) );
5180
5181   }
5182   else if (UNLIKELY(ty == Ity_V128)) {
5183
5184      /* V128-bit case */
5185      /* See comment in next clause re 64-bit regparms */
5186      /* also, need to be careful about endianness */
5187
5188      Int     offLo64, offHi64;
5189      IRDirty *diLo64, *diHi64;
5190      IRAtom  *addrLo64, *addrHi64;
5191      IRAtom  *vdataLo64, *vdataHi64;
5192      IRAtom  *eBiasLo64, *eBiasHi64;
5193
5194      if (end == Iend_LE) {
5195         offLo64 = 0;
5196         offHi64 = 8;
5197      } else {
5198         offLo64 = 8;
5199         offHi64 = 0;
5200      }
5201
5202      eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
5203      addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
5204      vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
5205      diLo64    = unsafeIRDirty_0_N(
5206                     1/*regparms*/,
5207                     hname, VG_(fnptr_to_fnentry)( helper ),
5208                     mkIRExprVec_2( addrLo64, vdataLo64 )
5209                  );
5210      eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
5211      addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
5212      vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
5213      diHi64    = unsafeIRDirty_0_N(
5214                     1/*regparms*/,
5215                     hname, VG_(fnptr_to_fnentry)( helper ),
5216                     mkIRExprVec_2( addrHi64, vdataHi64 )
5217                  );
5218      if (guard) diLo64->guard = guard;
5219      if (guard) diHi64->guard = guard;
5220      setHelperAnns( mce, diLo64 );
5221      setHelperAnns( mce, diHi64 );
5222      stmt( 'V', mce, IRStmt_Dirty(diLo64) );
5223      stmt( 'V', mce, IRStmt_Dirty(diHi64) );
5224
5225   } else {
5226
5227      IRDirty *di;
5228      IRAtom  *addrAct;
5229
5230      /* 8/16/32/64-bit cases */
5231      /* Generate the actual address into addrAct. */
5232      if (bias == 0) {
5233         addrAct = addr;
5234      } else {
5235         IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5236         addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
5237      }
5238
5239      if (ty == Ity_I64) {
5240         /* We can't do this with regparm 2 on 32-bit platforms, since
5241            the back ends aren't clever enough to handle 64-bit
5242            regparm args.  Therefore be different. */
5243         di = unsafeIRDirty_0_N(
5244                 1/*regparms*/,
5245                 hname, VG_(fnptr_to_fnentry)( helper ),
5246                 mkIRExprVec_2( addrAct, vdata )
5247              );
5248      } else {
5249         di = unsafeIRDirty_0_N(
5250                 2/*regparms*/,
5251                 hname, VG_(fnptr_to_fnentry)( helper ),
5252                 mkIRExprVec_2( addrAct,
5253                                zwidenToHostWord( mce, vdata ))
5254              );
5255      }
5256      if (guard) di->guard = guard;
5257      setHelperAnns( mce, di );
5258      stmt( 'V', mce, IRStmt_Dirty(di) );
5259   }
5260
5261}
5262
5263
5264/* Do lazy pessimistic propagation through a dirty helper call, by
5265   looking at the annotations on it.  This is the most complex part of
5266   Memcheck. */
5267
5268static IRType szToITy ( Int n )
5269{
5270   switch (n) {
5271      case 1: return Ity_I8;
5272      case 2: return Ity_I16;
5273      case 4: return Ity_I32;
5274      case 8: return Ity_I64;
5275      default: VG_(tool_panic)("szToITy(memcheck)");
5276   }
5277}
5278
5279static
5280void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
5281{
5282   Int       i, k, n, toDo, gSz, gOff;
5283   IRAtom    *src, *here, *curr;
5284   IRType    tySrc, tyDst;
5285   IRTemp    dst;
5286   IREndness end;
5287
5288   /* What's the native endianness?  We need to know this. */
5289#  if defined(VG_BIGENDIAN)
5290   end = Iend_BE;
5291#  elif defined(VG_LITTLEENDIAN)
5292   end = Iend_LE;
5293#  else
5294#    error "Unknown endianness"
5295#  endif
5296
5297   /* First check the guard. */
5298   complainIfUndefined(mce, d->guard, NULL);
5299
5300   /* Now round up all inputs and PCast over them. */
5301   curr = definedOfType(Ity_I32);
5302
5303   /* Inputs: unmasked args
5304      Note: arguments are evaluated REGARDLESS of the guard expression */
5305   for (i = 0; d->args[i]; i++) {
5306      IRAtom* arg = d->args[i];
5307      if ( (d->cee->mcx_mask & (1<<i))
5308           || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
5309         /* ignore this arg */
5310      } else {
5311         here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg) );
5312         curr = mkUifU32(mce, here, curr);
5313      }
5314   }
5315
5316   /* Inputs: guest state that we read. */
5317   for (i = 0; i < d->nFxState; i++) {
5318      tl_assert(d->fxState[i].fx != Ifx_None);
5319      if (d->fxState[i].fx == Ifx_Write)
5320         continue;
5321
5322      /* Enumerate the described state segments */
5323      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5324         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5325         gSz  = d->fxState[i].size;
5326
5327         /* Ignore any sections marked as 'always defined'. */
5328         if (isAlwaysDefd(mce, gOff, gSz)) {
5329            if (0)
5330            VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
5331                        gOff, gSz);
5332            continue;
5333         }
5334
5335         /* This state element is read or modified.  So we need to
5336            consider it.  If larger than 8 bytes, deal with it in
5337            8-byte chunks. */
5338         while (True) {
5339            tl_assert(gSz >= 0);
5340            if (gSz == 0) break;
5341            n = gSz <= 8 ? gSz : 8;
5342            /* update 'curr' with UifU of the state slice
5343               gOff .. gOff+n-1 */
5344            tySrc = szToITy( n );
5345
5346            /* Observe the guard expression. If it is false use an
5347               all-bits-defined bit pattern */
5348            IRAtom *cond, *iffalse, *iftrue;
5349
5350            cond    = assignNew('V', mce, Ity_I1, d->guard);
5351            iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
5352            iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
5353            src     = assignNew('V', mce, tySrc,
5354                                IRExpr_ITE(cond, iftrue, iffalse));
5355
5356            here = mkPCastTo( mce, Ity_I32, src );
5357            curr = mkUifU32(mce, here, curr);
5358            gSz -= n;
5359            gOff += n;
5360         }
5361      }
5362   }
5363
5364   /* Inputs: memory.  First set up some info needed regardless of
5365      whether we're doing reads or writes. */
5366
5367   if (d->mFx != Ifx_None) {
5368      /* Because we may do multiple shadow loads/stores from the same
5369         base address, it's best to do a single test of its
5370         definedness right now.  Post-instrumentation optimisation
5371         should remove all but this test. */
5372      IRType tyAddr;
5373      tl_assert(d->mAddr);
5374      complainIfUndefined(mce, d->mAddr, d->guard);
5375
5376      tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
5377      tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
5378      tl_assert(tyAddr == mce->hWordTy); /* not really right */
5379   }
5380
5381   /* Deal with memory inputs (reads or modifies) */
5382   if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
5383      toDo   = d->mSize;
5384      /* chew off 32-bit chunks.  We don't care about the endianness
5385         since it's all going to be condensed down to a single bit,
5386         but nevertheless choose an endianness which is hopefully
5387         native to the platform. */
5388      while (toDo >= 4) {
5389         here = mkPCastTo(
5390                   mce, Ity_I32,
5391                   expr2vbits_Load_guarded_Simple(
5392                      mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
5393                );
5394         curr = mkUifU32(mce, here, curr);
5395         toDo -= 4;
5396      }
5397      /* chew off 16-bit chunks */
5398      while (toDo >= 2) {
5399         here = mkPCastTo(
5400                   mce, Ity_I32,
5401                   expr2vbits_Load_guarded_Simple(
5402                      mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
5403                );
5404         curr = mkUifU32(mce, here, curr);
5405         toDo -= 2;
5406      }
5407      /* chew off the remaining 8-bit chunk, if any */
5408      if (toDo == 1) {
5409         here = mkPCastTo(
5410                   mce, Ity_I32,
5411                   expr2vbits_Load_guarded_Simple(
5412                      mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
5413                );
5414         curr = mkUifU32(mce, here, curr);
5415         toDo -= 1;
5416      }
5417      tl_assert(toDo == 0);
5418   }
5419
5420   /* Whew!  So curr is a 32-bit V-value summarising pessimistically
5421      all the inputs to the helper.  Now we need to re-distribute the
5422      results to all destinations. */
5423
5424   /* Outputs: the destination temporary, if there is one. */
5425   if (d->tmp != IRTemp_INVALID) {
5426      dst   = findShadowTmpV(mce, d->tmp);
5427      tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
5428      assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
5429   }
5430
5431   /* Outputs: guest state that we write or modify. */
5432   for (i = 0; i < d->nFxState; i++) {
5433      tl_assert(d->fxState[i].fx != Ifx_None);
5434      if (d->fxState[i].fx == Ifx_Read)
5435         continue;
5436
5437      /* Enumerate the described state segments */
5438      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5439         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5440         gSz  = d->fxState[i].size;
5441
5442         /* Ignore any sections marked as 'always defined'. */
5443         if (isAlwaysDefd(mce, gOff, gSz))
5444            continue;
5445
5446         /* This state element is written or modified.  So we need to
5447            consider it.  If larger than 8 bytes, deal with it in
5448            8-byte chunks. */
5449         while (True) {
5450            tl_assert(gSz >= 0);
5451            if (gSz == 0) break;
5452            n = gSz <= 8 ? gSz : 8;
5453            /* Write suitably-casted 'curr' to the state slice
5454               gOff .. gOff+n-1 */
5455            tyDst = szToITy( n );
5456            do_shadow_PUT( mce, gOff,
5457                                NULL, /* original atom */
5458                                mkPCastTo( mce, tyDst, curr ), d->guard );
5459            gSz -= n;
5460            gOff += n;
5461         }
5462      }
5463   }
5464
5465   /* Outputs: memory that we write or modify.  Same comments about
5466      endianness as above apply. */
5467   if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
5468      toDo   = d->mSize;
5469      /* chew off 32-bit chunks */
5470      while (toDo >= 4) {
5471         do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5472                          NULL, /* original data */
5473                          mkPCastTo( mce, Ity_I32, curr ),
5474                          d->guard );
5475         toDo -= 4;
5476      }
5477      /* chew off 16-bit chunks */
5478      while (toDo >= 2) {
5479         do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5480                          NULL, /* original data */
5481                          mkPCastTo( mce, Ity_I16, curr ),
5482                          d->guard );
5483         toDo -= 2;
5484      }
5485      /* chew off the remaining 8-bit chunk, if any */
5486      if (toDo == 1) {
5487         do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5488                          NULL, /* original data */
5489                          mkPCastTo( mce, Ity_I8, curr ),
5490                          d->guard );
5491         toDo -= 1;
5492      }
5493      tl_assert(toDo == 0);
5494   }
5495
5496}
5497
5498
5499/* We have an ABI hint telling us that [base .. base+len-1] is to
5500   become undefined ("writable").  Generate code to call a helper to
5501   notify the A/V bit machinery of this fact.
5502
5503   We call
5504   void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
5505                                                    Addr nia );
5506*/
5507static
5508void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
5509{
5510   IRDirty* di;
5511   /* Minor optimisation: if not doing origin tracking, ignore the
5512      supplied nia and pass zero instead.  This is on the basis that
5513      MC_(helperc_MAKE_STACK_UNINIT) will ignore it anyway, and we can
5514      almost always generate a shorter instruction to put zero into a
5515      register than any other value. */
5516   if (MC_(clo_mc_level) < 3)
5517      nia = mkIRExpr_HWord(0);
5518
5519   di = unsafeIRDirty_0_N(
5520           0/*regparms*/,
5521           "MC_(helperc_MAKE_STACK_UNINIT)",
5522           VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT) ),
5523           mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
5524        );
5525   stmt( 'V', mce, IRStmt_Dirty(di) );
5526}
5527
5528
5529/* ------ Dealing with IRCAS (big and complex) ------ */
5530
5531/* FWDS */
5532static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
5533                             IRAtom* baseaddr, Int offset );
5534static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
5535static void    gen_store_b ( MCEnv* mce, Int szB,
5536                             IRAtom* baseaddr, Int offset, IRAtom* dataB,
5537                             IRAtom* guard );
5538
5539static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
5540static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
5541
5542
5543/* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
5544   IRExpr.Consts, else this asserts.  If they are both Consts, it
5545   doesn't do anything.  So that just leaves the RdTmp case.
5546
5547   In which case: this assigns the shadow value SHADOW to the IR
5548   shadow temporary associated with ORIG.  That is, ORIG, being an
5549   original temporary, will have a shadow temporary associated with
5550   it.  However, in the case envisaged here, there will so far have
5551   been no IR emitted to actually write a shadow value into that
5552   temporary.  What this routine does is to (emit IR to) copy the
5553   value in SHADOW into said temporary, so that after this call,
5554   IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
5555   value in SHADOW.
5556
5557   Point is to allow callers to compute "by hand" a shadow value for
5558   ORIG, and force it to be associated with ORIG.
5559
5560   How do we know that that shadow associated with ORIG has not so far
5561   been assigned to?  Well, we don't per se know that, but supposing
5562   it had.  Then this routine would create a second assignment to it,
5563   and later the IR sanity checker would barf.  But that never
5564   happens.  QED.
5565*/
5566static void bind_shadow_tmp_to_orig ( UChar how,
5567                                      MCEnv* mce,
5568                                      IRAtom* orig, IRAtom* shadow )
5569{
5570   tl_assert(isOriginalAtom(mce, orig));
5571   tl_assert(isShadowAtom(mce, shadow));
5572   switch (orig->tag) {
5573      case Iex_Const:
5574         tl_assert(shadow->tag == Iex_Const);
5575         break;
5576      case Iex_RdTmp:
5577         tl_assert(shadow->tag == Iex_RdTmp);
5578         if (how == 'V') {
5579            assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
5580                   shadow);
5581         } else {
5582            tl_assert(how == 'B');
5583            assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
5584                   shadow);
5585         }
5586         break;
5587      default:
5588         tl_assert(0);
5589   }
5590}
5591
5592
5593static
5594void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
5595{
5596   /* Scheme is (both single- and double- cases):
5597
5598      1. fetch data#,dataB (the proposed new value)
5599
5600      2. fetch expd#,expdB (what we expect to see at the address)
5601
5602      3. check definedness of address
5603
5604      4. load old#,oldB from shadow memory; this also checks
5605         addressibility of the address
5606
5607      5. the CAS itself
5608
5609      6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
5610
5611      7. if "expected == old" (as computed by (6))
5612            store data#,dataB to shadow memory
5613
5614      Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
5615      'data' but 7 stores 'data#'.  Hence it is possible for the
5616      shadow data to be incorrectly checked and/or updated:
5617
5618      * 7 is at least gated correctly, since the 'expected == old'
5619        condition is derived from outputs of 5.  However, the shadow
5620        write could happen too late: imagine after 5 we are
5621        descheduled, a different thread runs, writes a different
5622        (shadow) value at the address, and then we resume, hence
5623        overwriting the shadow value written by the other thread.
5624
5625      Because the original memory access is atomic, there's no way to
5626      make both the original and shadow accesses into a single atomic
5627      thing, hence this is unavoidable.
5628
5629      At least as Valgrind stands, I don't think it's a problem, since
5630      we're single threaded *and* we guarantee that there are no
5631      context switches during the execution of any specific superblock
5632      -- context switches can only happen at superblock boundaries.
5633
5634      If Valgrind ever becomes MT in the future, then it might be more
5635      of a problem.  A possible kludge would be to artificially
5636      associate with the location, a lock, which we must acquire and
5637      release around the transaction as a whole.  Hmm, that probably
5638      would't work properly since it only guards us against other
5639      threads doing CASs on the same location, not against other
5640      threads doing normal reads and writes.
5641
5642      ------------------------------------------------------------
5643
5644      COMMENT_ON_CasCmpEQ:
5645
5646      Note two things.  Firstly, in the sequence above, we compute
5647      "expected == old", but we don't check definedness of it.  Why
5648      not?  Also, the x86 and amd64 front ends use
5649      Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
5650      determination (expected == old ?) for themselves, and we also
5651      don't check definedness for those primops; we just say that the
5652      result is defined.  Why?  Details follow.
5653
5654      x86/amd64 contains various forms of locked insns:
5655      * lock prefix before all basic arithmetic insn;
5656        eg lock xorl %reg1,(%reg2)
5657      * atomic exchange reg-mem
5658      * compare-and-swaps
5659
5660      Rather than attempt to represent them all, which would be a
5661      royal PITA, I used a result from Maurice Herlihy
5662      (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
5663      demonstrates that compare-and-swap is a primitive more general
5664      than the other two, and so can be used to represent all of them.
5665      So the translation scheme for (eg) lock incl (%reg) is as
5666      follows:
5667
5668        again:
5669         old = * %reg
5670         new = old + 1
5671         atomically { if (* %reg == old) { * %reg = new } else { goto again } }
5672
5673      The "atomically" is the CAS bit.  The scheme is always the same:
5674      get old value from memory, compute new value, atomically stuff
5675      new value back in memory iff the old value has not changed (iow,
5676      no other thread modified it in the meantime).  If it has changed
5677      then we've been out-raced and we have to start over.
5678
5679      Now that's all very neat, but it has the bad side effect of
5680      introducing an explicit equality test into the translation.
5681      Consider the behaviour of said code on a memory location which
5682      is uninitialised.  We will wind up doing a comparison on
5683      uninitialised data, and mc duly complains.
5684
5685      What's difficult about this is, the common case is that the
5686      location is uncontended, and so we're usually comparing the same
5687      value (* %reg) with itself.  So we shouldn't complain even if it
5688      is undefined.  But mc doesn't know that.
5689
5690      My solution is to mark the == in the IR specially, so as to tell
5691      mc that it almost certainly compares a value with itself, and we
5692      should just regard the result as always defined.  Rather than
5693      add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
5694      Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
5695
5696      So there's always the question of, can this give a false
5697      negative?  eg, imagine that initially, * %reg is defined; and we
5698      read that; but then in the gap between the read and the CAS, a
5699      different thread writes an undefined (and different) value at
5700      the location.  Then the CAS in this thread will fail and we will
5701      go back to "again:", but without knowing that the trip back
5702      there was based on an undefined comparison.  No matter; at least
5703      the other thread won the race and the location is correctly
5704      marked as undefined.  What if it wrote an uninitialised version
5705      of the same value that was there originally, though?
5706
5707      etc etc.  Seems like there's a small corner case in which we
5708      might lose the fact that something's defined -- we're out-raced
5709      in between the "old = * reg" and the "atomically {", _and_ the
5710      other thread is writing in an undefined version of what's
5711      already there.  Well, that seems pretty unlikely.
5712
5713      ---
5714
5715      If we ever need to reinstate it .. code which generates a
5716      definedness test for "expected == old" was removed at r10432 of
5717      this file.
5718   */
5719   if (cas->oldHi == IRTemp_INVALID) {
5720      do_shadow_CAS_single( mce, cas );
5721   } else {
5722      do_shadow_CAS_double( mce, cas );
5723   }
5724}
5725
5726
5727static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
5728{
5729   IRAtom *vdataLo = NULL, *bdataLo = NULL;
5730   IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
5731   IRAtom *voldLo  = NULL, *boldLo  = NULL;
5732   IRAtom *expd_eq_old = NULL;
5733   IROp   opCasCmpEQ;
5734   Int    elemSzB;
5735   IRType elemTy;
5736   Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
5737
5738   /* single CAS */
5739   tl_assert(cas->oldHi == IRTemp_INVALID);
5740   tl_assert(cas->expdHi == NULL);
5741   tl_assert(cas->dataHi == NULL);
5742
5743   elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
5744   switch (elemTy) {
5745      case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
5746      case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
5747      case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
5748      case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
5749      default: tl_assert(0); /* IR defn disallows any other types */
5750   }
5751
5752   /* 1. fetch data# (the proposed new value) */
5753   tl_assert(isOriginalAtom(mce, cas->dataLo));
5754   vdataLo
5755      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
5756   tl_assert(isShadowAtom(mce, vdataLo));
5757   if (otrak) {
5758      bdataLo
5759         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
5760      tl_assert(isShadowAtom(mce, bdataLo));
5761   }
5762
5763   /* 2. fetch expected# (what we expect to see at the address) */
5764   tl_assert(isOriginalAtom(mce, cas->expdLo));
5765   vexpdLo
5766      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
5767   tl_assert(isShadowAtom(mce, vexpdLo));
5768   if (otrak) {
5769      bexpdLo
5770         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
5771      tl_assert(isShadowAtom(mce, bexpdLo));
5772   }
5773
5774   /* 3. check definedness of address */
5775   /* 4. fetch old# from shadow memory; this also checks
5776         addressibility of the address */
5777   voldLo
5778      = assignNew(
5779           'V', mce, elemTy,
5780           expr2vbits_Load(
5781              mce,
5782              cas->end, elemTy, cas->addr, 0/*Addr bias*/,
5783              NULL/*always happens*/
5784        ));
5785   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
5786   if (otrak) {
5787      boldLo
5788         = assignNew('B', mce, Ity_I32,
5789                     gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
5790      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
5791   }
5792
5793   /* 5. the CAS itself */
5794   stmt( 'C', mce, IRStmt_CAS(cas) );
5795
5796   /* 6. compute "expected == old" */
5797   /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
5798   /* Note that 'C' is kinda faking it; it is indeed a non-shadow
5799      tree, but it's not copied from the input block. */
5800   expd_eq_old
5801      = assignNew('C', mce, Ity_I1,
5802                  binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
5803
5804   /* 7. if "expected == old"
5805            store data# to shadow memory */
5806   do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
5807                    NULL/*data*/, vdataLo/*vdata*/,
5808                    expd_eq_old/*guard for store*/ );
5809   if (otrak) {
5810      gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
5811                   bdataLo/*bdata*/,
5812                   expd_eq_old/*guard for store*/ );
5813   }
5814}
5815
5816
5817static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
5818{
5819   IRAtom *vdataHi = NULL, *bdataHi = NULL;
5820   IRAtom *vdataLo = NULL, *bdataLo = NULL;
5821   IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
5822   IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
5823   IRAtom *voldHi  = NULL, *boldHi  = NULL;
5824   IRAtom *voldLo  = NULL, *boldLo  = NULL;
5825   IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
5826   IRAtom *expd_eq_old = NULL, *zero = NULL;
5827   IROp   opCasCmpEQ, opOr, opXor;
5828   Int    elemSzB, memOffsLo, memOffsHi;
5829   IRType elemTy;
5830   Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
5831
5832   /* double CAS */
5833   tl_assert(cas->oldHi != IRTemp_INVALID);
5834   tl_assert(cas->expdHi != NULL);
5835   tl_assert(cas->dataHi != NULL);
5836
5837   elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
5838   switch (elemTy) {
5839      case Ity_I8:
5840         opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
5841         elemSzB = 1; zero = mkU8(0);
5842         break;
5843      case Ity_I16:
5844         opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
5845         elemSzB = 2; zero = mkU16(0);
5846         break;
5847      case Ity_I32:
5848         opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
5849         elemSzB = 4; zero = mkU32(0);
5850         break;
5851      case Ity_I64:
5852         opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
5853         elemSzB = 8; zero = mkU64(0);
5854         break;
5855      default:
5856         tl_assert(0); /* IR defn disallows any other types */
5857   }
5858
5859   /* 1. fetch data# (the proposed new value) */
5860   tl_assert(isOriginalAtom(mce, cas->dataHi));
5861   tl_assert(isOriginalAtom(mce, cas->dataLo));
5862   vdataHi
5863      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
5864   vdataLo
5865      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
5866   tl_assert(isShadowAtom(mce, vdataHi));
5867   tl_assert(isShadowAtom(mce, vdataLo));
5868   if (otrak) {
5869      bdataHi
5870         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
5871      bdataLo
5872         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
5873      tl_assert(isShadowAtom(mce, bdataHi));
5874      tl_assert(isShadowAtom(mce, bdataLo));
5875   }
5876
5877   /* 2. fetch expected# (what we expect to see at the address) */
5878   tl_assert(isOriginalAtom(mce, cas->expdHi));
5879   tl_assert(isOriginalAtom(mce, cas->expdLo));
5880   vexpdHi
5881      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
5882   vexpdLo
5883      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
5884   tl_assert(isShadowAtom(mce, vexpdHi));
5885   tl_assert(isShadowAtom(mce, vexpdLo));
5886   if (otrak) {
5887      bexpdHi
5888         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
5889      bexpdLo
5890         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
5891      tl_assert(isShadowAtom(mce, bexpdHi));
5892      tl_assert(isShadowAtom(mce, bexpdLo));
5893   }
5894
5895   /* 3. check definedness of address */
5896   /* 4. fetch old# from shadow memory; this also checks
5897         addressibility of the address */
5898   if (cas->end == Iend_LE) {
5899      memOffsLo = 0;
5900      memOffsHi = elemSzB;
5901   } else {
5902      tl_assert(cas->end == Iend_BE);
5903      memOffsLo = elemSzB;
5904      memOffsHi = 0;
5905   }
5906   voldHi
5907      = assignNew(
5908           'V', mce, elemTy,
5909           expr2vbits_Load(
5910              mce,
5911              cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
5912              NULL/*always happens*/
5913        ));
5914   voldLo
5915      = assignNew(
5916           'V', mce, elemTy,
5917           expr2vbits_Load(
5918              mce,
5919              cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
5920              NULL/*always happens*/
5921        ));
5922   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
5923   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
5924   if (otrak) {
5925      boldHi
5926         = assignNew('B', mce, Ity_I32,
5927                     gen_load_b(mce, elemSzB, cas->addr,
5928                                memOffsHi/*addr bias*/));
5929      boldLo
5930         = assignNew('B', mce, Ity_I32,
5931                     gen_load_b(mce, elemSzB, cas->addr,
5932                                memOffsLo/*addr bias*/));
5933      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
5934      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
5935   }
5936
5937   /* 5. the CAS itself */
5938   stmt( 'C', mce, IRStmt_CAS(cas) );
5939
5940   /* 6. compute "expected == old" */
5941   /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
5942   /* Note that 'C' is kinda faking it; it is indeed a non-shadow
5943      tree, but it's not copied from the input block. */
5944   /*
5945      xHi = oldHi ^ expdHi;
5946      xLo = oldLo ^ expdLo;
5947      xHL = xHi | xLo;
5948      expd_eq_old = xHL == 0;
5949   */
5950   xHi = assignNew('C', mce, elemTy,
5951                   binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
5952   xLo = assignNew('C', mce, elemTy,
5953                   binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
5954   xHL = assignNew('C', mce, elemTy,
5955                   binop(opOr, xHi, xLo));
5956   expd_eq_old
5957      = assignNew('C', mce, Ity_I1,
5958                  binop(opCasCmpEQ, xHL, zero));
5959
5960   /* 7. if "expected == old"
5961            store data# to shadow memory */
5962   do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
5963                    NULL/*data*/, vdataHi/*vdata*/,
5964                    expd_eq_old/*guard for store*/ );
5965   do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
5966                    NULL/*data*/, vdataLo/*vdata*/,
5967                    expd_eq_old/*guard for store*/ );
5968   if (otrak) {
5969      gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
5970                   bdataHi/*bdata*/,
5971                   expd_eq_old/*guard for store*/ );
5972      gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
5973                   bdataLo/*bdata*/,
5974                   expd_eq_old/*guard for store*/ );
5975   }
5976}
5977
5978
5979/* ------ Dealing with LL/SC (not difficult) ------ */
5980
5981static void do_shadow_LLSC ( MCEnv*    mce,
5982                             IREndness stEnd,
5983                             IRTemp    stResult,
5984                             IRExpr*   stAddr,
5985                             IRExpr*   stStoredata )
5986{
5987   /* In short: treat a load-linked like a normal load followed by an
5988      assignment of the loaded (shadow) data to the result temporary.
5989      Treat a store-conditional like a normal store, and mark the
5990      result temporary as defined. */
5991   IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
5992   IRTemp resTmp = findShadowTmpV(mce, stResult);
5993
5994   tl_assert(isIRAtom(stAddr));
5995   if (stStoredata)
5996      tl_assert(isIRAtom(stStoredata));
5997
5998   if (stStoredata == NULL) {
5999      /* Load Linked */
6000      /* Just treat this as a normal load, followed by an assignment of
6001         the value to .result. */
6002      /* Stay sane */
6003      tl_assert(resTy == Ity_I64 || resTy == Ity_I32
6004                || resTy == Ity_I16 || resTy == Ity_I8);
6005      assign( 'V', mce, resTmp,
6006                   expr2vbits_Load(
6007                      mce, stEnd, resTy, stAddr, 0/*addr bias*/,
6008                      NULL/*always happens*/) );
6009   } else {
6010      /* Store Conditional */
6011      /* Stay sane */
6012      IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
6013                                   stStoredata);
6014      tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
6015                || dataTy == Ity_I16 || dataTy == Ity_I8);
6016      do_shadow_Store( mce, stEnd,
6017                            stAddr, 0/* addr bias */,
6018                            stStoredata,
6019                            NULL /* shadow data */,
6020                            NULL/*guard*/ );
6021      /* This is a store conditional, so it writes to .result a value
6022         indicating whether or not the store succeeded.  Just claim
6023         this value is always defined.  In the PowerPC interpretation
6024         of store-conditional, definedness of the success indication
6025         depends on whether the address of the store matches the
6026         reservation address.  But we can't tell that here (and
6027         anyway, we're not being PowerPC-specific).  At least we are
6028         guaranteed that the definedness of the store address, and its
6029         addressibility, will be checked as per normal.  So it seems
6030         pretty safe to just say that the success indication is always
6031         defined.
6032
6033         In schemeS, for origin tracking, we must correspondingly set
6034         a no-origin value for the origin shadow of .result.
6035      */
6036      tl_assert(resTy == Ity_I1);
6037      assign( 'V', mce, resTmp, definedOfType(resTy) );
6038   }
6039}
6040
6041
6042/* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6043
6044static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
6045{
6046   complainIfUndefined(mce, sg->guard, NULL);
6047   /* do_shadow_Store will generate code to check the definedness and
6048      validity of sg->addr, in the case where sg->guard evaluates to
6049      True at run-time. */
6050   do_shadow_Store( mce, sg->end,
6051                    sg->addr, 0/* addr bias */,
6052                    sg->data,
6053                    NULL /* shadow data */,
6054                    sg->guard );
6055}
6056
6057static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
6058{
6059   complainIfUndefined(mce, lg->guard, NULL);
6060   /* expr2vbits_Load_guarded_General will generate code to check the
6061      definedness and validity of lg->addr, in the case where
6062      lg->guard evaluates to True at run-time. */
6063
6064   /* Look at the LoadG's built-in conversion operation, to determine
6065      the source (actual loaded data) type, and the equivalent IROp.
6066      NOTE that implicitly we are taking a widening operation to be
6067      applied to original atoms and producing one that applies to V
6068      bits.  Since signed and unsigned widening are self-shadowing,
6069      this is a straight copy of the op (modulo swapping from the
6070      IRLoadGOp form to the IROp form).  Note also therefore that this
6071      implicitly duplicates the logic to do with said widening ops in
6072      expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
6073   IROp   vwiden   = Iop_INVALID;
6074   IRType loadedTy = Ity_INVALID;
6075   switch (lg->cvt) {
6076      case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
6077      case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
6078      case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
6079      case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
6080      case ILGop_8Uto32:  loadedTy = Ity_I8;  vwiden = Iop_8Uto32;  break;
6081      case ILGop_8Sto32:  loadedTy = Ity_I8;  vwiden = Iop_8Sto32;  break;
6082      default: VG_(tool_panic)("do_shadow_LoadG");
6083   }
6084
6085   IRAtom* vbits_alt
6086      = expr2vbits( mce, lg->alt );
6087   IRAtom* vbits_final
6088      = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
6089                                        lg->addr, 0/*addr bias*/,
6090                                        lg->guard, vwiden, vbits_alt );
6091   /* And finally, bind the V bits to the destination temporary. */
6092   assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
6093}
6094
6095
6096/*------------------------------------------------------------*/
6097/*--- Memcheck main                                        ---*/
6098/*------------------------------------------------------------*/
6099
6100static void schemeS ( MCEnv* mce, IRStmt* st );
6101
6102static Bool isBogusAtom ( IRAtom* at )
6103{
6104   ULong n = 0;
6105   IRConst* con;
6106   tl_assert(isIRAtom(at));
6107   if (at->tag == Iex_RdTmp)
6108      return False;
6109   tl_assert(at->tag == Iex_Const);
6110   con = at->Iex.Const.con;
6111   switch (con->tag) {
6112      case Ico_U1:   return False;
6113      case Ico_U8:   n = (ULong)con->Ico.U8; break;
6114      case Ico_U16:  n = (ULong)con->Ico.U16; break;
6115      case Ico_U32:  n = (ULong)con->Ico.U32; break;
6116      case Ico_U64:  n = (ULong)con->Ico.U64; break;
6117      case Ico_F32:  return False;
6118      case Ico_F64:  return False;
6119      case Ico_F32i: return False;
6120      case Ico_F64i: return False;
6121      case Ico_V128: return False;
6122      case Ico_V256: return False;
6123      default: ppIRExpr(at); tl_assert(0);
6124   }
6125   /* VG_(printf)("%llx\n", n); */
6126   return (/*32*/    n == 0xFEFEFEFFULL
6127           /*32*/ || n == 0x80808080ULL
6128           /*32*/ || n == 0x7F7F7F7FULL
6129           /*32*/ || n == 0x7EFEFEFFULL
6130           /*32*/ || n == 0x81010100ULL
6131           /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
6132           /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
6133           /*64*/ || n == 0x0000000000008080ULL
6134           /*64*/ || n == 0x8080808080808080ULL
6135           /*64*/ || n == 0x0101010101010101ULL
6136          );
6137}
6138
6139static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
6140{
6141   Int      i;
6142   IRExpr*  e;
6143   IRDirty* d;
6144   IRCAS*   cas;
6145   switch (st->tag) {
6146      case Ist_WrTmp:
6147         e = st->Ist.WrTmp.data;
6148         switch (e->tag) {
6149            case Iex_Get:
6150            case Iex_RdTmp:
6151               return False;
6152            case Iex_Const:
6153               return isBogusAtom(e);
6154            case Iex_Unop:
6155               return isBogusAtom(e->Iex.Unop.arg)
6156                      || e->Iex.Unop.op == Iop_GetMSBs8x16;
6157            case Iex_GetI:
6158               return isBogusAtom(e->Iex.GetI.ix);
6159            case Iex_Binop:
6160               return isBogusAtom(e->Iex.Binop.arg1)
6161                      || isBogusAtom(e->Iex.Binop.arg2);
6162            case Iex_Triop:
6163               return isBogusAtom(e->Iex.Triop.details->arg1)
6164                      || isBogusAtom(e->Iex.Triop.details->arg2)
6165                      || isBogusAtom(e->Iex.Triop.details->arg3);
6166            case Iex_Qop:
6167               return isBogusAtom(e->Iex.Qop.details->arg1)
6168                      || isBogusAtom(e->Iex.Qop.details->arg2)
6169                      || isBogusAtom(e->Iex.Qop.details->arg3)
6170                      || isBogusAtom(e->Iex.Qop.details->arg4);
6171            case Iex_ITE:
6172               return isBogusAtom(e->Iex.ITE.cond)
6173                      || isBogusAtom(e->Iex.ITE.iftrue)
6174                      || isBogusAtom(e->Iex.ITE.iffalse);
6175            case Iex_Load:
6176               return isBogusAtom(e->Iex.Load.addr);
6177            case Iex_CCall:
6178               for (i = 0; e->Iex.CCall.args[i]; i++)
6179                  if (isBogusAtom(e->Iex.CCall.args[i]))
6180                     return True;
6181               return False;
6182            default:
6183               goto unhandled;
6184         }
6185      case Ist_Dirty:
6186         d = st->Ist.Dirty.details;
6187         for (i = 0; d->args[i]; i++) {
6188            IRAtom* atom = d->args[i];
6189            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(atom))) {
6190               if (isBogusAtom(atom))
6191                  return True;
6192            }
6193         }
6194         if (isBogusAtom(d->guard))
6195            return True;
6196         if (d->mAddr && isBogusAtom(d->mAddr))
6197            return True;
6198         return False;
6199      case Ist_Put:
6200         return isBogusAtom(st->Ist.Put.data);
6201      case Ist_PutI:
6202         return isBogusAtom(st->Ist.PutI.details->ix)
6203                || isBogusAtom(st->Ist.PutI.details->data);
6204      case Ist_Store:
6205         return isBogusAtom(st->Ist.Store.addr)
6206                || isBogusAtom(st->Ist.Store.data);
6207      case Ist_StoreG: {
6208         IRStoreG* sg = st->Ist.StoreG.details;
6209         return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
6210                || isBogusAtom(sg->guard);
6211      }
6212      case Ist_LoadG: {
6213         IRLoadG* lg = st->Ist.LoadG.details;
6214         return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
6215                || isBogusAtom(lg->guard);
6216      }
6217      case Ist_Exit:
6218         return isBogusAtom(st->Ist.Exit.guard);
6219      case Ist_AbiHint:
6220         return isBogusAtom(st->Ist.AbiHint.base)
6221                || isBogusAtom(st->Ist.AbiHint.nia);
6222      case Ist_NoOp:
6223      case Ist_IMark:
6224      case Ist_MBE:
6225         return False;
6226      case Ist_CAS:
6227         cas = st->Ist.CAS.details;
6228         return isBogusAtom(cas->addr)
6229                || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
6230                || isBogusAtom(cas->expdLo)
6231                || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
6232                || isBogusAtom(cas->dataLo);
6233      case Ist_LLSC:
6234         return isBogusAtom(st->Ist.LLSC.addr)
6235                || (st->Ist.LLSC.storedata
6236                       ? isBogusAtom(st->Ist.LLSC.storedata)
6237                       : False);
6238      default:
6239      unhandled:
6240         ppIRStmt(st);
6241         VG_(tool_panic)("hasBogusLiterals");
6242   }
6243}
6244
6245
6246IRSB* MC_(instrument) ( VgCallbackClosure* closure,
6247                        IRSB* sb_in,
6248                        const VexGuestLayout* layout,
6249                        const VexGuestExtents* vge,
6250                        const VexArchInfo* archinfo_host,
6251                        IRType gWordTy, IRType hWordTy )
6252{
6253   Bool    verboze = 0||False;
6254   Bool    bogus;
6255   Int     i, j, first_stmt;
6256   IRStmt* st;
6257   MCEnv   mce;
6258   IRSB*   sb_out;
6259
6260   if (gWordTy != hWordTy) {
6261      /* We don't currently support this case. */
6262      VG_(tool_panic)("host/guest word size mismatch");
6263   }
6264
6265   /* Check we're not completely nuts */
6266   tl_assert(sizeof(UWord)  == sizeof(void*));
6267   tl_assert(sizeof(Word)   == sizeof(void*));
6268   tl_assert(sizeof(Addr)   == sizeof(void*));
6269   tl_assert(sizeof(ULong)  == 8);
6270   tl_assert(sizeof(Long)   == 8);
6271   tl_assert(sizeof(UInt)   == 4);
6272   tl_assert(sizeof(Int)    == 4);
6273
6274   tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
6275
6276   /* Set up SB */
6277   sb_out = deepCopyIRSBExceptStmts(sb_in);
6278
6279   /* Set up the running environment.  Both .sb and .tmpMap are
6280      modified as we go along.  Note that tmps are added to both
6281      .sb->tyenv and .tmpMap together, so the valid index-set for
6282      those two arrays should always be identical. */
6283   VG_(memset)(&mce, 0, sizeof(mce));
6284   mce.sb             = sb_out;
6285   mce.trace          = verboze;
6286   mce.layout         = layout;
6287   mce.hWordTy        = hWordTy;
6288   mce.bogusLiterals  = False;
6289
6290   /* Do expensive interpretation for Iop_Add32 and Iop_Add64 on
6291      Darwin.  10.7 is mostly built with LLVM, which uses these for
6292      bitfield inserts, and we get a lot of false errors if the cheap
6293      interpretation is used, alas.  Could solve this much better if
6294      we knew which of such adds came from x86/amd64 LEA instructions,
6295      since these are the only ones really needing the expensive
6296      interpretation, but that would require some way to tag them in
6297      the _toIR.c front ends, which is a lot of faffing around.  So
6298      for now just use the slow and blunt-instrument solution. */
6299   mce.useLLVMworkarounds = False;
6300#  if defined(VGO_darwin)
6301   mce.useLLVMworkarounds = True;
6302#  endif
6303
6304   mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
6305                            sizeof(TempMapEnt));
6306   VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
6307   for (i = 0; i < sb_in->tyenv->types_used; i++) {
6308      TempMapEnt ent;
6309      ent.kind    = Orig;
6310      ent.shadowV = IRTemp_INVALID;
6311      ent.shadowB = IRTemp_INVALID;
6312      VG_(addToXA)( mce.tmpMap, &ent );
6313   }
6314   tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
6315
6316   /* Make a preliminary inspection of the statements, to see if there
6317      are any dodgy-looking literals.  If there are, we generate
6318      extra-detailed (hence extra-expensive) instrumentation in
6319      places.  Scan the whole bb even if dodgyness is found earlier,
6320      so that the flatness assertion is applied to all stmts. */
6321
6322   bogus = False;
6323
6324   for (i = 0; i < sb_in->stmts_used; i++) {
6325
6326      st = sb_in->stmts[i];
6327      tl_assert(st);
6328      tl_assert(isFlatIRStmt(st));
6329
6330      if (!bogus) {
6331         bogus = checkForBogusLiterals(st);
6332         if (0 && bogus) {
6333            VG_(printf)("bogus: ");
6334            ppIRStmt(st);
6335            VG_(printf)("\n");
6336         }
6337      }
6338
6339   }
6340
6341   mce.bogusLiterals = bogus;
6342
6343   /* Copy verbatim any IR preamble preceding the first IMark */
6344
6345   tl_assert(mce.sb == sb_out);
6346   tl_assert(mce.sb != sb_in);
6347
6348   i = 0;
6349   while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
6350
6351      st = sb_in->stmts[i];
6352      tl_assert(st);
6353      tl_assert(isFlatIRStmt(st));
6354
6355      stmt( 'C', &mce, sb_in->stmts[i] );
6356      i++;
6357   }
6358
6359   /* Nasty problem.  IR optimisation of the pre-instrumented IR may
6360      cause the IR following the preamble to contain references to IR
6361      temporaries defined in the preamble.  Because the preamble isn't
6362      instrumented, these temporaries don't have any shadows.
6363      Nevertheless uses of them following the preamble will cause
6364      memcheck to generate references to their shadows.  End effect is
6365      to cause IR sanity check failures, due to references to
6366      non-existent shadows.  This is only evident for the complex
6367      preambles used for function wrapping on TOC-afflicted platforms
6368      (ppc64-linux).
6369
6370      The following loop therefore scans the preamble looking for
6371      assignments to temporaries.  For each one found it creates an
6372      assignment to the corresponding (V) shadow temp, marking it as
6373      'defined'.  This is the same resulting IR as if the main
6374      instrumentation loop before had been applied to the statement
6375      'tmp = CONSTANT'.
6376
6377      Similarly, if origin tracking is enabled, we must generate an
6378      assignment for the corresponding origin (B) shadow, claiming
6379      no-origin, as appropriate for a defined value.
6380   */
6381   for (j = 0; j < i; j++) {
6382      if (sb_in->stmts[j]->tag == Ist_WrTmp) {
6383         /* findShadowTmpV checks its arg is an original tmp;
6384            no need to assert that here. */
6385         IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
6386         IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
6387         IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
6388         assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
6389         if (MC_(clo_mc_level) == 3) {
6390            IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
6391            tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
6392            assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
6393         }
6394         if (0) {
6395            VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
6396            ppIRType( ty_v );
6397            VG_(printf)("\n");
6398         }
6399      }
6400   }
6401
6402   /* Iterate over the remaining stmts to generate instrumentation. */
6403
6404   tl_assert(sb_in->stmts_used > 0);
6405   tl_assert(i >= 0);
6406   tl_assert(i < sb_in->stmts_used);
6407   tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
6408
6409   for (/* use current i*/; i < sb_in->stmts_used; i++) {
6410
6411      st = sb_in->stmts[i];
6412      first_stmt = sb_out->stmts_used;
6413
6414      if (verboze) {
6415         VG_(printf)("\n");
6416         ppIRStmt(st);
6417         VG_(printf)("\n");
6418      }
6419
6420      if (MC_(clo_mc_level) == 3) {
6421         /* See comments on case Ist_CAS below. */
6422         if (st->tag != Ist_CAS)
6423            schemeS( &mce, st );
6424      }
6425
6426      /* Generate instrumentation code for each stmt ... */
6427
6428      switch (st->tag) {
6429
6430         case Ist_WrTmp:
6431            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
6432                               expr2vbits( &mce, st->Ist.WrTmp.data) );
6433            break;
6434
6435         case Ist_Put:
6436            do_shadow_PUT( &mce,
6437                           st->Ist.Put.offset,
6438                           st->Ist.Put.data,
6439                           NULL /* shadow atom */, NULL /* guard */ );
6440            break;
6441
6442         case Ist_PutI:
6443            do_shadow_PUTI( &mce, st->Ist.PutI.details);
6444            break;
6445
6446         case Ist_Store:
6447            do_shadow_Store( &mce, st->Ist.Store.end,
6448                                   st->Ist.Store.addr, 0/* addr bias */,
6449                                   st->Ist.Store.data,
6450                                   NULL /* shadow data */,
6451                                   NULL/*guard*/ );
6452            break;
6453
6454         case Ist_StoreG:
6455            do_shadow_StoreG( &mce, st->Ist.StoreG.details );
6456            break;
6457
6458         case Ist_LoadG:
6459            do_shadow_LoadG( &mce, st->Ist.LoadG.details );
6460            break;
6461
6462         case Ist_Exit:
6463            complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
6464            break;
6465
6466         case Ist_IMark:
6467            break;
6468
6469         case Ist_NoOp:
6470         case Ist_MBE:
6471            break;
6472
6473         case Ist_Dirty:
6474            do_shadow_Dirty( &mce, st->Ist.Dirty.details );
6475            break;
6476
6477         case Ist_AbiHint:
6478            do_AbiHint( &mce, st->Ist.AbiHint.base,
6479                              st->Ist.AbiHint.len,
6480                              st->Ist.AbiHint.nia );
6481            break;
6482
6483         case Ist_CAS:
6484            do_shadow_CAS( &mce, st->Ist.CAS.details );
6485            /* Note, do_shadow_CAS copies the CAS itself to the output
6486               block, because it needs to add instrumentation both
6487               before and after it.  Hence skip the copy below.  Also
6488               skip the origin-tracking stuff (call to schemeS) above,
6489               since that's all tangled up with it too; do_shadow_CAS
6490               does it all. */
6491            break;
6492
6493         case Ist_LLSC:
6494            do_shadow_LLSC( &mce,
6495                            st->Ist.LLSC.end,
6496                            st->Ist.LLSC.result,
6497                            st->Ist.LLSC.addr,
6498                            st->Ist.LLSC.storedata );
6499            break;
6500
6501         default:
6502            VG_(printf)("\n");
6503            ppIRStmt(st);
6504            VG_(printf)("\n");
6505            VG_(tool_panic)("memcheck: unhandled IRStmt");
6506
6507      } /* switch (st->tag) */
6508
6509      if (0 && verboze) {
6510         for (j = first_stmt; j < sb_out->stmts_used; j++) {
6511            VG_(printf)("   ");
6512            ppIRStmt(sb_out->stmts[j]);
6513            VG_(printf)("\n");
6514         }
6515         VG_(printf)("\n");
6516      }
6517
6518      /* ... and finally copy the stmt itself to the output.  Except,
6519         skip the copy of IRCASs; see comments on case Ist_CAS
6520         above. */
6521      if (st->tag != Ist_CAS)
6522         stmt('C', &mce, st);
6523   }
6524
6525   /* Now we need to complain if the jump target is undefined. */
6526   first_stmt = sb_out->stmts_used;
6527
6528   if (verboze) {
6529      VG_(printf)("sb_in->next = ");
6530      ppIRExpr(sb_in->next);
6531      VG_(printf)("\n\n");
6532   }
6533
6534   complainIfUndefined( &mce, sb_in->next, NULL );
6535
6536   if (0 && verboze) {
6537      for (j = first_stmt; j < sb_out->stmts_used; j++) {
6538         VG_(printf)("   ");
6539         ppIRStmt(sb_out->stmts[j]);
6540         VG_(printf)("\n");
6541      }
6542      VG_(printf)("\n");
6543   }
6544
6545   /* If this fails, there's been some serious snafu with tmp management,
6546      that should be investigated. */
6547   tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
6548   VG_(deleteXA)( mce.tmpMap );
6549
6550   tl_assert(mce.sb == sb_out);
6551   return sb_out;
6552}
6553
6554/*------------------------------------------------------------*/
6555/*--- Post-tree-build final tidying                        ---*/
6556/*------------------------------------------------------------*/
6557
6558/* This exploits the observation that Memcheck often produces
6559   repeated conditional calls of the form
6560
6561   Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
6562
6563   with the same guard expression G guarding the same helper call.
6564   The second and subsequent calls are redundant.  This usually
6565   results from instrumentation of guest code containing multiple
6566   memory references at different constant offsets from the same base
6567   register.  After optimisation of the instrumentation, you get a
6568   test for the definedness of the base register for each memory
6569   reference, which is kinda pointless.  MC_(final_tidy) therefore
6570   looks for such repeated calls and removes all but the first. */
6571
6572/* A struct for recording which (helper, guard) pairs we have already
6573   seen. */
6574typedef
6575   struct { void* entry; IRExpr* guard; }
6576   Pair;
6577
6578/* Return True if e1 and e2 definitely denote the same value (used to
6579   compare guards).  Return False if unknown; False is the safe
6580   answer.  Since guest registers and guest memory do not have the
6581   SSA property we must return False if any Gets or Loads appear in
6582   the expression. */
6583
6584static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
6585{
6586   if (e1->tag != e2->tag)
6587      return False;
6588   switch (e1->tag) {
6589      case Iex_Const:
6590         return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
6591      case Iex_Binop:
6592         return e1->Iex.Binop.op == e2->Iex.Binop.op
6593                && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
6594                && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
6595      case Iex_Unop:
6596         return e1->Iex.Unop.op == e2->Iex.Unop.op
6597                && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
6598      case Iex_RdTmp:
6599         return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
6600      case Iex_ITE:
6601         return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
6602                && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
6603                && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
6604      case Iex_Qop:
6605      case Iex_Triop:
6606      case Iex_CCall:
6607         /* be lazy.  Could define equality for these, but they never
6608            appear to be used. */
6609         return False;
6610      case Iex_Get:
6611      case Iex_GetI:
6612      case Iex_Load:
6613         /* be conservative - these may not give the same value each
6614            time */
6615         return False;
6616      case Iex_Binder:
6617         /* should never see this */
6618         /* fallthrough */
6619      default:
6620         VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
6621         ppIRExpr(e1);
6622         VG_(tool_panic)("memcheck:sameIRValue");
6623         return False;
6624   }
6625}
6626
6627/* See if 'pairs' already has an entry for (entry, guard).  Return
6628   True if so.  If not, add an entry. */
6629
6630static
6631Bool check_or_add ( XArray* /*of Pair*/ pairs, IRExpr* guard, void* entry )
6632{
6633   Pair  p;
6634   Pair* pp;
6635   Int   i, n = VG_(sizeXA)( pairs );
6636   for (i = 0; i < n; i++) {
6637      pp = VG_(indexXA)( pairs, i );
6638      if (pp->entry == entry && sameIRValue(pp->guard, guard))
6639         return True;
6640   }
6641   p.guard = guard;
6642   p.entry = entry;
6643   VG_(addToXA)( pairs, &p );
6644   return False;
6645}
6646
6647static Bool is_helperc_value_checkN_fail ( const HChar* name )
6648{
6649   return
6650      0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_no_o)")
6651      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_no_o)")
6652      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_no_o)")
6653      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_no_o)")
6654      || 0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_w_o)")
6655      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_w_o)")
6656      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_w_o)")
6657      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_w_o)");
6658}
6659
6660IRSB* MC_(final_tidy) ( IRSB* sb_in )
6661{
6662   Int i;
6663   IRStmt*   st;
6664   IRDirty*  di;
6665   IRExpr*   guard;
6666   IRCallee* cee;
6667   Bool      alreadyPresent;
6668   XArray*   pairs = VG_(newXA)( VG_(malloc), "mc.ft.1",
6669                                 VG_(free), sizeof(Pair) );
6670   /* Scan forwards through the statements.  Each time a call to one
6671      of the relevant helpers is seen, check if we have made a
6672      previous call to the same helper using the same guard
6673      expression, and if so, delete the call. */
6674   for (i = 0; i < sb_in->stmts_used; i++) {
6675      st = sb_in->stmts[i];
6676      tl_assert(st);
6677      if (st->tag != Ist_Dirty)
6678         continue;
6679      di = st->Ist.Dirty.details;
6680      guard = di->guard;
6681      tl_assert(guard);
6682      if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
6683      cee = di->cee;
6684      if (!is_helperc_value_checkN_fail( cee->name ))
6685         continue;
6686       /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
6687          guard 'guard'.  Check if we have already seen a call to this
6688          function with the same guard.  If so, delete it.  If not,
6689          add it to the set of calls we do know about. */
6690      alreadyPresent = check_or_add( pairs, guard, cee->addr );
6691      if (alreadyPresent) {
6692         sb_in->stmts[i] = IRStmt_NoOp();
6693         if (0) VG_(printf)("XX\n");
6694      }
6695   }
6696   VG_(deleteXA)( pairs );
6697   return sb_in;
6698}
6699
6700
6701/*------------------------------------------------------------*/
6702/*--- Origin tracking stuff                                ---*/
6703/*------------------------------------------------------------*/
6704
6705/* Almost identical to findShadowTmpV. */
6706static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
6707{
6708   TempMapEnt* ent;
6709   /* VG_(indexXA) range-checks 'orig', hence no need to check
6710      here. */
6711   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6712   tl_assert(ent->kind == Orig);
6713   if (ent->shadowB == IRTemp_INVALID) {
6714      IRTemp tmpB
6715        = newTemp( mce, Ity_I32, BSh );
6716      /* newTemp may cause mce->tmpMap to resize, hence previous results
6717         from VG_(indexXA) are invalid. */
6718      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6719      tl_assert(ent->kind == Orig);
6720      tl_assert(ent->shadowB == IRTemp_INVALID);
6721      ent->shadowB = tmpB;
6722   }
6723   return ent->shadowB;
6724}
6725
6726static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
6727{
6728   return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
6729}
6730
6731
6732/* Make a guarded origin load, with no special handling in the
6733   didn't-happen case.  A GUARD of NULL is assumed to mean "always
6734   True".
6735
6736   Generate IR to do a shadow origins load from BASEADDR+OFFSET and
6737   return the otag.  The loaded size is SZB.  If GUARD evaluates to
6738   False at run time then the returned otag is zero.
6739*/
6740static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
6741                                    IRAtom* baseaddr,
6742                                    Int offset, IRExpr* guard )
6743{
6744   void*    hFun;
6745   const HChar* hName;
6746   IRTemp   bTmp;
6747   IRDirty* di;
6748   IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6749   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6750   IRAtom*  ea    = baseaddr;
6751   if (offset != 0) {
6752      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6753                                   : mkU64( (Long)(Int)offset );
6754      ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
6755   }
6756   bTmp = newTemp(mce, mce->hWordTy, BSh);
6757
6758   switch (szB) {
6759      case 1: hFun  = (void*)&MC_(helperc_b_load1);
6760              hName = "MC_(helperc_b_load1)";
6761              break;
6762      case 2: hFun  = (void*)&MC_(helperc_b_load2);
6763              hName = "MC_(helperc_b_load2)";
6764              break;
6765      case 4: hFun  = (void*)&MC_(helperc_b_load4);
6766              hName = "MC_(helperc_b_load4)";
6767              break;
6768      case 8: hFun  = (void*)&MC_(helperc_b_load8);
6769              hName = "MC_(helperc_b_load8)";
6770              break;
6771      case 16: hFun  = (void*)&MC_(helperc_b_load16);
6772               hName = "MC_(helperc_b_load16)";
6773               break;
6774      case 32: hFun  = (void*)&MC_(helperc_b_load32);
6775               hName = "MC_(helperc_b_load32)";
6776               break;
6777      default:
6778         VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
6779         tl_assert(0);
6780   }
6781   di = unsafeIRDirty_1_N(
6782           bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
6783           mkIRExprVec_1( ea )
6784        );
6785   if (guard) {
6786      di->guard = guard;
6787      /* Ideally the didn't-happen return value here would be
6788         all-zeroes (unknown-origin), so it'd be harmless if it got
6789         used inadvertantly.  We slum it out with the IR-mandated
6790         default value (0b01 repeating, 0x55 etc) as that'll probably
6791         trump all legitimate otags via Max32, and it's pretty
6792         obviously bogus. */
6793   }
6794   /* no need to mess with any annotations.  This call accesses
6795      neither guest state nor guest memory. */
6796   stmt( 'B', mce, IRStmt_Dirty(di) );
6797   if (mce->hWordTy == Ity_I64) {
6798      /* 64-bit host */
6799      IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
6800      assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
6801      return mkexpr(bTmp32);
6802   } else {
6803      /* 32-bit host */
6804      return mkexpr(bTmp);
6805   }
6806}
6807
6808
6809/* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
6810   loaded size is SZB.  The load is regarded as unconditional (always
6811   happens).
6812*/
6813static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
6814                            Int offset )
6815{
6816   return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
6817}
6818
6819
6820/* The most general handler for guarded origin loads.  A GUARD of NULL
6821   is assumed to mean "always True".
6822
6823   Generate IR to do a shadow origin load from ADDR+BIAS and return
6824   the B bits.  The loaded type is TY.  If GUARD evaluates to False at
6825   run time then the returned B bits are simply BALT instead.
6826*/
6827static
6828IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
6829                                        IRType ty,
6830                                        IRAtom* addr, UInt bias,
6831                                        IRAtom* guard, IRAtom* balt )
6832{
6833   /* If the guard evaluates to True, this will hold the loaded
6834      origin.  If the guard evaluates to False, this will be zero,
6835      meaning "unknown origin", in which case we will have to replace
6836      it using an ITE below. */
6837   IRAtom* iftrue
6838      = assignNew('B', mce, Ity_I32,
6839                  gen_guarded_load_b(mce, sizeofIRType(ty),
6840                                     addr, bias, guard));
6841   /* These are the bits we will return if the load doesn't take
6842      place. */
6843   IRAtom* iffalse
6844      = balt;
6845   /* Prepare the cond for the ITE.  Convert a NULL cond into
6846      something that iropt knows how to fold out later. */
6847   IRAtom* cond
6848      = guard == NULL  ? mkU1(1)  : guard;
6849   /* And assemble the final result. */
6850   return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
6851}
6852
6853
6854/* Generate a shadow origins store.  guard :: Ity_I1 controls whether
6855   the store really happens; NULL means it unconditionally does. */
6856static void gen_store_b ( MCEnv* mce, Int szB,
6857                          IRAtom* baseaddr, Int offset, IRAtom* dataB,
6858                          IRAtom* guard )
6859{
6860   void*    hFun;
6861   const HChar* hName;
6862   IRDirty* di;
6863   IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6864   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6865   IRAtom*  ea    = baseaddr;
6866   if (guard) {
6867      tl_assert(isOriginalAtom(mce, guard));
6868      tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6869   }
6870   if (offset != 0) {
6871      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6872                                   : mkU64( (Long)(Int)offset );
6873      ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
6874   }
6875   if (mce->hWordTy == Ity_I64)
6876      dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
6877
6878   switch (szB) {
6879      case 1: hFun  = (void*)&MC_(helperc_b_store1);
6880              hName = "MC_(helperc_b_store1)";
6881              break;
6882      case 2: hFun  = (void*)&MC_(helperc_b_store2);
6883              hName = "MC_(helperc_b_store2)";
6884              break;
6885      case 4: hFun  = (void*)&MC_(helperc_b_store4);
6886              hName = "MC_(helperc_b_store4)";
6887              break;
6888      case 8: hFun  = (void*)&MC_(helperc_b_store8);
6889              hName = "MC_(helperc_b_store8)";
6890              break;
6891      case 16: hFun  = (void*)&MC_(helperc_b_store16);
6892               hName = "MC_(helperc_b_store16)";
6893               break;
6894      case 32: hFun  = (void*)&MC_(helperc_b_store32);
6895               hName = "MC_(helperc_b_store32)";
6896               break;
6897      default:
6898         tl_assert(0);
6899   }
6900   di = unsafeIRDirty_0_N( 2/*regparms*/,
6901           hName, VG_(fnptr_to_fnentry)( hFun ),
6902           mkIRExprVec_2( ea, dataB )
6903        );
6904   /* no need to mess with any annotations.  This call accesses
6905      neither guest state nor guest memory. */
6906   if (guard) di->guard = guard;
6907   stmt( 'B', mce, IRStmt_Dirty(di) );
6908}
6909
6910static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
6911   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6912   if (eTy == Ity_I64)
6913      return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
6914   if (eTy == Ity_I32)
6915      return e;
6916   tl_assert(0);
6917}
6918
6919static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
6920   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6921   tl_assert(eTy == Ity_I32);
6922   if (dstTy == Ity_I64)
6923      return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
6924   tl_assert(0);
6925}
6926
6927
6928static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
6929{
6930   tl_assert(MC_(clo_mc_level) == 3);
6931
6932   switch (e->tag) {
6933
6934      case Iex_GetI: {
6935         IRRegArray* descr_b;
6936         IRAtom      *t1, *t2, *t3, *t4;
6937         IRRegArray* descr      = e->Iex.GetI.descr;
6938         IRType equivIntTy
6939            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
6940         /* If this array is unshadowable for whatever reason, use the
6941            usual approximation. */
6942         if (equivIntTy == Ity_INVALID)
6943            return mkU32(0);
6944         tl_assert(sizeofIRType(equivIntTy) >= 4);
6945         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
6946         descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
6947                                 equivIntTy, descr->nElems );
6948         /* Do a shadow indexed get of the same size, giving t1.  Take
6949            the bottom 32 bits of it, giving t2.  Compute into t3 the
6950            origin for the index (almost certainly zero, but there's
6951            no harm in being completely general here, since iropt will
6952            remove any useless code), and fold it in, giving a final
6953            value t4. */
6954         t1 = assignNew( 'B', mce, equivIntTy,
6955                          IRExpr_GetI( descr_b, e->Iex.GetI.ix,
6956                                                e->Iex.GetI.bias ));
6957         t2 = narrowTo32( mce, t1 );
6958         t3 = schemeE( mce, e->Iex.GetI.ix );
6959         t4 = gen_maxU32( mce, t2, t3 );
6960         return t4;
6961      }
6962      case Iex_CCall: {
6963         Int i;
6964         IRAtom*  here;
6965         IRExpr** args = e->Iex.CCall.args;
6966         IRAtom*  curr = mkU32(0);
6967         for (i = 0; args[i]; i++) {
6968            tl_assert(i < 32);
6969            tl_assert(isOriginalAtom(mce, args[i]));
6970            /* Only take notice of this arg if the callee's
6971               mc-exclusion mask does not say it is to be excluded. */
6972            if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
6973               /* the arg is to be excluded from definedness checking.
6974                  Do nothing. */
6975               if (0) VG_(printf)("excluding %s(%d)\n",
6976                                  e->Iex.CCall.cee->name, i);
6977            } else {
6978               /* calculate the arg's definedness, and pessimistically
6979                  merge it in. */
6980               here = schemeE( mce, args[i] );
6981               curr = gen_maxU32( mce, curr, here );
6982            }
6983         }
6984         return curr;
6985      }
6986      case Iex_Load: {
6987         Int dszB;
6988         dszB = sizeofIRType(e->Iex.Load.ty);
6989         /* assert that the B value for the address is already
6990            available (somewhere) */
6991         tl_assert(isIRAtom(e->Iex.Load.addr));
6992         tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
6993         return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
6994      }
6995      case Iex_ITE: {
6996         IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
6997         IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
6998         IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
6999         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7000      }
7001      case Iex_Qop: {
7002         IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7003         IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7004         IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7005         IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7006         return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7007                                 gen_maxU32( mce, b3, b4 ) );
7008      }
7009      case Iex_Triop: {
7010         IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7011         IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7012         IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7013         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7014      }
7015      case Iex_Binop: {
7016         switch (e->Iex.Binop.op) {
7017            case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
7018            case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7019            case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7020            case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7021               /* Just say these all produce a defined result,
7022                  regardless of their arguments.  See
7023                  COMMENT_ON_CasCmpEQ in this file. */
7024               return mkU32(0);
7025            default: {
7026               IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7027               IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7028               return gen_maxU32( mce, b1, b2 );
7029            }
7030         }
7031         tl_assert(0);
7032         /*NOTREACHED*/
7033      }
7034      case Iex_Unop: {
7035         IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7036         return b1;
7037      }
7038      case Iex_Const:
7039         return mkU32(0);
7040      case Iex_RdTmp:
7041         return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7042      case Iex_Get: {
7043         Int b_offset = MC_(get_otrack_shadow_offset)(
7044                           e->Iex.Get.offset,
7045                           sizeofIRType(e->Iex.Get.ty)
7046                        );
7047         tl_assert(b_offset >= -1
7048                   && b_offset <= mce->layout->total_sizeB -4);
7049         if (b_offset >= 0) {
7050            /* FIXME: this isn't an atom! */
7051            return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7052                               Ity_I32 );
7053         }
7054         return mkU32(0);
7055      }
7056      default:
7057         VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7058         ppIRExpr(e);
7059         VG_(tool_panic)("memcheck:schemeE");
7060   }
7061}
7062
7063
7064static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7065{
7066   // This is a hacked version of do_shadow_Dirty
7067   Int       i, k, n, toDo, gSz, gOff;
7068   IRAtom    *here, *curr;
7069   IRTemp    dst;
7070
7071   /* First check the guard. */
7072   curr = schemeE( mce, d->guard );
7073
7074   /* Now round up all inputs and maxU32 over them. */
7075
7076   /* Inputs: unmasked args
7077      Note: arguments are evaluated REGARDLESS of the guard expression */
7078   for (i = 0; d->args[i]; i++) {
7079      IRAtom* arg = d->args[i];
7080      if ( (d->cee->mcx_mask & (1<<i))
7081           || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
7082         /* ignore this arg */
7083      } else {
7084         here = schemeE( mce, arg );
7085         curr = gen_maxU32( mce, curr, here );
7086      }
7087   }
7088
7089   /* Inputs: guest state that we read. */
7090   for (i = 0; i < d->nFxState; i++) {
7091      tl_assert(d->fxState[i].fx != Ifx_None);
7092      if (d->fxState[i].fx == Ifx_Write)
7093         continue;
7094
7095      /* Enumerate the described state segments */
7096      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7097         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7098         gSz  = d->fxState[i].size;
7099
7100         /* Ignore any sections marked as 'always defined'. */
7101         if (isAlwaysDefd(mce, gOff, gSz)) {
7102            if (0)
7103            VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7104                        gOff, gSz);
7105            continue;
7106         }
7107
7108         /* This state element is read or modified.  So we need to
7109            consider it.  If larger than 4 bytes, deal with it in
7110            4-byte chunks. */
7111         while (True) {
7112            Int b_offset;
7113            tl_assert(gSz >= 0);
7114            if (gSz == 0) break;
7115            n = gSz <= 4 ? gSz : 4;
7116            /* update 'curr' with maxU32 of the state slice
7117               gOff .. gOff+n-1 */
7118            b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7119            if (b_offset != -1) {
7120               /* Observe the guard expression. If it is false use 0, i.e.
7121                  nothing is known about the origin */
7122               IRAtom *cond, *iffalse, *iftrue;
7123
7124               cond = assignNew( 'B', mce, Ity_I1, d->guard);
7125               iffalse = mkU32(0);
7126               iftrue  = assignNew( 'B', mce, Ity_I32,
7127                                    IRExpr_Get(b_offset
7128                                                 + 2*mce->layout->total_sizeB,
7129                                               Ity_I32));
7130               here = assignNew( 'B', mce, Ity_I32,
7131                                 IRExpr_ITE(cond, iftrue, iffalse));
7132               curr = gen_maxU32( mce, curr, here );
7133            }
7134            gSz -= n;
7135            gOff += n;
7136         }
7137      }
7138   }
7139
7140   /* Inputs: memory */
7141
7142   if (d->mFx != Ifx_None) {
7143      /* Because we may do multiple shadow loads/stores from the same
7144         base address, it's best to do a single test of its
7145         definedness right now.  Post-instrumentation optimisation
7146         should remove all but this test. */
7147      tl_assert(d->mAddr);
7148      here = schemeE( mce, d->mAddr );
7149      curr = gen_maxU32( mce, curr, here );
7150   }
7151
7152   /* Deal with memory inputs (reads or modifies) */
7153   if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7154      toDo   = d->mSize;
7155      /* chew off 32-bit chunks.  We don't care about the endianness
7156         since it's all going to be condensed down to a single bit,
7157         but nevertheless choose an endianness which is hopefully
7158         native to the platform. */
7159      while (toDo >= 4) {
7160         here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7161                                    d->guard );
7162         curr = gen_maxU32( mce, curr, here );
7163         toDo -= 4;
7164      }
7165      /* handle possible 16-bit excess */
7166      while (toDo >= 2) {
7167         here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7168                                    d->guard );
7169         curr = gen_maxU32( mce, curr, here );
7170         toDo -= 2;
7171      }
7172      /* chew off the remaining 8-bit chunk, if any */
7173      if (toDo == 1) {
7174         here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7175                                    d->guard );
7176         curr = gen_maxU32( mce, curr, here );
7177         toDo -= 1;
7178      }
7179      tl_assert(toDo == 0);
7180   }
7181
7182   /* Whew!  So curr is a 32-bit B-value which should give an origin
7183      of some use if any of the inputs to the helper are undefined.
7184      Now we need to re-distribute the results to all destinations. */
7185
7186   /* Outputs: the destination temporary, if there is one. */
7187   if (d->tmp != IRTemp_INVALID) {
7188      dst   = findShadowTmpB(mce, d->tmp);
7189      assign( 'V', mce, dst, curr );
7190   }
7191
7192   /* Outputs: guest state that we write or modify. */
7193   for (i = 0; i < d->nFxState; i++) {
7194      tl_assert(d->fxState[i].fx != Ifx_None);
7195      if (d->fxState[i].fx == Ifx_Read)
7196         continue;
7197
7198      /* Enumerate the described state segments */
7199      for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7200         gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7201         gSz  = d->fxState[i].size;
7202
7203         /* Ignore any sections marked as 'always defined'. */
7204         if (isAlwaysDefd(mce, gOff, gSz))
7205            continue;
7206
7207         /* This state element is written or modified.  So we need to
7208            consider it.  If larger than 4 bytes, deal with it in
7209            4-byte chunks. */
7210         while (True) {
7211            Int b_offset;
7212            tl_assert(gSz >= 0);
7213            if (gSz == 0) break;
7214            n = gSz <= 4 ? gSz : 4;
7215            /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7216            b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7217            if (b_offset != -1) {
7218
7219               /* If the guard expression evaluates to false we simply Put
7220                  the value that is already stored in the guest state slot */
7221               IRAtom *cond, *iffalse;
7222
7223               cond    = assignNew('B', mce, Ity_I1,
7224                                   d->guard);
7225               iffalse = assignNew('B', mce, Ity_I32,
7226                                   IRExpr_Get(b_offset +
7227                                              2*mce->layout->total_sizeB,
7228                                              Ity_I32));
7229               curr = assignNew('V', mce, Ity_I32,
7230                                IRExpr_ITE(cond, curr, iffalse));
7231
7232               stmt( 'B', mce, IRStmt_Put(b_offset
7233                                          + 2*mce->layout->total_sizeB,
7234                                          curr ));
7235            }
7236            gSz -= n;
7237            gOff += n;
7238         }
7239      }
7240   }
7241
7242   /* Outputs: memory that we write or modify.  Same comments about
7243      endianness as above apply. */
7244   if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7245      toDo   = d->mSize;
7246      /* chew off 32-bit chunks */
7247      while (toDo >= 4) {
7248         gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7249                      d->guard );
7250         toDo -= 4;
7251      }
7252      /* handle possible 16-bit excess */
7253      while (toDo >= 2) {
7254         gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7255                      d->guard );
7256         toDo -= 2;
7257      }
7258      /* chew off the remaining 8-bit chunk, if any */
7259      if (toDo == 1) {
7260         gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7261                      d->guard );
7262         toDo -= 1;
7263      }
7264      tl_assert(toDo == 0);
7265   }
7266}
7267
7268
7269/* Generate IR for origin shadowing for a general guarded store. */
7270static void do_origins_Store_guarded ( MCEnv* mce,
7271                                       IREndness stEnd,
7272                                       IRExpr* stAddr,
7273                                       IRExpr* stData,
7274                                       IRExpr* guard )
7275{
7276   Int     dszB;
7277   IRAtom* dataB;
7278   /* assert that the B value for the address is already available
7279      (somewhere), since the call to schemeE will want to see it.
7280      XXXX how does this actually ensure that?? */
7281   tl_assert(isIRAtom(stAddr));
7282   tl_assert(isIRAtom(stData));
7283   dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7284   dataB = schemeE( mce, stData );
7285   gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7286}
7287
7288
7289/* Generate IR for origin shadowing for a plain store. */
7290static void do_origins_Store_plain ( MCEnv* mce,
7291                                     IREndness stEnd,
7292                                     IRExpr* stAddr,
7293                                     IRExpr* stData )
7294{
7295   do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7296                              NULL/*guard*/ );
7297}
7298
7299
7300/* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7301
7302static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7303{
7304   do_origins_Store_guarded( mce, sg->end, sg->addr,
7305                             sg->data, sg->guard );
7306}
7307
7308static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7309{
7310   IRType loadedTy = Ity_INVALID;
7311   switch (lg->cvt) {
7312      case ILGop_Ident64: loadedTy = Ity_I64; break;
7313      case ILGop_Ident32: loadedTy = Ity_I32; break;
7314      case ILGop_16Uto32: loadedTy = Ity_I16; break;
7315      case ILGop_16Sto32: loadedTy = Ity_I16; break;
7316      case ILGop_8Uto32:  loadedTy = Ity_I8;  break;
7317      case ILGop_8Sto32:  loadedTy = Ity_I8;  break;
7318      default: VG_(tool_panic)("schemeS.IRLoadG");
7319   }
7320   IRAtom* ori_alt
7321      = schemeE( mce,lg->alt );
7322   IRAtom* ori_final
7323      = expr2ori_Load_guarded_General(mce, loadedTy,
7324                                      lg->addr, 0/*addr bias*/,
7325                                      lg->guard, ori_alt );
7326   /* And finally, bind the origin to the destination temporary. */
7327   assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7328}
7329
7330
7331static void schemeS ( MCEnv* mce, IRStmt* st )
7332{
7333   tl_assert(MC_(clo_mc_level) == 3);
7334
7335   switch (st->tag) {
7336
7337      case Ist_AbiHint:
7338         /* The value-check instrumenter handles this - by arranging
7339            to pass the address of the next instruction to
7340            MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
7341            happen for origin tracking w.r.t. AbiHints.  So there is
7342            nothing to do here. */
7343         break;
7344
7345      case Ist_PutI: {
7346         IRPutI *puti = st->Ist.PutI.details;
7347         IRRegArray* descr_b;
7348         IRAtom      *t1, *t2, *t3, *t4;
7349         IRRegArray* descr = puti->descr;
7350         IRType equivIntTy
7351            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7352         /* If this array is unshadowable for whatever reason,
7353            generate no code. */
7354         if (equivIntTy == Ity_INVALID)
7355            break;
7356         tl_assert(sizeofIRType(equivIntTy) >= 4);
7357         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7358         descr_b
7359            = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7360                            equivIntTy, descr->nElems );
7361         /* Compute a value to Put - the conjoinment of the origin for
7362            the data to be Put-ted (obviously) and of the index value
7363            (not so obviously). */
7364         t1 = schemeE( mce, puti->data );
7365         t2 = schemeE( mce, puti->ix );
7366         t3 = gen_maxU32( mce, t1, t2 );
7367         t4 = zWidenFrom32( mce, equivIntTy, t3 );
7368         stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7369                                               puti->bias, t4) ));
7370         break;
7371      }
7372
7373      case Ist_Dirty:
7374         do_origins_Dirty( mce, st->Ist.Dirty.details );
7375         break;
7376
7377      case Ist_Store:
7378         do_origins_Store_plain( mce, st->Ist.Store.end,
7379                                      st->Ist.Store.addr,
7380                                      st->Ist.Store.data );
7381         break;
7382
7383      case Ist_StoreG:
7384         do_origins_StoreG( mce, st->Ist.StoreG.details );
7385         break;
7386
7387      case Ist_LoadG:
7388         do_origins_LoadG( mce, st->Ist.LoadG.details );
7389         break;
7390
7391      case Ist_LLSC: {
7392         /* In short: treat a load-linked like a normal load followed
7393            by an assignment of the loaded (shadow) data the result
7394            temporary.  Treat a store-conditional like a normal store,
7395            and mark the result temporary as defined. */
7396         if (st->Ist.LLSC.storedata == NULL) {
7397            /* Load Linked */
7398            IRType resTy
7399               = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7400            IRExpr* vanillaLoad
7401               = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7402            tl_assert(resTy == Ity_I64 || resTy == Ity_I32
7403                      || resTy == Ity_I16 || resTy == Ity_I8);
7404            assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7405                              schemeE(mce, vanillaLoad));
7406         } else {
7407            /* Store conditional */
7408            do_origins_Store_plain( mce, st->Ist.LLSC.end,
7409                                    st->Ist.LLSC.addr,
7410                                    st->Ist.LLSC.storedata );
7411            /* For the rationale behind this, see comments at the
7412               place where the V-shadow for .result is constructed, in
7413               do_shadow_LLSC.  In short, we regard .result as
7414               always-defined. */
7415            assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7416                              mkU32(0) );
7417         }
7418         break;
7419      }
7420
7421      case Ist_Put: {
7422         Int b_offset
7423            = MC_(get_otrack_shadow_offset)(
7424                 st->Ist.Put.offset,
7425                 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7426              );
7427         if (b_offset >= 0) {
7428            /* FIXME: this isn't an atom! */
7429            stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7430                                       schemeE( mce, st->Ist.Put.data )) );
7431         }
7432         break;
7433      }
7434
7435      case Ist_WrTmp:
7436         assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7437                           schemeE(mce, st->Ist.WrTmp.data) );
7438         break;
7439
7440      case Ist_MBE:
7441      case Ist_NoOp:
7442      case Ist_Exit:
7443      case Ist_IMark:
7444         break;
7445
7446      default:
7447         VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7448         ppIRStmt(st);
7449         VG_(tool_panic)("memcheck:schemeS");
7450   }
7451}
7452
7453
7454/*--------------------------------------------------------------------*/
7455/*--- end                                           mc_translate.c ---*/
7456/*--------------------------------------------------------------------*/
7457