1
2/*--------------------------------------------------------------------*/
3/*--- LibHB: a library for implementing and checking               ---*/
4/*--- the happens-before relationship in concurrent programs.      ---*/
5/*---                                                 libhb_main.c ---*/
6/*--------------------------------------------------------------------*/
7
8/*
9   This file is part of LibHB, a library for implementing and checking
10   the happens-before relationship in concurrent programs.
11
12   Copyright (C) 2008-2017 OpenWorks Ltd
13      info@open-works.co.uk
14
15   This program is free software; you can redistribute it and/or
16   modify it under the terms of the GNU General Public License as
17   published by the Free Software Foundation; either version 2 of the
18   License, or (at your option) any later version.
19
20   This program is distributed in the hope that it will be useful, but
21   WITHOUT ANY WARRANTY; without even the implied warranty of
22   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23   General Public License for more details.
24
25   You should have received a copy of the GNU General Public License
26   along with this program; if not, write to the Free Software
27   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28   02111-1307, USA.
29
30   The GNU General Public License is contained in the file COPYING.
31*/
32
33#include "pub_tool_basics.h"
34#include "pub_tool_poolalloc.h"
35#include "pub_tool_libcassert.h"
36#include "pub_tool_libcbase.h"
37#include "pub_tool_libcprint.h"
38#include "pub_tool_mallocfree.h"
39#include "pub_tool_wordfm.h"
40#include "pub_tool_hashtable.h"
41#include "pub_tool_xarray.h"
42#include "pub_tool_oset.h"
43#include "pub_tool_threadstate.h"
44#include "pub_tool_aspacemgr.h"
45#include "pub_tool_stacktrace.h"
46#include "pub_tool_execontext.h"
47#include "pub_tool_errormgr.h"
48#include "pub_tool_options.h"        // VG_(clo_stats)
49#include "hg_basics.h"
50#include "hg_wordset.h"
51#include "hg_lock_n_thread.h"
52#include "hg_errors.h"
53
54#include "libhb.h"
55
56
57/////////////////////////////////////////////////////////////////
58/////////////////////////////////////////////////////////////////
59//                                                             //
60// Debugging #defines                                          //
61//                                                             //
62/////////////////////////////////////////////////////////////////
63/////////////////////////////////////////////////////////////////
64
/* Sanity checking of shadow values in the core memory state machine.
   Off by default; set CHECK_MSM to 1 and rebuild to enable it. */
#define CHECK_MSM 0
72
73
/* Sanity checking (reference counts, etc) in the conflicting access
   machinery.  Off by default; set CHECK_CEM to 1 and rebuild to
   enable it. */
#define CHECK_CEM 0
81
82
/* Sanity checking of the compressed shadow memory machinery,
   particularly its caching innards.  Unfortunately there is no
   almost-zero-cost way to make these checks selectable at run time,
   so set CHECK_ZSM to 1 and rebuild if you want them. */
#define CHECK_ZSM 0   /* 1: sanity-check CacheLine stuff, 0: don't */
#if CHECK_ZSM
   /* when checking, 'inline' functions are forced out of line */
#  define inline __attribute__((noinline))
   /* probably want to ditch -fomit-frame-pointer too */
#endif
94
95
96/////////////////////////////////////////////////////////////////
97/////////////////////////////////////////////////////////////////
98//                                                             //
99// data decls: VtsID                                           //
100//                                                             //
101/////////////////////////////////////////////////////////////////
102/////////////////////////////////////////////////////////////////
103
104/* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
105   bits, since they have to be packed into the lowest 30 bits of an
106   SVal. */
107typedef  UInt  VtsID;
108#define VtsID_INVALID 0xFFFFFFFF
109
110
111
112/////////////////////////////////////////////////////////////////
113/////////////////////////////////////////////////////////////////
114//                                                             //
115// data decls: SVal                                            //
116//                                                             //
117/////////////////////////////////////////////////////////////////
118/////////////////////////////////////////////////////////////////
119
120typedef  ULong  SVal;
121
122/* This value has special significance to the implementation, and callers
123   may not store it in the shadow memory. */
124#define SVal_INVALID (3ULL << 62)
125
126/* This is the default value for shadow memory.  Initially the shadow
127   memory contains no accessible areas and so all reads produce this
128   value.  TODO: make this caller-defineable. */
129#define SVal_NOACCESS (2ULL << 62)
130
131
132
133/////////////////////////////////////////////////////////////////
134/////////////////////////////////////////////////////////////////
135//                                                             //
136// data decls: ScalarTS                                        //
137//                                                             //
138/////////////////////////////////////////////////////////////////
139/////////////////////////////////////////////////////////////////
140
141/* Scalar Timestamp.  We have to store a lot of these, so there is
142   some effort to make them as small as possible.  Logically they are
143   a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
144   We pack it into 64 bits by representing the Thr* using a ThrID, a
145   small integer (18 bits), and a 46 bit integer for the timestamp
146   number.  The 46/18 split is arbitrary, but has the effect that
147   Helgrind can only handle programs that create 2^18 or fewer threads
148   over their entire lifetime, and have no more than 2^46 timestamp
149   ticks (synchronisation operations on the same thread).
150
151   This doesn't seem like much of a limitation.  2^46 ticks is
152   7.06e+13, and if each tick (optimistically) takes the machine 1000
153   cycles to process, then the minimum time to process that many ticks
154   at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
155   but VTS ticks, which isn't realistic.
156
157   NB1: SCALARTS_N_THRBITS must be 27 or lower.  The obvious limit is
158   32 since a ThrID is a UInt.  27 comes from the fact that
159   'Thr_n_RCEC', which records information about old accesses, packs
160   in tsw not only a ThrID but also minimum 4+1 other bits (access size
161   and writeness) in a UInt, hence limiting size to 32-(4+1) == 27.
162
163   NB2: thrid values are issued upwards from 1024, and values less
164   than that aren't valid.  This isn't per se necessary (any order
165   will do, so long as they are unique), but it does help ensure they
166   are less likely to get confused with the various other kinds of
167   small-integer thread ids drifting around (eg, TId).
168   So, SCALARTS_N_THRBITS must be 11 or more.
169   See also NB5.
170
171   NB3: this probably also relies on the fact that Thr's are never
172   deallocated -- they exist forever.  Hence the 1-1 mapping from
173   Thr's to thrid values (set up in Thr__new) persists forever.
174
   NB4: temp_max_sized_VTS is allocated at startup and never freed.
   It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
   ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
178   making the memory use for this go sky-high.  With
179   SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
180   like an OK tradeoff.  If more than 256k threads need to be
181   supported, we could change SCALARTS_N_THRBITS to 20, which would
182   facilitate supporting 1 million threads at the cost of 8MB storage
183   for temp_max_sized_VTS.
184
185   NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
186   ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
187   must never be a valid ThrID.  Given NB2 that's OK.
188*/
189#define SCALARTS_N_THRBITS 18  /* valid range: 11 to 27 inclusive,
190                                  See NB1 and NB2 above. */
191
192#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
193typedef
194   struct {
195      ThrID thrid : SCALARTS_N_THRBITS;
196      ULong tym   : SCALARTS_N_TYMBITS;
197   }
198   ScalarTS;
199
200#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
201
202
203
204/////////////////////////////////////////////////////////////////
205/////////////////////////////////////////////////////////////////
206//                                                             //
207// data decls: Filter                                          //
208//                                                             //
209/////////////////////////////////////////////////////////////////
210/////////////////////////////////////////////////////////////////
211
// baseline: 5, 9
#define FI_LINE_SZB_LOG2   5
#define FI_NUM_LINES_LOG2  10

/* Bytes covered by one filter line, and number of lines. */
#define FI_LINE_SZB        (1 << FI_LINE_SZB_LOG2)
#define FI_NUM_LINES       (1 << FI_NUM_LINES_LOG2)

/* The tag of an address is the address with its within-line offset
   bits cleared. */
#define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
#define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)

/* The line number is taken from the FI_NUM_LINES_LOG2 address bits
   directly above the within-line offset bits. */
#define FI_GET_LINENO(_a)  \
   ( ((_a) >> FI_LINE_SZB_LOG2) & (Addr)(FI_NUM_LINES-1) )
224
225
226/* In the lines, each 8 bytes are treated individually, and are mapped
227   to a UShort.  Regardless of endianness of the underlying machine,
228   bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
229   the highest address.
230
231   Of each bit pair, the higher numbered bit is set if a R has been
232   seen, so the actual layout is:
233
234   15 14             ...  01 00
235
236   R  W  for addr+7  ...  R  W  for addr+0
237
238   So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
239*/
240
241/* tags are separated from lines.  tags are Addrs and are
242   the base address of the line. */
243typedef
244   struct {
245      UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
246   }
247   FiLine;
248
249typedef
250   struct {
251      Addr   tags[FI_NUM_LINES];
252      FiLine lines[FI_NUM_LINES];
253   }
254   Filter;
255
256
257
258/////////////////////////////////////////////////////////////////
259/////////////////////////////////////////////////////////////////
260//                                                             //
261// data decls: Thr, ULong_n_EC                                 //
262//                                                             //
263/////////////////////////////////////////////////////////////////
264/////////////////////////////////////////////////////////////////
265
266// Records stacks for H1 history mechanism (DRD-style)
267typedef
268   struct { ULong ull; ExeContext* ec; }
269   ULong_n_EC;
270
271
/* The number of ULong_n_EC records collected per thread.  Each record
   is 16 bytes long, so 62.5k of them need 1MB per thread.  Once more
   than N_KWs_N_STACKs_PER_THREAD records are present, the older half
   are deleted to make space.  So in the worst case a stack can still
   be produced for at least the last N_KWs_N_STACKs_PER_THREAD / 2 Kw
   transitions (segments in this thread); with the current setting
   that is a guaranteed stack for at least the last 31.25k segments. */
#define N_KWs_N_STACKs_PER_THREAD 62500
282
283
284struct _Thr {
285   /* Current VTSs for this thread.  They change as we go along.  viR
286      is the VTS to be used for reads, viW for writes.  Usually they
287      are the same, but can differ when we deal with reader-writer
288      locks.  It is always the case that
289         VtsID__cmpLEQ(viW,viR) == True
290      that is, viW must be the same, or lagging behind, viR. */
291   VtsID viR;
292   VtsID viW;
293
294   /* Is initially False, and is set to True after the thread really
295      has done a low-level exit.  When True, we expect to never see
296      any more memory references done by this thread. */
297   Bool llexit_done;
298
299   /* Is initially False, and is set to True after the thread has been
300      joined with (reaped by some other thread).  After this point, we
301      do not expect to see any uses of .viR or .viW, so it is safe to
302      set them to VtsID_INVALID. */
303   Bool joinedwith_done;
304
305   /* A small integer giving a unique identity to this Thr.  See
306      comments on the definition of ScalarTS for details. */
307   ThrID thrid : SCALARTS_N_THRBITS;
308
309   /* A filter that removes references for which we believe that
310      msmcread/msmcwrite will not change the state, nor report a
311      race. */
312   Filter* filter;
313
314   /* A pointer back to the top level Thread structure.  There is a
315      1-1 mapping between Thread and Thr structures -- each Thr points
316      at its corresponding Thread, and vice versa.  Really, Thr and
317      Thread should be merged into a single structure. */
318   Thread* hgthread;
319
320   /* The ULongs (scalar Kws) in this accumulate in strictly
321      increasing order, without duplicates.  This is important because
322      we need to be able to find a given scalar Kw in this array
323      later, by binary search. */
324   XArray* /* ULong_n_EC */ local_Kws_n_stacks;
325};
326
327
328
329/////////////////////////////////////////////////////////////////
330/////////////////////////////////////////////////////////////////
331//                                                             //
332// data decls: SO                                              //
333//                                                             //
334/////////////////////////////////////////////////////////////////
335/////////////////////////////////////////////////////////////////
336
337// (UInt) `echo "Synchronisation object" | md5sum`
338#define SO_MAGIC 0x56b3c5b0U
339
340struct _SO {
341   struct _SO* admin_prev;
342   struct _SO* admin_next;
343   VtsID viR; /* r-clock of sender */
344   VtsID viW; /* w-clock of sender */
345   UInt  magic;
346};
347
348
349
350/////////////////////////////////////////////////////////////////
351/////////////////////////////////////////////////////////////////
352//                                                             //
353// Forward declarations                                        //
354//                                                             //
355/////////////////////////////////////////////////////////////////
356/////////////////////////////////////////////////////////////////
357
358/* fwds for
359   Globals needed by other parts of the library.  These are set
360   once at startup and then never changed. */
361static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
362static ExeContext* (*main_get_EC)( Thr* ) = NULL;
363
364/* misc fn and data fwdses */
365static void VtsID__rcinc ( VtsID ii );
366static void VtsID__rcdec ( VtsID ii );
367
368static inline Bool SVal__isC ( SVal s );
369static inline VtsID SVal__unC_Rmin ( SVal s );
370static inline VtsID SVal__unC_Wmin ( SVal s );
371static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
372static inline void SVal__rcinc ( SVal s );
373static inline void SVal__rcdec ( SVal s );
374/* SVal in LineZ are used to store various pointers. */
375static inline void *SVal2Ptr (SVal s);
376static inline SVal Ptr2SVal (void* ptr);
377
378/* A double linked list of all the SO's. */
379SO* admin_SO;
380
381
382
383/////////////////////////////////////////////////////////////////
384/////////////////////////////////////////////////////////////////
385//                                                             //
386// SECTION BEGIN compressed shadow memory                      //
387//                                                             //
388/////////////////////////////////////////////////////////////////
389/////////////////////////////////////////////////////////////////
390
391#ifndef __HB_ZSM_H
392#define __HB_ZSM_H
393
394/* Initialise the library.  Once initialised, it will (or may) call
395   SVal__rcinc and SVal__rcdec in response to all the calls below, in order to
396   allow the user to do reference counting on the SVals stored herein.
397   It is important to understand, however, that due to internal
398   caching, the reference counts are in general inaccurate, and can be
399   both above or below the true reference count for an item.  In
400   particular, the library may indicate that the reference count for
401   an item is zero, when in fact it is not.
402
403   To make the reference counting exact and therefore non-pointless,
404   call zsm_flush_cache.  Immediately after it returns, the reference
405   counts for all items, as deduced by the caller by observing calls
406   to SVal__rcinc and SVal__rcdec, will be correct, and so any items with a
407   zero reference count may be freed (or at least considered to be
408   unreferenced by this library).
409*/
410static void zsm_init ( void );
411
412static void zsm_sset_range  ( Addr, SizeT, SVal );
413static void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew );
414static void zsm_scopy_range ( Addr, Addr, SizeT );
415static void zsm_flush_cache ( void );
416
417#endif /* ! __HB_ZSM_H */
418
419
/* Round a up to the nearest multiple of N (a itself if it already is
   one).  N must be a power of 2.  The arguments are parenthesised so
   that expressions such as "1 << k" expand correctly.  Note that N is
   evaluated twice, so it must not have side effects. */
#define ROUNDUP(a, N)   (((a) + (N) - 1) & ~((N) - 1))
/* Round a down to the nearest multiple of N (a itself if it already
   is one).  N must be a power of 2. */
#define ROUNDDN(a, N)   ((a) & ~((N) - 1))
424
425/* True if a belongs in range [start, start + szB[
426   (i.e. start + szB is excluded). */
427static inline Bool address_in_range (Addr a, Addr start,  SizeT szB)
428{
429   /* Checking start <= a && a < start + szB.
430      As start and a are unsigned addresses, the condition can
431      be simplified. */
432   if (CHECK_ZSM)
433      tl_assert ((a - start < szB)
434                 == (start <= a
435                     &&       a < start + szB));
436   return a - start < szB;
437}
438
/* ------ CacheLine ------ */

/* log2 of the address range covered by one cache line */
#define N_LINE_BITS      6 /* must be >= 3 */
/* the address range covered by one cache line */
#define N_LINE_ARANGE    (1 << N_LINE_BITS)
/* one tree for every 8 units of that range */
#define N_LINE_TREES     (N_LINE_ARANGE / 8)
444
445typedef
446   struct {
447      UShort descrs[N_LINE_TREES];
448      SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
449   }
450   CacheLine;
451
/* Sixteen distinct single-bit flags, grouped here by name.
   NOTE(review): presumably these are the bits of a
   CacheLine.descrs[] entry -- confirm against the code using them. */
#define TREE_DESCR_64   (1 << 3)

#define TREE_DESCR_32_0 (1 << 1)
#define TREE_DESCR_32_1 (1 << 5)

#define TREE_DESCR_16_0 (1 << 0)
#define TREE_DESCR_16_1 (1 << 2)
#define TREE_DESCR_16_2 (1 << 4)
#define TREE_DESCR_16_3 (1 << 6)

#define TREE_DESCR_8_0  (1 << 7)
#define TREE_DESCR_8_1  (1 << 8)
#define TREE_DESCR_8_2  (1 << 9)
#define TREE_DESCR_8_3  (1 << 10)
#define TREE_DESCR_8_4  (1 << 11)
#define TREE_DESCR_8_5  (1 << 12)
#define TREE_DESCR_8_6  (1 << 13)
#define TREE_DESCR_8_7  (1 << 14)

#define TREE_DESCR_DTY  (1 << 15)
468
469typedef
470   struct {
471      SVal  dict[4]; /* can represent up to 4 diff values in the line */
472      UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
473                                      dict indexes */
474      /* if dict[0] == SVal_INVALID then dict[1] is a pointer to the
475         LineF to use, and dict[2..] are also SVal_INVALID. */
476   }
477   LineZ; /* compressed rep for a cache line */
478
479/* LineZ.dict[1] is used to store various pointers:
480   * In the first lineZ of a free SecMap, it points to the next free SecMap.
481   * In a lineZ for which we need to use a lineF, it points to the lineF. */
482
483
484typedef
485   struct {
486      SVal w64s[N_LINE_ARANGE];
487   }
488   LineF; /* full rep for a cache line */
489
490/* We use a pool allocator for LineF, as LineF is relatively small,
491   and we will often alloc/release such lines. */
492static PoolAlloc* LineF_pool_allocator;
493
494/* SVal in a lineZ are used to store various pointers.
495   Below are conversion functions to support that. */
496static inline LineF *LineF_Ptr (LineZ *lineZ)
497{
498   tl_assert(lineZ->dict[0] == SVal_INVALID);
499   return SVal2Ptr (lineZ->dict[1]);
500}
501
/* Shadow memory.
   The primary map is a WordFM from Addr to SecMap*.
   Each SecMap covers some page-size-ish section of address space and
     holds it in a compressed representation.
   CacheLine-sized chunks of SecMaps are copied into a Cache.  They
   are decompressed when moved into the cache and recompressed on the
   way out, which is why the cache must operate as a writeback cache
   and not as a writethrough one.

   Each SecMap must hold a power-of-2 number of CacheLines.  Hence
   N_SECMAP_BITS must be >= N_LINE_BITS.
*/
#define N_SECMAP_BITS   13
#define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)

// # CacheLines held by a SecMap
#define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
519
520/* The data in the SecMap is held in the array of LineZs.  Each LineZ
521   either carries the required data directly, in a compressed
522   representation, or it holds (in .dict[1]) a pointer to a LineF
523   that holds the full representation.
524
525   As each in-use LineF is referred to by exactly one LineZ,
526   the number of .linesZ[] that refer to a lineF should equal
527   the number of used lineF.
528
529   RC obligations: the RCs presented to the user include exactly
530   the values in:
531   * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
532   * F reps that are in use
533
534   Hence the following actions at the following transitions are required:
535
536   F rep: alloc'd       -> freed                -- rcdec_LineF
537   F rep:               -> alloc'd              -- rcinc_LineF
538   Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
539   Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
540*/
541
542typedef
543   struct {
544      UInt   magic;
545      LineZ  linesZ[N_SECMAP_ZLINES];
546   }
547   SecMap;
548
549#define SecMap_MAGIC   0x571e58cbU
550
551// (UInt) `echo "Free SecMap" | md5sum`
552#define SecMap_free_MAGIC 0x5a977f30U
553
554__attribute__((unused))
555static inline Bool is_sane_SecMap ( SecMap* sm ) {
556   return sm != NULL && sm->magic == SecMap_MAGIC;
557}
558
/* ------ Cache ------ */

/* log2 of the number of entries, and the number of entries */
#define N_WAY_BITS 16
#define N_WAY_NENT (1 << N_WAY_BITS)
563
564/* Each tag is the address of the associated CacheLine, rounded down
565   to a CacheLine address boundary.  A CacheLine size must be a power
566   of 2 and must be 8 or more.  Hence an easy way to initialise the
567   cache so it is empty is to set all the tag values to any value % 8
568   != 0, eg 1.  This means all queries in the cache initially miss.
569   It does however require us to detect and not writeback, any line
570   with a bogus tag. */
571typedef
572   struct {
573      CacheLine lyns0[N_WAY_NENT];
574      Addr      tags0[N_WAY_NENT];
575   }
576   Cache;
577
578static inline Bool is_valid_scache_tag ( Addr tag ) {
579   /* a valid tag should be naturally aligned to the start of
580      a CacheLine. */
581   return 0 == (tag & (N_LINE_ARANGE - 1));
582}
583
584
585/* --------- Primary data structures --------- */
586
587/* Shadow memory primary map */
588static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
589static Cache   cache_shmem;
590
591
592static UWord stats__secmaps_search       = 0; // # SM finds
593static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
594static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
595static UWord stats__secmaps_in_map_shmem = 0; // # SecMaps 'live'
596static UWord stats__secmaps_scanGC       = 0; // # nr of scan GC done.
597static UWord stats__secmaps_scanGCed     = 0; // # SecMaps GC-ed via scan
598static UWord stats__secmaps_ssetGCed     = 0; // # SecMaps GC-ed via setnoaccess
599static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
600static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
601static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
602static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
603static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
604static UWord stats__cache_F_fetches      = 0; // # F lines fetched
605static UWord stats__cache_F_wbacks       = 0; // # F lines written back
606static UWord stats__cache_flushes_invals = 0; // # cache flushes and invals
607static UWord stats__cache_totrefs        = 0; // # total accesses
608static UWord stats__cache_totmisses      = 0; // # misses
609static ULong stats__cache_make_New_arange = 0; // total arange made New
610static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
611static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
612static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
613static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
614static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
615static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
616static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
617static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
618static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
619static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
620static UWord stats__cline_sread08s       = 0; // # calls to s_m_set8
621static UWord stats__cline_swrite08s      = 0; // # calls to s_m_get8
622static UWord stats__cline_swrite16s      = 0; // # calls to s_m_get8
623static UWord stats__cline_swrite32s      = 0; // # calls to s_m_get8
624static UWord stats__cline_swrite64s      = 0; // # calls to s_m_get8
625static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
626static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
627static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
628static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
629static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
630static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
631static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
632static UWord stats__vts__tick            = 0; // # calls to VTS__tick
633static UWord stats__vts__join            = 0; // # calls to VTS__join
634static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
635static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural
636static UWord stats__vts_tab_GC           = 0; // # nr of vts_tab GC
637static UWord stats__vts_pruning          = 0; // # nr of vts pruning
638
639// # calls to VTS__cmp_structural w/ slow case
640static UWord stats__vts__cmp_structural_slow = 0;
641
642// # calls to VTS__indexAt_SLOW
643static UWord stats__vts__indexat_slow = 0;
644
645// # calls to vts_set__find__or__clone_and_add
646static UWord stats__vts_set__focaa    = 0;
647
648// # calls to vts_set__find__or__clone_and_add that lead to an
649// allocation
650static UWord stats__vts_set__focaa_a  = 0;
651
652
653static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
654   return a & ~(N_SECMAP_ARANGE - 1);
655}
656static inline UWord shmem__get_SecMap_offset ( Addr a ) {
657   return a & (N_SECMAP_ARANGE - 1);
658}
659
660
661/*----------------------------------------------------------------*/
662/*--- map_shmem :: WordFM Addr SecMap                          ---*/
663/*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
664/*----------------------------------------------------------------*/
665
666/*--------------- SecMap allocation --------------- */
667
668static HChar* shmem__bigchunk_next = NULL;
669static HChar* shmem__bigchunk_end1 = NULL;
670
671static void* shmem__bigchunk_alloc ( SizeT n )
672{
673   const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
674   tl_assert(n > 0);
675   n = VG_ROUNDUP(n, 16);
676   tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
677   tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
678             <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
679   if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
680      if (0)
681      VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
682                  (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
683      shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
684      if (shmem__bigchunk_next == NULL)
685         VG_(out_of_memory_NORETURN)(
686            "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
687      shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
688   }
689   tl_assert(shmem__bigchunk_next);
690   tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
691   tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
692   shmem__bigchunk_next += n;
693   return shmem__bigchunk_next - n;
694}
695
/* SecMaps that have become fully SVal_NOACCESS are put on a list of
   recycled SecMaps.  When a new SecMap is needed, a recycled SecMap
   is used in preference to allocating a new one. */
/* The recycled SecMaps form a linked list.  The first LineZ of each
   one is re-used to implement the list. */
701/* Returns the SecMap following sm in the free list.
702   NULL if sm is the last SecMap. sm must be on the free list. */
703static inline SecMap *SecMap_freelist_next ( SecMap* sm )
704{
705   tl_assert (sm);
706   tl_assert (sm->magic == SecMap_free_MAGIC);
707   return SVal2Ptr (sm->linesZ[0].dict[1]);
708}
709static inline void set_SecMap_freelist_next ( SecMap* sm, SecMap* next )
710{
711   tl_assert (sm);
712   tl_assert (sm->magic == SecMap_free_MAGIC);
713   tl_assert (next == NULL || next->magic == SecMap_free_MAGIC);
714   sm->linesZ[0].dict[1] = Ptr2SVal (next);
715}
716
717static SecMap *SecMap_freelist = NULL;
718static UWord SecMap_freelist_length(void)
719{
720   SecMap *sm;
721   UWord n = 0;
722
723   sm = SecMap_freelist;
724   while (sm) {
725     n++;
726     sm = SecMap_freelist_next (sm);
727   }
728   return n;
729}
730
731static void push_SecMap_on_freelist(SecMap* sm)
732{
733   if (0) VG_(message)(Vg_DebugMsg, "%p push\n", sm);
734   sm->magic = SecMap_free_MAGIC;
735   set_SecMap_freelist_next(sm, SecMap_freelist);
736   SecMap_freelist = sm;
737}
738/* Returns a free SecMap if there is one.
739   Otherwise, returns NULL. */
740static SecMap *pop_SecMap_from_freelist(void)
741{
742   SecMap *sm;
743
744   sm = SecMap_freelist;
745   if (sm) {
746      tl_assert (sm->magic == SecMap_free_MAGIC);
747      SecMap_freelist = SecMap_freelist_next (sm);
748      if (0) VG_(message)(Vg_DebugMsg, "%p pop\n", sm);
749   }
750   return sm;
751}
752
753static SecMap* shmem__alloc_or_recycle_SecMap ( void )
754{
755   Word    i, j;
756   SecMap* sm = pop_SecMap_from_freelist();
757
758   if (!sm) {
759      sm = shmem__bigchunk_alloc( sizeof(SecMap) );
760      stats__secmaps_allocd++;
761      stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
762      stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
763      stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
764   }
765   if (0) VG_(printf)("alloc_SecMap %p\n",sm);
766   tl_assert(sm);
767   sm->magic = SecMap_MAGIC;
768   for (i = 0; i < N_SECMAP_ZLINES; i++) {
769      sm->linesZ[i].dict[0] = SVal_NOACCESS;
770      sm->linesZ[i].dict[1] = SVal_INVALID;
771      sm->linesZ[i].dict[2] = SVal_INVALID;
772      sm->linesZ[i].dict[3] = SVal_INVALID;
773      for (j = 0; j < N_LINE_ARANGE/4; j++)
774         sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
775   }
776   return sm;
777}
778
779typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
780static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
781
782static SecMap* shmem__find_SecMap ( Addr ga )
783{
784   SecMap* sm    = NULL;
785   Addr    gaKey = shmem__round_to_SecMap_base(ga);
786   // Cache
787   stats__secmaps_search++;
788   if (LIKELY(gaKey == smCache[0].gaKey))
789      return smCache[0].sm;
790   if (LIKELY(gaKey == smCache[1].gaKey)) {
791      SMCacheEnt tmp = smCache[0];
792      smCache[0] = smCache[1];
793      smCache[1] = tmp;
794      return smCache[0].sm;
795   }
796   if (gaKey == smCache[2].gaKey) {
797      SMCacheEnt tmp = smCache[1];
798      smCache[1] = smCache[2];
799      smCache[2] = tmp;
800      return smCache[1].sm;
801   }
802   // end Cache
803   stats__secmaps_search_slow++;
804   if (VG_(lookupFM)( map_shmem,
805                      NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
806      tl_assert(sm != NULL);
807      smCache[2] = smCache[1];
808      smCache[1] = smCache[0];
809      smCache[0].gaKey = gaKey;
810      smCache[0].sm    = sm;
811   } else {
812      tl_assert(sm == NULL);
813   }
814   return sm;
815}
816
/* Scan the SecMaps in map_shmem and count those that can be GC-ed.
   If 'really' is True, the GC is actually performed: each GC-able
   SecMap has its LineFs freed, is removed from map_shmem and is
   pushed on the SecMap freelist; next_SecMap_GC_at is then
   recomputed.  If 'really' is False, nothing is removed and only
   the count is computed (the smCache is invalidated in both cases).
   Returns the number of SecMaps found to be GC-able. */
/* NOT TO BE CALLED FROM WITHIN libzsm. */
/* Value at which the next SecMap GC is due.  It is only written in
   this function; presumably callers compare it against a SecMap
   count -- TODO confirm against the call sites. */
static UWord next_SecMap_GC_at = 1000;
__attribute__((noinline))
static UWord shmem__SecMap_do_GC(Bool really)
{
   UWord secmapW = 0;
   Addr  gaKey;
   UWord examined = 0;   /* nr of SecMaps visited */
   UWord ok_GCed = 0;    /* nr of SecMaps found to be GC-able */

   /* First invalidate the smCache, since SecMaps it refers to may be
      removed from map_shmem below. */
   smCache[0].gaKey = 1;
   smCache[1].gaKey = 1;
   smCache[2].gaKey = 1;
   /* The three assignments above must cover the whole cache. */
   STATIC_ASSERT (3 == sizeof(smCache)/sizeof(smCache[0]));

   VG_(initIterFM)( map_shmem );
   while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
      UWord   i;
      UWord   j;
      UWord   n_linesF = 0;   /* nr of LineZs seen that use a LineF */
      SecMap* sm = (SecMap*)secmapW;
      tl_assert(sm->magic == SecMap_MAGIC);
      Bool ok_to_GC = True;

      examined++;

      /* Deal with the LineZs and the possible LineF of a LineZ.
         The scan stops at the first line that prevents the GC, so
         n_linesF is only complete when ok_to_GC is still True at
         the end of the loop. */
      for (i = 0; i < N_SECMAP_ZLINES && ok_to_GC; i++) {
         LineZ* lineZ = &sm->linesZ[i];
         if (lineZ->dict[0] != SVal_INVALID) {
            /* Z representation: dict[0] must be SVal_NOACCESS and
               none of dict[1..3] may satisfy SVal__isC. */
            ok_to_GC = lineZ->dict[0] == SVal_NOACCESS
               && !SVal__isC (lineZ->dict[1])
               && !SVal__isC (lineZ->dict[2])
               && !SVal__isC (lineZ->dict[3]);
         } else {
            /* F representation: every entry must be SVal_NOACCESS. */
            LineF *lineF = LineF_Ptr(lineZ);
            n_linesF++;
            for (j = 0; j < N_LINE_ARANGE && ok_to_GC; j++)
               ok_to_GC = lineF->w64s[j] == SVal_NOACCESS;
         }
      }
      if (ok_to_GC)
         ok_GCed++;
      if (ok_to_GC && really) {
        SecMap *fm_sm;
        Addr fm_gaKey;
        /* We cannot remove a SecMap from map_shmem while iterating.
           So, stop the iteration, remove the SecMap from map_shmem,
           and restart the iteration at the next SecMap. */
        VG_(doneIterFM) ( map_shmem );
        /* No need to rcdec linesZ or linesF, these are all SVal_NOACCESS.
           We just need to free the lineF referenced by the linesZ. */
        if (n_linesF > 0) {
           /* Stop as soon as all the LineFs counted above are freed. */
           for (i = 0; i < N_SECMAP_ZLINES && n_linesF > 0; i++) {
              LineZ* lineZ = &sm->linesZ[i];
              if (lineZ->dict[0] == SVal_INVALID) {
                 VG_(freeEltPA)( LineF_pool_allocator, LineF_Ptr(lineZ) );
                 n_linesF--;
              }
           }
        }
        /* The SecMap was just obtained from the map, so the deletion
           must succeed and must give back the same key and value. */
        if (!VG_(delFromFM)(map_shmem, &fm_gaKey, (UWord*)&fm_sm, gaKey))
          tl_assert (0);
        stats__secmaps_in_map_shmem--;
        tl_assert (gaKey == fm_gaKey);
        tl_assert (sm == fm_sm);
        stats__secmaps_scanGCed++;
        push_SecMap_on_freelist (sm);
        /* Resume the iteration just after the removed SecMap. */
        VG_(initIterAtFM) (map_shmem, gaKey + N_SECMAP_ARANGE);
      }
   }
   VG_(doneIterFM)( map_shmem );

   if (really) {
      stats__secmaps_scanGC++;
      /* Next GC when we approach the max allocated.
         NOTE(review): this is an unsigned subtraction; it assumes
         stats__secmaps_allocd >= 1000 at this point -- confirm. */
      next_SecMap_GC_at = stats__secmaps_allocd - 1000;
      /* Unless we GCed less than 10%.  We then allow allocating 10%
         more before GCing.  This avoids doing a lot of costly GCs
         in the worst case: the 'growing phase' of an application
         that allocates a lot of memory.
         The worst case can be reproduced e.g. by
             perf/memrw -t 30000000 -b 1000 -r 1 -l 1
         that allocates around 30Gb of memory. */
      if (ok_GCed < stats__secmaps_allocd/10)
         next_SecMap_GC_at = stats__secmaps_allocd + stats__secmaps_allocd/10;

   }

   if (VG_(clo_stats) && really) {
      VG_(message)(Vg_DebugMsg,
                  "libhb: SecMap GC: #%lu scanned %lu, GCed %lu,"
                   " next GC at %lu\n",
                   stats__secmaps_scanGC, examined, ok_GCed,
                   next_SecMap_GC_at);
   }

   return ok_GCed;
}
919
920static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
921{
922   SecMap* sm = shmem__find_SecMap ( ga );
923   if (LIKELY(sm)) {
924      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
925      return sm;
926   } else {
927      /* create a new one */
928      Addr gaKey = shmem__round_to_SecMap_base(ga);
929      sm = shmem__alloc_or_recycle_SecMap();
930      tl_assert(sm);
931      VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
932      stats__secmaps_in_map_shmem++;
933      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
934      return sm;
935   }
936}
937
938/* Returns the nr of linesF which are in use. Note: this is scanning
939   the secmap wordFM. So, this is to be used for statistics only. */
940__attribute__((noinline))
941static UWord shmem__SecMap_used_linesF(void)
942{
943   UWord secmapW = 0;
944   Addr  gaKey;
945   UWord inUse = 0;
946
947   VG_(initIterFM)( map_shmem );
948   while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
949      UWord   i;
950      SecMap* sm = (SecMap*)secmapW;
951      tl_assert(sm->magic == SecMap_MAGIC);
952
953      for (i = 0; i < N_SECMAP_ZLINES; i++) {
954         LineZ* lineZ = &sm->linesZ[i];
955         if (lineZ->dict[0] == SVal_INVALID)
956            inUse++;
957      }
958   }
959   VG_(doneIterFM)( map_shmem );
960
961   return inUse;
962}
963
964/* ------------ LineF and LineZ related ------------ */
965
966static void rcinc_LineF ( LineF* lineF ) {
967   UWord i;
968   for (i = 0; i < N_LINE_ARANGE; i++)
969      SVal__rcinc(lineF->w64s[i]);
970}
971
972static void rcdec_LineF ( LineF* lineF ) {
973   UWord i;
974   for (i = 0; i < N_LINE_ARANGE; i++)
975      SVal__rcdec(lineF->w64s[i]);
976}
977
978static void rcinc_LineZ ( LineZ* lineZ ) {
979   tl_assert(lineZ->dict[0] != SVal_INVALID);
980   SVal__rcinc(lineZ->dict[0]);
981   if (lineZ->dict[1] != SVal_INVALID) SVal__rcinc(lineZ->dict[1]);
982   if (lineZ->dict[2] != SVal_INVALID) SVal__rcinc(lineZ->dict[2]);
983   if (lineZ->dict[3] != SVal_INVALID) SVal__rcinc(lineZ->dict[3]);
984}
985
986static void rcdec_LineZ ( LineZ* lineZ ) {
987   tl_assert(lineZ->dict[0] != SVal_INVALID);
988   SVal__rcdec(lineZ->dict[0]);
989   if (lineZ->dict[1] != SVal_INVALID) SVal__rcdec(lineZ->dict[1]);
990   if (lineZ->dict[2] != SVal_INVALID) SVal__rcdec(lineZ->dict[2]);
991   if (lineZ->dict[3] != SVal_INVALID) SVal__rcdec(lineZ->dict[3]);
992}
993
994inline
995static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
996   Word bix, shft, mask, prep;
997   tl_assert(ix >= 0);
998   bix  = ix >> 2;
999   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1000   mask = 3 << shft;
1001   prep = b2 << shft;
1002   arr[bix] = (arr[bix] & ~mask) | prep;
1003}
1004
1005inline
1006static UWord read_twobit_array ( UChar* arr, UWord ix ) {
1007   Word bix, shft;
1008   tl_assert(ix >= 0);
1009   bix  = ix >> 2;
1010   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1011   return (arr[bix] >> shft) & 3;
1012}
1013
1014/* We cache one free lineF, to avoid pool allocator calls.
1015   Measurement on firefox has shown that this avoids more than 90%
1016   of the PA calls. */
1017static LineF *free_lineF = NULL;
1018
1019/* Allocates a lineF for LineZ. Sets lineZ in a state indicating
1020   lineF has to be used. */
1021static inline LineF *alloc_LineF_for_Z (LineZ *lineZ)
1022{
1023   LineF *lineF;
1024
1025   tl_assert(lineZ->dict[0] == SVal_INVALID);
1026
1027   if (LIKELY(free_lineF)) {
1028      lineF = free_lineF;
1029      free_lineF = NULL;
1030   } else {
1031      lineF = VG_(allocEltPA) ( LineF_pool_allocator );
1032   }
1033   lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1034   lineZ->dict[1] = Ptr2SVal (lineF);
1035
1036   return lineF;
1037}
1038
1039/* rcdec the LineF of lineZ, frees the lineF, and sets lineZ
1040   back to its initial state SVal_NOACCESS (i.e. ready to be
1041   read or written just after SecMap allocation). */
1042static inline void clear_LineF_of_Z (LineZ *lineZ)
1043{
1044   LineF *lineF = LineF_Ptr(lineZ);
1045
1046   rcdec_LineF(lineF);
1047   if (UNLIKELY(free_lineF)) {
1048      VG_(freeEltPA)( LineF_pool_allocator, lineF );
1049   } else {
1050      free_lineF = lineF;
1051   }
1052   lineZ->dict[0] = SVal_NOACCESS;
1053   lineZ->dict[1] = SVal_INVALID;
1054}
1055
1056/* Given address 'tag', find either the Z or F line containing relevant
1057   data, so it can be read into the cache.
1058*/
1059static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
1060                                  /*OUT*/LineF** fp, Addr tag ) {
1061   LineZ* lineZ;
1062   LineF* lineF;
1063   UWord   zix;
1064   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
1065   UWord   smoff = shmem__get_SecMap_offset(tag);
1066   /* since smoff is derived from a valid tag, it should be
1067      cacheline-aligned. */
1068   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1069   zix = smoff >> N_LINE_BITS;
1070   tl_assert(zix < N_SECMAP_ZLINES);
1071   lineZ = &sm->linesZ[zix];
1072   lineF = NULL;
1073   if (lineZ->dict[0] == SVal_INVALID) {
1074      lineF = LineF_Ptr (lineZ);
1075      lineZ = NULL;
1076   }
1077   *zp = lineZ;
1078   *fp = lineF;
1079}
1080
1081/* Given address 'tag', return the relevant SecMap and the index of
1082   the LineZ within it, in the expectation that the line is to be
1083   overwritten.  Regardless of whether 'tag' is currently associated
1084   with a Z or F representation, to rcdec on the current
1085   representation, in recognition of the fact that the contents are
1086   just about to be overwritten. */
1087static __attribute__((noinline))
1088void find_Z_for_writing ( /*OUT*/SecMap** smp,
1089                          /*OUT*/Word* zixp,
1090                          Addr tag ) {
1091   LineZ* lineZ;
1092   UWord   zix;
1093   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
1094   UWord   smoff = shmem__get_SecMap_offset(tag);
1095   /* since smoff is derived from a valid tag, it should be
1096      cacheline-aligned. */
1097   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1098   zix = smoff >> N_LINE_BITS;
1099   tl_assert(zix < N_SECMAP_ZLINES);
1100   lineZ = &sm->linesZ[zix];
1101   /* re RCs, we are rcdec_LineZ/clear_LineF_of_Z this LineZ so that new data
1102      can be parked in it.  Hence have to rcdec it accordingly. */
1103   /* If lineZ has an associated lineF, free it up. */
1104   if (lineZ->dict[0] == SVal_INVALID)
1105      clear_LineF_of_Z(lineZ);
1106   else
1107      rcdec_LineZ(lineZ);
1108   *smp  = sm;
1109   *zixp = zix;
1110}
1111
1112/* ------------ CacheLine and implicit-tree related ------------ */
1113
1114__attribute__((unused))
1115static void pp_CacheLine ( CacheLine* cl ) {
1116   Word i;
1117   if (!cl) {
1118      VG_(printf)("%s","pp_CacheLine(NULL)\n");
1119      return;
1120   }
1121   for (i = 0; i < N_LINE_TREES; i++)
1122      VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
1123   for (i = 0; i < N_LINE_ARANGE; i++)
1124      VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
1125}
1126
/* Maps a tree descriptor 'descr' to a byte of "valid bits": bit i of
   the result is set when slot i of the corresponding 8-entry tree is
   expected to hold a valid shadow value.  Only the descriptor
   combinations listed in the switch below are accepted; any other
   value gives 0, which callers treat as "invalid descriptor". */
static UChar descr_to_validbits ( UShort descr )
{
   /* a.k.a Party Time for gcc's constant folder */
   /* DESCR builds a descriptor from its 15 individual flag bits,
      given in the order 8_7 .. 8_0, 16_3, 32_1, 16_2, 64, 16_1,
      32_0, 16_0 (most significant first). */
#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
                          ( (b16_0) << 0) ) )

   /* BYTE builds the result byte from its 8 bits, most significant
      first. */
#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
                         ( (bit5) << 5) | ( (bit4) << 4) | \
                         ( (bit3) << 3) | ( (bit2) << 2) | \
                         ( (bit1) << 1) | ( (bit0) << 0) ) )

   /* Check that the bit layout assumed by DESCR agrees with the
      TREE_DESCR_* values.
      these should all get folded out at compile time */
   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);

   /* Each case is one legal way of splitting the 8 slots into pieces
      of 8, 16, 32 or 64 bits; the comment on the case line lists the
      pieces from slot 7 down to slot 0.  In the returned byte, each
      piece contributes one set bit, at its lowest slot. */
   switch (descr) {
   /*
              +--------------------------------- TREE_DESCR_8_7
              |             +------------------- TREE_DESCR_8_0
              |             |  +---------------- TREE_DESCR_16_3
              |             |  | +-------------- TREE_DESCR_32_1
              |             |  | | +------------ TREE_DESCR_16_2
              |             |  | | |  +--------- TREE_DESCR_64
              |             |  | | |  |  +------ TREE_DESCR_16_1
              |             |  | | |  |  | +---- TREE_DESCR_32_0
              |             |  | | |  |  | | +-- TREE_DESCR_16_0
              |             |  | | |  |  | | |
              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
                                                 return BYTE(1,1,1,1,1,1,1,1);
   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
                                                 return BYTE(1,1,0,1,1,1,1,1);
   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
                                                 return BYTE(0,1,1,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
                                                 return BYTE(0,1,0,1,1,1,1,1);

   case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
                                                 return BYTE(1,1,1,1,1,1,0,1);
   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
                                                 return BYTE(1,1,0,1,1,1,0,1);
   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
                                                 return BYTE(0,1,1,1,1,1,0,1);
   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
                                                 return BYTE(0,1,0,1,1,1,0,1);

   case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
                                                 return BYTE(1,1,1,1,0,1,1,1);
   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
                                                 return BYTE(1,1,0,1,0,1,1,1);
   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
                                                 return BYTE(0,1,1,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
                                                 return BYTE(0,1,0,1,0,1,1,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
                                                 return BYTE(1,1,1,1,0,1,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
                                                 return BYTE(1,1,0,1,0,1,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
                                                 return BYTE(0,1,1,1,0,1,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
                                                 return BYTE(0,1,0,1,0,1,0,1);

   case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
                                                 return BYTE(0,0,0,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
                                                 return BYTE(0,0,0,1,1,1,0,1);
   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
                                                 return BYTE(0,0,0,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
                                                 return BYTE(0,0,0,1,0,1,0,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
                                                 return BYTE(1,1,1,1,0,0,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
                                                 return BYTE(1,1,0,1,0,0,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
                                                 return BYTE(0,1,1,1,0,0,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
                                                 return BYTE(0,1,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
                                                 return BYTE(0,0,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
                                                 return BYTE(0,0,0,0,0,0,0,1);

   default: return BYTE(0,0,0,0,0,0,0,0);
                   /* INVALID - any valid descr produces at least one
                      valid bit in tree[0..7]*/
   }
   /* NOTREACHED: every path through the switch returns. */
   tl_assert(0);

#  undef DESCR
#  undef BYTE
}
1241
1242__attribute__((unused))
1243static Bool is_sane_Descr ( UShort descr ) {
1244   return descr_to_validbits(descr) != 0;
1245}
1246
1247static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
1248   VG_(sprintf)(dst,
1249                "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
1250                (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
1251                (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
1252                (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
1253                (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
1254                (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
1255                (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
1256                (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
1257                (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
1258                (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
1259                (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
1260                (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
1261                (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
1262                (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
1263                (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
1264                (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
1265   );
1266}
1267static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
1268   VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
1269                     (Int)((byte & 128) ? 1 : 0),
1270                     (Int)((byte &  64) ? 1 : 0),
1271                     (Int)((byte &  32) ? 1 : 0),
1272                     (Int)((byte &  16) ? 1 : 0),
1273                     (Int)((byte &   8) ? 1 : 0),
1274                     (Int)((byte &   4) ? 1 : 0),
1275                     (Int)((byte &   2) ? 1 : 0),
1276                     (Int)((byte &   1) ? 1 : 0)
1277   );
1278}
1279
1280static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
1281   Word  i;
1282   UChar validbits = descr_to_validbits(descr);
1283   HChar buf[128], buf2[128];    // large enough
1284   if (validbits == 0)
1285      goto bad;
1286   for (i = 0; i < 8; i++) {
1287      if (validbits & (1<<i)) {
1288         if (tree[i] == SVal_INVALID)
1289            goto bad;
1290      } else {
1291         if (tree[i] != SVal_INVALID)
1292            goto bad;
1293      }
1294   }
1295   return True;
1296  bad:
1297   sprintf_Descr( buf, descr );
1298   sprintf_Byte( buf2, validbits );
1299   VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
1300   VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
1301   VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
1302   for (i = 0; i < 8; i++)
1303      VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
1304   VG_(printf)("%s","}\n");
1305   return 0;
1306}
1307
1308static Bool is_sane_CacheLine ( CacheLine* cl )
1309{
1310   Word tno, cloff;
1311
1312   if (!cl) goto bad;
1313
1314   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1315      UShort descr = cl->descrs[tno];
1316      SVal*  tree  = &cl->svals[cloff];
1317      if (!is_sane_Descr_and_Tree(descr, tree))
1318         goto bad;
1319   }
1320   tl_assert(cloff == N_LINE_ARANGE);
1321   return True;
1322  bad:
1323   pp_CacheLine(cl);
1324   return False;
1325}
1326
/* Normalises the 8-entry 'tree', whose 8 slots must all hold valid
   shadow values on entry, and returns the descriptor of the result.
   Equal neighbours are merged bottom-up: pairs of equal 8-bit
   entries become a 16-bit entry, pairs of equal 16-bit entries
   become a 32-bit entry, and two equal 32-bit entries become the
   64-bit entry.  A merged entry keeps its value in its lowest slot;
   the slot merged away is set to SVal_INVALID. */
static UShort normalise_tree ( /*MOD*/SVal* tree )
{
   UShort descr;
   /* pre: incoming tree[0..7] does not have any invalid shvals, in
      particular no zeroes. */
   if (CHECK_ZSM
       && UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
                   || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
                   || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
                   || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
      tl_assert(0);

   /* Start with the finest granularity: eight separate 8-bit entries. */
   descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
           | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
           | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   /* build 16-bit layer */
   if (tree[1] == tree[0]) {
      tree[1] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
      descr |= TREE_DESCR_16_0;
   }
   if (tree[3] == tree[2]) {
      tree[3] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
      descr |= TREE_DESCR_16_1;
   }
   if (tree[5] == tree[4]) {
      tree[5] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
      descr |= TREE_DESCR_16_2;
   }
   if (tree[7] == tree[6]) {
      tree[7] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
      descr |= TREE_DESCR_16_3;
   }
   /* build 32-bit layer: two 16-bit entries can only be merged if
      both of them were actually formed above. */
   if (tree[2] == tree[0]
       && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
      tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
      descr |= TREE_DESCR_32_0;
   }
   if (tree[6] == tree[4]
       && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
      tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
      descr |= TREE_DESCR_32_1;
   }
   /* build 64-bit layer: likewise, both 32-bit entries must exist. */
   if (tree[4] == tree[0]
       && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
      tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
      descr |= TREE_DESCR_64;
   }
   return descr;
}
1385
1386/* This takes a cacheline where all the data is at the leaves
1387   (w8[..]) and builds a correctly normalised tree. */
1388static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
1389{
1390   Word tno, cloff;
1391   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1392      SVal* tree = &cl->svals[cloff];
1393      cl->descrs[tno] = normalise_tree( tree );
1394   }
1395   tl_assert(cloff == N_LINE_ARANGE);
1396   if (CHECK_ZSM)
1397      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1398   stats__cline_normalises++;
1399}
1400
1401
/* A run of 'count' consecutive slots that all hold the shadow value
   'sval'. */
typedef struct { UChar count; SVal sval; } CountedSVal;

/* Flattens the cache line 'src' into a sequence of runs, written to
   dst[0 .. *dstUsedP-1] in increasing address order.  Each tree
   entry present according to its descriptor gives one run, whose
   count (8, 4, 2 or 1) is the number of slots it covers.
   'nDst' is the capacity of 'dst' and must be N_LINE_ARANGE.
   The number of runs produced is returned in *dstUsedP. */
static
void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
                               /*OUT*/Word* dstUsedP,
                               Word nDst, CacheLine* src )
{
   Word  tno, cloff, dstUsed;

   tl_assert(nDst == N_LINE_ARANGE);
   dstUsed = 0;

   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = src->descrs[tno];
      SVal*  tree  = &src->svals[cloff];

      /* sequentialise the tree described by (descr,tree). */
      /* PUT appends the run (_n, _v) to dst. */
#     define PUT(_n,_v)                                \
         do { dst[dstUsed  ].count = (_n);             \
              dst[dstUsed++].sval  = (_v);             \
         } while (0)

      /* For each slot, the coarsest entry present at that slot (if
         any) is emitted. */
      /* byte 0 */
      if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
      if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
      if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
      if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
      /* byte 1 */
      if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
      /* byte 2 */
      if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
      if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
      /* byte 3 */
      if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
      /* byte 4 */
      if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
      if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
      if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
      /* byte 5 */
      if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
      /* byte 6 */
      if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
      if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
      /* byte 7 */
      if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);

#     undef PUT
      /* END sequentialise the tree described by (descr,tree). */

   }
   tl_assert(cloff == N_LINE_ARANGE);
   /* NOTE(review): this check is done after dst has been written. */
   tl_assert(dstUsed <= nDst);

   *dstUsedP = dstUsed;
}
1457
/* Write the cacheline 'wix' to backing store.  Where it ends up
   is determined by its tag field.  Nothing is done if the tag is
   not a valid one.

   The line currently in the backing store is first rcdec'd (and its
   lineF, if any, released) by find_Z_for_writing.  The cache line is
   then flattened into runs of equal shadow values, and stored:
   - in compressed (Z) form if it has at most 4 distinct shadow
     values: these go in dict[0..3], and ix2s gets one 2-bit
     dictionary index per slot;
   - in full (F) form otherwise: a lineF is allocated and gets one
     shadow value per slot.
   The stored representation is finally rcinc'd. */
static __attribute__((noinline)) void cacheline_wback ( UWord wix )
{
   Word        i, j, k, m;
   Addr        tag;
   SecMap*     sm;
   CacheLine*  cl;
   LineZ* lineZ;
   LineF* lineF;
   /* NOTE(review): 'fix' is only ever assigned (below), never read. */
   Word        zix, fix, csvalsUsed;
   CountedSVal csvals[N_LINE_ARANGE];
   SVal        sv;

   if (0)
   VG_(printf)("scache wback line %d\n", (Int)wix);

   /* NOTE(review): 'wix' is unsigned, so "wix >= 0" is always true;
      only the upper bound is really checked. */
   tl_assert(wix >= 0 && wix < N_WAY_NENT);

   tag =  cache_shmem.tags0[wix];
   cl  = &cache_shmem.lyns0[wix];

   /* The cache line may have been invalidated; if so, ignore it. */
   if (!is_valid_scache_tag(tag))
      return;

   /* Where are we going to put it? */
   sm         = NULL;
   lineZ      = NULL;
   lineF      = NULL;
   zix = fix = -1;

   /* find the Z line to write in and rcdec it or the associated F
      line. */
   find_Z_for_writing( &sm, &zix, tag );

   tl_assert(sm);
   tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];

   /* Generate the data to be stored */
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */

   csvalsUsed = -1;
   sequentialise_CacheLine( csvals, &csvalsUsed,
                            N_LINE_ARANGE, cl );
   tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
   if (0) VG_(printf)("%ld ", csvalsUsed);

   /* Start with an empty dictionary. */
   lineZ->dict[0] = lineZ->dict[1]
                  = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;

   /* First try the compressed representation. */
   /* i indexes actual shadow values, k is cursor in csvals */
   i = 0;
   for (k = 0; k < csvalsUsed; k++) {

      sv = csvals[k].sval;
      if (CHECK_ZSM)
         tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
      /* do we already have it? */
      if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
      if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
      if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
      if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
      /* no.  look for a free slot. */
      if (CHECK_ZSM)
         tl_assert(sv != SVal_INVALID);
      if (lineZ->dict[0]
          == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
      if (lineZ->dict[1]
          == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
      if (lineZ->dict[2]
          == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
      if (lineZ->dict[3]
          == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
      /* A fifth distinct value: the dictionary is full.  Leaving the
         loop here means i stays below N_LINE_ARANGE. */
      break; /* we'll have to use the f rep */
     dict_ok:
      /* Record dictionary index j for each of the m slots of the run. */
      m = csvals[k].count;
      if (m == 8) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         write_twobit_array( lineZ->ix2s, i+2, j );
         write_twobit_array( lineZ->ix2s, i+3, j );
         write_twobit_array( lineZ->ix2s, i+4, j );
         write_twobit_array( lineZ->ix2s, i+5, j );
         write_twobit_array( lineZ->ix2s, i+6, j );
         write_twobit_array( lineZ->ix2s, i+7, j );
         i += 8;
      }
      else if (m == 4) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         write_twobit_array( lineZ->ix2s, i+2, j );
         write_twobit_array( lineZ->ix2s, i+3, j );
         i += 4;
      }
      else if (m == 1) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         i += 1;
      }
      else if (m == 2) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         i += 2;
      }
      else {
         tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
      }

   }

   if (LIKELY(i == N_LINE_ARANGE)) {
      /* Construction of the compressed representation was
         successful. */
      rcinc_LineZ(lineZ);
      stats__cache_Z_wbacks++;
   } else {
      /* Cannot use the compressed(z) representation.  Use the full(f)
         rep instead. */
      tl_assert(i >= 0 && i < N_LINE_ARANGE);
      /* Discard the partially built dictionary: alloc_LineF_for_Z
         requires dict[0] to be SVal_INVALID, and overwrites dict[1]
         with the pointer to the lineF. */
      lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
      lineF = alloc_LineF_for_Z (lineZ);
      /* Expand every run into the lineF, one shadow value per slot. */
      i = 0;
      for (k = 0; k < csvalsUsed; k++) {
         if (CHECK_ZSM)
            tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
         sv = csvals[k].sval;
         if (CHECK_ZSM)
            tl_assert(sv != SVal_INVALID);
         for (m = csvals[k].count; m > 0; m--) {
            lineF->w64s[i] = sv;
            i++;
         }
      }
      tl_assert(i == N_LINE_ARANGE);
      rcinc_LineF(lineF);
      stats__cache_F_wbacks++;
   }
}
1598
1599/* Fetch the cacheline 'wix' from the backing store.  The tag
1600   associated with 'wix' is assumed to have already been filled in;
1601   hence that is used to determine where in the backing store to read
1602   from. */
1603static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
1604{
1605   Word       i;
1606   Addr       tag;
1607   CacheLine* cl;
1608   LineZ*     lineZ;
1609   LineF*     lineF;
1610
1611   if (0)
1612   VG_(printf)("scache fetch line %d\n", (Int)wix);
1613
1614   tl_assert(wix >= 0 && wix < N_WAY_NENT);
1615
1616   tag =  cache_shmem.tags0[wix];
1617   cl  = &cache_shmem.lyns0[wix];
1618
1619   /* reject nonsense requests */
1620   tl_assert(is_valid_scache_tag(tag));
1621
1622   lineZ = NULL;
1623   lineF = NULL;
1624   find_ZF_for_reading( &lineZ, &lineF, tag );
1625   tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
1626
1627   /* expand the data into the bottom layer of the tree, then get
1628      cacheline_normalise to build the descriptor array. */
1629   if (lineF) {
1630      for (i = 0; i < N_LINE_ARANGE; i++) {
1631         cl->svals[i] = lineF->w64s[i];
1632      }
1633      stats__cache_F_fetches++;
1634   } else {
1635      for (i = 0; i < N_LINE_ARANGE; i++) {
1636         UWord ix = read_twobit_array( lineZ->ix2s, i );
1637         if (CHECK_ZSM) tl_assert(ix >= 0 && ix <= 3);
1638         cl->svals[i] = lineZ->dict[ix];
1639         if (CHECK_ZSM) tl_assert(cl->svals[i] != SVal_INVALID);
1640      }
1641      stats__cache_Z_fetches++;
1642   }
1643   normalise_CacheLine( cl );
1644}
1645
1646/* Invalid the cachelines corresponding to the given range, which
1647   must start and end on a cacheline boundary. */
1648static void shmem__invalidate_scache_range (Addr ga, SizeT szB)
1649{
1650   Word wix;
1651
1652   /* ga must be on a cacheline boundary. */
1653   tl_assert (is_valid_scache_tag (ga));
1654   /* szB must be a multiple of cacheline size. */
1655   tl_assert (0 == (szB & (N_LINE_ARANGE - 1)));
1656
1657
1658   Word ga_ix = (ga >> N_LINE_BITS) & (N_WAY_NENT - 1);
1659   Word nwix = szB / N_LINE_ARANGE;
1660
1661   if (nwix > N_WAY_NENT)
1662      nwix = N_WAY_NENT; // no need to check several times the same entry.
1663
1664   for (wix = 0; wix < nwix; wix++) {
1665      if (address_in_range(cache_shmem.tags0[ga_ix], ga, szB))
1666         cache_shmem.tags0[ga_ix] = 1/*INVALID*/;
1667      ga_ix++;
1668      if (UNLIKELY(ga_ix == N_WAY_NENT))
1669         ga_ix = 0;
1670   }
1671}
1672
1673
1674static void shmem__flush_and_invalidate_scache ( void ) {
1675   Word wix;
1676   Addr tag;
1677   if (0) VG_(printf)("%s","scache flush and invalidate\n");
1678   tl_assert(!is_valid_scache_tag(1));
1679   for (wix = 0; wix < N_WAY_NENT; wix++) {
1680      tag = cache_shmem.tags0[wix];
1681      if (tag == 1/*INVALID*/) {
1682         /* already invalid; nothing to do */
1683      } else {
1684         tl_assert(is_valid_scache_tag(tag));
1685         cacheline_wback( wix );
1686      }
1687      cache_shmem.tags0[wix] = 1/*INVALID*/;
1688   }
1689   stats__cache_flushes_invals++;
1690}
1691
1692
1693static inline Bool aligned16 ( Addr a ) {
1694   return 0 == (a & 1);
1695}
1696static inline Bool aligned32 ( Addr a ) {
1697   return 0 == (a & 3);
1698}
1699static inline Bool aligned64 ( Addr a ) {
1700   return 0 == (a & 7);
1701}
1702static inline UWord get_cacheline_offset ( Addr a ) {
1703   return (UWord)(a & (N_LINE_ARANGE - 1));
1704}
1705static inline Addr cacheline_ROUNDUP ( Addr a ) {
1706   return ROUNDUP(a, N_LINE_ARANGE);
1707}
1708static inline Addr cacheline_ROUNDDN ( Addr a ) {
1709   return ROUNDDN(a, N_LINE_ARANGE);
1710}
1711static inline UWord get_treeno ( Addr a ) {
1712   return get_cacheline_offset(a) >> 3;
1713}
1714static inline UWord get_tree_offset ( Addr a ) {
1715   return a & 7;
1716}
1717
1718static __attribute__((noinline))
1719       CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
1720static inline CacheLine* get_cacheline ( Addr a )
1721{
1722   /* tag is 'a' with the in-line offset masked out,
1723      eg a[31]..a[4] 0000 */
1724   Addr       tag = a & ~(N_LINE_ARANGE - 1);
1725   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1726   stats__cache_totrefs++;
1727   if (LIKELY(tag == cache_shmem.tags0[wix])) {
1728      return &cache_shmem.lyns0[wix];
1729   } else {
1730      return get_cacheline_MISS( a );
1731   }
1732}
1733
1734static __attribute__((noinline))
1735       CacheLine* get_cacheline_MISS ( Addr a )
1736{
1737   /* tag is 'a' with the in-line offset masked out,
1738      eg a[31]..a[4] 0000 */
1739
1740   CacheLine* cl;
1741   Addr*      tag_old_p;
1742   Addr       tag = a & ~(N_LINE_ARANGE - 1);
1743   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1744
1745   tl_assert(tag != cache_shmem.tags0[wix]);
1746
1747   /* Dump the old line into the backing store. */
1748   stats__cache_totmisses++;
1749
1750   cl        = &cache_shmem.lyns0[wix];
1751   tag_old_p = &cache_shmem.tags0[wix];
1752
1753   if (is_valid_scache_tag( *tag_old_p )) {
1754      /* EXPENSIVE and REDUNDANT: callee does it */
1755      if (CHECK_ZSM)
1756         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1757      cacheline_wback( wix );
1758   }
1759   /* and reload the new one */
1760   *tag_old_p = tag;
1761   cacheline_fetch( wix );
1762   if (CHECK_ZSM)
1763      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1764   return cl;
1765}
1766
1767static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1768   stats__cline_64to32pulldown++;
1769   switch (toff) {
1770      case 0: case 4:
1771         tl_assert(descr & TREE_DESCR_64);
1772         tree[4] = tree[0];
1773         descr &= ~TREE_DESCR_64;
1774         descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
1775         break;
1776      default:
1777         tl_assert(0);
1778   }
1779   return descr;
1780}
1781
1782static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1783   stats__cline_32to16pulldown++;
1784   switch (toff) {
1785      case 0: case 2:
1786         if (!(descr & TREE_DESCR_32_0)) {
1787            descr = pulldown_to_32(tree, 0, descr);
1788         }
1789         tl_assert(descr & TREE_DESCR_32_0);
1790         tree[2] = tree[0];
1791         descr &= ~TREE_DESCR_32_0;
1792         descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
1793         break;
1794      case 4: case 6:
1795         if (!(descr & TREE_DESCR_32_1)) {
1796            descr = pulldown_to_32(tree, 4, descr);
1797         }
1798         tl_assert(descr & TREE_DESCR_32_1);
1799         tree[6] = tree[4];
1800         descr &= ~TREE_DESCR_32_1;
1801         descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
1802         break;
1803      default:
1804         tl_assert(0);
1805   }
1806   return descr;
1807}
1808
1809static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1810   stats__cline_16to8pulldown++;
1811   switch (toff) {
1812      case 0: case 1:
1813         if (!(descr & TREE_DESCR_16_0)) {
1814            descr = pulldown_to_16(tree, 0, descr);
1815         }
1816         tl_assert(descr & TREE_DESCR_16_0);
1817         tree[1] = tree[0];
1818         descr &= ~TREE_DESCR_16_0;
1819         descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
1820         break;
1821      case 2: case 3:
1822         if (!(descr & TREE_DESCR_16_1)) {
1823            descr = pulldown_to_16(tree, 2, descr);
1824         }
1825         tl_assert(descr & TREE_DESCR_16_1);
1826         tree[3] = tree[2];
1827         descr &= ~TREE_DESCR_16_1;
1828         descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
1829         break;
1830      case 4: case 5:
1831         if (!(descr & TREE_DESCR_16_2)) {
1832            descr = pulldown_to_16(tree, 4, descr);
1833         }
1834         tl_assert(descr & TREE_DESCR_16_2);
1835         tree[5] = tree[4];
1836         descr &= ~TREE_DESCR_16_2;
1837         descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
1838         break;
1839      case 6: case 7:
1840         if (!(descr & TREE_DESCR_16_3)) {
1841            descr = pulldown_to_16(tree, 6, descr);
1842         }
1843         tl_assert(descr & TREE_DESCR_16_3);
1844         tree[7] = tree[6];
1845         descr &= ~TREE_DESCR_16_3;
1846         descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
1847         break;
1848      default:
1849         tl_assert(0);
1850   }
1851   return descr;
1852}
1853
1854
1855static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
1856   UShort mask;
1857   switch (toff) {
1858      case 0:
1859         mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
1860         tl_assert( (descr & mask) == mask );
1861         descr &= ~mask;
1862         descr |= TREE_DESCR_16_0;
1863         break;
1864      case 2:
1865         mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
1866         tl_assert( (descr & mask) == mask );
1867         descr &= ~mask;
1868         descr |= TREE_DESCR_16_1;
1869         break;
1870      case 4:
1871         mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
1872         tl_assert( (descr & mask) == mask );
1873         descr &= ~mask;
1874         descr |= TREE_DESCR_16_2;
1875         break;
1876      case 6:
1877         mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
1878         tl_assert( (descr & mask) == mask );
1879         descr &= ~mask;
1880         descr |= TREE_DESCR_16_3;
1881         break;
1882      default:
1883         tl_assert(0);
1884   }
1885   return descr;
1886}
1887
1888static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
1889   UShort mask;
1890   switch (toff) {
1891      case 0:
1892         if (!(descr & TREE_DESCR_16_0))
1893            descr = pullup_descr_to_16(descr, 0);
1894         if (!(descr & TREE_DESCR_16_1))
1895            descr = pullup_descr_to_16(descr, 2);
1896         mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
1897         tl_assert( (descr & mask) == mask );
1898         descr &= ~mask;
1899         descr |= TREE_DESCR_32_0;
1900         break;
1901      case 4:
1902         if (!(descr & TREE_DESCR_16_2))
1903            descr = pullup_descr_to_16(descr, 4);
1904         if (!(descr & TREE_DESCR_16_3))
1905            descr = pullup_descr_to_16(descr, 6);
1906         mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
1907         tl_assert( (descr & mask) == mask );
1908         descr &= ~mask;
1909         descr |= TREE_DESCR_32_1;
1910         break;
1911      default:
1912         tl_assert(0);
1913   }
1914   return descr;
1915}
1916
1917static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
1918   switch (toff) {
1919      case 0: case 4:
1920         return 0 != (descr & TREE_DESCR_64);
1921      default:
1922         tl_assert(0);
1923   }
1924}
1925
1926static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
1927   switch (toff) {
1928      case 0:
1929         return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
1930      case 2:
1931         return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
1932      case 4:
1933         return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
1934      case 6:
1935         return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
1936      default:
1937         tl_assert(0);
1938   }
1939}
1940
1941/* ------------ Cache management ------------ */
1942
/* Flush the shadow-memory cache.  This is a plain wrapper around
   shmem__flush_and_invalidate_scache. */
static void zsm_flush_cache ( void )
{
   shmem__flush_and_invalidate_scache();
}
1947
1948
1949static void zsm_init ( void )
1950{
1951   tl_assert( sizeof(UWord) == sizeof(Addr) );
1952
1953   tl_assert(map_shmem == NULL);
1954   map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
1955                           HG_(free),
1956                           NULL/*unboxed UWord cmp*/);
1957   /* Invalidate all cache entries. */
1958   tl_assert(!is_valid_scache_tag(1));
1959   for (UWord wix = 0; wix < N_WAY_NENT; wix++) {
1960      cache_shmem.tags0[wix] = 1/*INVALID*/;
1961   }
1962
1963   LineF_pool_allocator = VG_(newPA) (
1964                             sizeof(LineF),
1965                             /* Nr elements/pool to fill a core arena block
1966                                taking some arena overhead into account. */
1967                             (4 * 1024 * 1024 - 200)/sizeof(LineF),
1968                             HG_(zalloc),
1969                             "libhb.LineF_storage.pool",
1970                             HG_(free)
1971                          );
1972
1973   /* a SecMap must contain an integral number of CacheLines */
1974   tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
1975   /* also ... a CacheLine holds an integral number of trees */
1976   tl_assert(0 == (N_LINE_ARANGE % 8));
1977}
1978
1979/////////////////////////////////////////////////////////////////
1980/////////////////////////////////////////////////////////////////
1981//                                                             //
1982// SECTION END compressed shadow memory                        //
1983//                                                             //
1984/////////////////////////////////////////////////////////////////
1985/////////////////////////////////////////////////////////////////
1986
1987
1988
1989/////////////////////////////////////////////////////////////////
1990/////////////////////////////////////////////////////////////////
1991//                                                             //
1992// SECTION BEGIN vts primitives                                //
1993//                                                             //
1994/////////////////////////////////////////////////////////////////
1995/////////////////////////////////////////////////////////////////
1996
1997
/* There's a 1-1 mapping between Thr*s and ThrIDs -- the latter merely
   being compact stand-ins for Thr*'s.  Use these functions to map
   between them.  These are forward declarations only. */
static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */
2003
2004__attribute__((noreturn))
2005static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
2006{
2007   if (due_to_nThrs) {
2008      const HChar* s =
2009         "\n"
2010         "Helgrind: cannot continue, run aborted: too many threads.\n"
2011         "Sorry.  Helgrind can only handle programs that create\n"
2012         "%'llu or fewer threads over their entire lifetime.\n"
2013         "\n";
2014      VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
2015   } else {
2016      const HChar* s =
2017         "\n"
2018         "Helgrind: cannot continue, run aborted: too many\n"
2019         "synchronisation events.  Sorry. Helgrind can only handle\n"
2020         "programs which perform %'llu or fewer\n"
2021         "inter-thread synchronisation events (locks, unlocks, etc).\n"
2022         "\n";
2023      VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
2024   }
2025   VG_(exit)(1);
2026   /*NOTREACHED*/
2027   tl_assert(0); /*wtf?!*/
2028}
2029
2030
/* The dead thread (ThrID, actually) tables.  A thread may only be
   listed here if we have been notified thereof by libhb_async_exit.
   New entries are added at the end.  The order isn't important, but
   the ThrID values must be unique.

   verydead_thread_table_not_pruned lists the identities of the
   threads that have died since the previous round of pruning.  Once
   pruning is done, those ThrIDs are added to verydead_thread_table.

   We don't actually need to keep the set of all threads that have
   ever died -- only those that have died since the previous round of
   pruning.  But it's useful for sanity check purposes to keep the
   entire set, so we do. */
static XArray* /* of ThrID */ verydead_thread_table_not_pruned = NULL;
static XArray* /* of ThrID */ verydead_thread_table = NULL;
2044
2045/* Arbitrary total ordering on ThrIDs. */
2046static Int cmp__ThrID ( const void* v1, const void* v2 ) {
2047   ThrID id1 = *(const ThrID*)v1;
2048   ThrID id2 = *(const ThrID*)v2;
2049   if (id1 < id2) return -1;
2050   if (id1 > id2) return 1;
2051   return 0;
2052}
2053
2054static void verydead_thread_tables_init ( void )
2055{
2056   tl_assert(!verydead_thread_table);
2057   tl_assert(!verydead_thread_table_not_pruned);
2058   verydead_thread_table
2059     = VG_(newXA)( HG_(zalloc),
2060                   "libhb.verydead_thread_table_init.1",
2061                   HG_(free), sizeof(ThrID) );
2062   VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
2063   verydead_thread_table_not_pruned
2064     = VG_(newXA)( HG_(zalloc),
2065                   "libhb.verydead_thread_table_init.2",
2066                   HG_(free), sizeof(ThrID) );
2067   VG_(setCmpFnXA)(verydead_thread_table_not_pruned, cmp__ThrID);
2068}
2069
2070static void verydead_thread_table_sort_and_check (XArray* thrids)
2071{
2072   UWord i;
2073
2074   VG_(sortXA)( thrids );
2075   /* Sanity check: check for unique .sts.thr values. */
2076   UWord nBT = VG_(sizeXA)( thrids );
2077   if (nBT > 0) {
2078      ThrID thrid1, thrid2;
2079      thrid2 = *(ThrID*)VG_(indexXA)( thrids, 0 );
2080      for (i = 1; i < nBT; i++) {
2081         thrid1 = thrid2;
2082         thrid2 = *(ThrID*)VG_(indexXA)( thrids, i );
2083         tl_assert(thrid1 < thrid2);
2084      }
2085   }
2086   /* Ok, so the dead thread table thrids has unique and in-order keys. */
2087}
2088
/* A VTS contains .ts, its vector clock, and also .id, a field to hold
   a backlink for the caller's convenience.  Since we have no idea
   what to set that to in the library, it always gets set to
   VtsID_INVALID.

   .ts is a trailing variable-length array.  VTS__new allocates room
   for sizeTS+1 entries and stores a sentinel word in the slot at
   ts[sizeTS]; several functions assert that the sentinel is intact. */
typedef
   struct {
      VtsID    id;     /* backlink for the caller; see above */
      UInt     usedTS; /* number of .ts entries in use; <= sizeTS */
      UInt     sizeTS; /* number of .ts entries allocated for use */
      ScalarTS ts[0];  /* the entries, each with a .thrid and a .tym */
   }
   VTS;
2101
/* Allocate a VTS capable of storing 'sizeTS' entries. */
static VTS* VTS__new ( const HChar* who, UInt sizeTS );

/* Make a clone of 'vts', sizing the new array to exactly match the
   number of ScalarTSs present. */
static VTS* VTS__clone ( const HChar* who, VTS* vts );

/* Make a clone of 'vts' with the thrids in 'thridsToDel' removed.
   The new array is sized exactly to hold the number of required
   elements.  'thridsToDel' is an array of ThrIDs to be omitted in the
   clone, and must be in strictly increasing order. */
static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel );

/* Delete this VTS in its entirety. */
static void VTS__delete ( VTS* vts );

/* Create a new singleton VTS in 'out'.  Caller must have
   pre-allocated 'out' sufficiently big to hold the result in all
   possible cases. */
static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );

/* Create in 'out' a VTS which is the same as 'vts' except with
   vts[me]++, so to speak.  Caller must have pre-allocated 'out'
   sufficiently big to hold the result in all possible cases. */
static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );

/* Create in 'out' a VTS which is the join (max) of 'a' and
   'b'. Caller must have pre-allocated 'out' sufficiently big to hold
   the result in all possible cases. */
static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );

/* Compute the partial ordering relation of the two args.  Although we
   could be completely general and return an enumeration value (EQ,
   LT, GT, UN), in fact we only need LEQ, and so we may as well
   hardwire that fact.

   Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
   invalid ThrID).  In the latter case, the returned ThrID identifies
   a point at which LEQ does not hold.  There may be more than one
   such point, but we only care about seeing one of them, not all of
   them.  This rather strange convention is used because sometimes we
   want to know the actual index at which they first differ. */
static UInt VTS__cmpLEQ ( VTS* a, VTS* b );

/* Compute an arbitrary structural (total) ordering on the two args,
   based on their VCs, so they can be looked up in a table, tree, etc.
   Returns -1, 0 or 1. */
static Word VTS__cmp_structural ( VTS* a, VTS* b );

/* Debugging only.  Display the given VTS. */
static void VTS__show ( const VTS* vts );

/* Debugging only.  Return vts[index], so to speak. */
static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );

/* Notify the VTS machinery that a thread has been declared
   comprehensively dead: that is, it has done an async exit AND it has
   been joined with.  This should ensure that its local clocks (.viR
   and .viW) will never again change, and so all mentions of this
   thread from all VTSs in the system may be removed. */
static void VTS__declare_thread_very_dead ( Thr* idx );
2164
2165/*--------------- to do with Vector Timestamps ---------------*/
2166
2167static Bool is_sane_VTS ( VTS* vts )
2168{
2169   UWord     i, n;
2170   ScalarTS  *st1, *st2;
2171   if (!vts) return False;
2172   if (vts->usedTS > vts->sizeTS) return False;
2173   n = vts->usedTS;
2174   if (n == 1) {
2175      st1 = &vts->ts[0];
2176      if (st1->tym == 0)
2177         return False;
2178   }
2179   else
2180   if (n >= 2) {
2181      for (i = 0; i < n-1; i++) {
2182         st1 = &vts->ts[i];
2183         st2 = &vts->ts[i+1];
2184         if (st1->thrid >= st2->thrid)
2185            return False;
2186         if (st1->tym == 0 || st2->tym == 0)
2187            return False;
2188      }
2189   }
2190   return True;
2191}
2192
2193
2194/* Create a new, empty VTS.
2195*/
2196static VTS* VTS__new ( const HChar* who, UInt sizeTS )
2197{
2198   VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
2199   tl_assert(vts->usedTS == 0);
2200   vts->sizeTS = sizeTS;
2201   *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
2202   return vts;
2203}
2204
2205/* Clone this VTS.
2206*/
2207static VTS* VTS__clone ( const HChar* who, VTS* vts )
2208{
2209   tl_assert(vts);
2210   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2211   UInt nTS = vts->usedTS;
2212   VTS* clone = VTS__new(who, nTS);
2213   clone->id = vts->id;
2214   clone->sizeTS = nTS;
2215   clone->usedTS = nTS;
2216   UInt i;
2217   for (i = 0; i < nTS; i++) {
2218      clone->ts[i] = vts->ts[i];
2219   }
2220   tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2221   return clone;
2222}
2223
2224
2225/* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
2226   must be in strictly increasing order.  We could obviously do this
2227   much more efficiently (in linear time) if necessary.
2228*/
2229static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel )
2230{
2231   UInt i, j;
2232   tl_assert(vts);
2233   tl_assert(thridsToDel);
2234   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2235   UInt nTS = vts->usedTS;
2236   /* Figure out how many ScalarTSs will remain in the output. */
2237   UInt nReq = nTS;
2238   for (i = 0; i < nTS; i++) {
2239      ThrID thrid = vts->ts[i].thrid;
2240      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2241         nReq--;
2242   }
2243   tl_assert(nReq <= nTS);
2244   /* Copy the ones that will remain. */
2245   VTS* res = VTS__new(who, nReq);
2246   j = 0;
2247   for (i = 0; i < nTS; i++) {
2248      ThrID thrid = vts->ts[i].thrid;
2249      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2250         continue;
2251      res->ts[j++] = vts->ts[i];
2252   }
2253   tl_assert(j == nReq);
2254   tl_assert(j == res->sizeTS);
2255   res->usedTS = j;
2256   tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
2257   return res;
2258}
2259
2260
2261/* Delete this VTS in its entirety.
2262*/
2263static void VTS__delete ( VTS* vts )
2264{
2265   tl_assert(vts);
2266   tl_assert(vts->usedTS <= vts->sizeTS);
2267   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2268   HG_(free)(vts);
2269}
2270
2271
2272/* Create a new singleton VTS.
2273*/
2274static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
2275{
2276   tl_assert(thr);
2277   tl_assert(tym >= 1);
2278   tl_assert(out);
2279   tl_assert(out->usedTS == 0);
2280   tl_assert(out->sizeTS >= 1);
2281   UInt hi = out->usedTS++;
2282   out->ts[hi].thrid = Thr__to_ThrID(thr);
2283   out->ts[hi].tym   = tym;
2284}
2285
2286
2287/* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
2288   not modified.
2289*/
2290static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
2291{
2292   UInt      i, n;
2293   ThrID     me_thrid;
2294   Bool      found = False;
2295
2296   stats__vts__tick++;
2297
2298   tl_assert(out);
2299   tl_assert(out->usedTS == 0);
2300   if (vts->usedTS >= ThrID_MAX_VALID)
2301      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2302   tl_assert(out->sizeTS >= 1 + vts->usedTS);
2303
2304   tl_assert(me);
2305   me_thrid = Thr__to_ThrID(me);
2306   tl_assert(is_sane_VTS(vts));
2307   n = vts->usedTS;
2308
2309   /* Copy all entries which precede 'me'. */
2310   for (i = 0; i < n; i++) {
2311      ScalarTS* here = &vts->ts[i];
2312      if (UNLIKELY(here->thrid >= me_thrid))
2313         break;
2314      UInt hi = out->usedTS++;
2315      out->ts[hi] = *here;
2316   }
2317
2318   /* 'i' now indicates the next entry to copy, if any.
2319       There are 3 possibilities:
2320       (a) there is no next entry (we used them all up already):
2321           add (me_thrid,1) to the output, and quit
2322       (b) there is a next entry, and its thrid > me_thrid:
2323           add (me_thrid,1) to the output, then copy the remaining entries
2324       (c) there is a next entry, and its thrid == me_thrid:
2325           copy it to the output but increment its timestamp value.
2326           Then copy the remaining entries.  (c) is the common case.
2327   */
2328   tl_assert(i >= 0 && i <= n);
2329   if (i == n) { /* case (a) */
2330      UInt hi = out->usedTS++;
2331      out->ts[hi].thrid = me_thrid;
2332      out->ts[hi].tym   = 1;
2333   } else {
2334      /* cases (b) and (c) */
2335      ScalarTS* here = &vts->ts[i];
2336      if (me_thrid == here->thrid) { /* case (c) */
2337         if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
2338            /* We're hosed.  We have to stop. */
2339            scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
2340         }
2341         UInt hi = out->usedTS++;
2342         out->ts[hi].thrid = here->thrid;
2343         out->ts[hi].tym   = here->tym + 1;
2344         i++;
2345         found = True;
2346      } else { /* case (b) */
2347         UInt hi = out->usedTS++;
2348         out->ts[hi].thrid = me_thrid;
2349         out->ts[hi].tym   = 1;
2350      }
2351      /* And copy any remaining entries. */
2352      for (/*keepgoing*/; i < n; i++) {
2353         ScalarTS* here2 = &vts->ts[i];
2354         UInt hi = out->usedTS++;
2355         out->ts[hi] = *here2;
2356      }
2357   }
2358
2359   tl_assert(is_sane_VTS(out));
2360   tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
2361   tl_assert(out->usedTS <= out->sizeTS);
2362}
2363
2364
2365/* Return a new VTS constructed as the join (max) of the 2 args.
2366   Neither arg is modified.
2367*/
2368static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
2369{
2370   UInt     ia, ib, useda, usedb;
2371   ULong    tyma, tymb, tymMax;
2372   ThrID    thrid;
2373   UInt     ncommon = 0;
2374
2375   stats__vts__join++;
2376
2377   tl_assert(a);
2378   tl_assert(b);
2379   useda = a->usedTS;
2380   usedb = b->usedTS;
2381
2382   tl_assert(out);
2383   tl_assert(out->usedTS == 0);
2384   /* overly conservative test, but doing better involves comparing
2385      the two VTSs, which we don't want to do at this point. */
2386   if (useda + usedb >= ThrID_MAX_VALID)
2387      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2388   tl_assert(out->sizeTS >= useda + usedb);
2389
2390   ia = ib = 0;
2391
2392   while (1) {
2393
2394      /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
2395         from a and b in order, where thrid is the next ThrID
2396         occurring in either a or b, and tyma/b are the relevant
2397         scalar timestamps, taking into account implicit zeroes. */
2398      tl_assert(ia >= 0 && ia <= useda);
2399      tl_assert(ib >= 0 && ib <= usedb);
2400
2401      if        (ia == useda && ib == usedb) {
2402         /* both empty - done */
2403         break;
2404
2405      } else if (ia == useda && ib != usedb) {
2406         /* a empty, use up b */
2407         ScalarTS* tmpb = &b->ts[ib];
2408         thrid = tmpb->thrid;
2409         tyma  = 0;
2410         tymb  = tmpb->tym;
2411         ib++;
2412
2413      } else if (ia != useda && ib == usedb) {
2414         /* b empty, use up a */
2415         ScalarTS* tmpa = &a->ts[ia];
2416         thrid = tmpa->thrid;
2417         tyma  = tmpa->tym;
2418         tymb  = 0;
2419         ia++;
2420
2421      } else {
2422         /* both not empty; extract lowest-ThrID'd triple */
2423         ScalarTS* tmpa = &a->ts[ia];
2424         ScalarTS* tmpb = &b->ts[ib];
2425         if (tmpa->thrid < tmpb->thrid) {
2426            /* a has the lowest unconsidered ThrID */
2427            thrid = tmpa->thrid;
2428            tyma  = tmpa->tym;
2429            tymb  = 0;
2430            ia++;
2431         } else if (tmpa->thrid > tmpb->thrid) {
2432            /* b has the lowest unconsidered ThrID */
2433            thrid = tmpb->thrid;
2434            tyma  = 0;
2435            tymb  = tmpb->tym;
2436            ib++;
2437         } else {
2438            /* they both next mention the same ThrID */
2439            tl_assert(tmpa->thrid == tmpb->thrid);
2440            thrid = tmpa->thrid; /* == tmpb->thrid */
2441            tyma  = tmpa->tym;
2442            tymb  = tmpb->tym;
2443            ia++;
2444            ib++;
2445            ncommon++;
2446         }
2447      }
2448
2449      /* having laboriously determined (thr, tyma, tymb), do something
2450         useful with it. */
2451      tymMax = tyma > tymb ? tyma : tymb;
2452      if (tymMax > 0) {
2453         UInt hi = out->usedTS++;
2454         out->ts[hi].thrid = thrid;
2455         out->ts[hi].tym   = tymMax;
2456      }
2457
2458   }
2459
2460   tl_assert(is_sane_VTS(out));
2461   tl_assert(out->usedTS <= out->sizeTS);
2462   tl_assert(out->usedTS == useda + usedb - ncommon);
2463}
2464
2465
2466/* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
2467   they are, or the first ThrID for which they are not (no valid ThrID
2468   has the value zero).  This rather strange convention is used
2469   because sometimes we want to know the actual index at which they
2470   first differ. */
2471static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
2472{
2473   Word  ia, ib, useda, usedb;
2474   ULong tyma, tymb;
2475
2476   stats__vts__cmpLEQ++;
2477
2478   tl_assert(a);
2479   tl_assert(b);
2480   useda = a->usedTS;
2481   usedb = b->usedTS;
2482
2483   ia = ib = 0;
2484
2485   while (1) {
2486
2487      /* This logic is to enumerate doubles (tyma, tymb) drawn
2488         from a and b in order, and tyma/b are the relevant
2489         scalar timestamps, taking into account implicit zeroes. */
2490      ThrID thrid;
2491
2492      tl_assert(ia >= 0 && ia <= useda);
2493      tl_assert(ib >= 0 && ib <= usedb);
2494
2495      if        (ia == useda && ib == usedb) {
2496         /* both empty - done */
2497         break;
2498
2499      } else if (ia == useda && ib != usedb) {
2500         /* a empty, use up b */
2501         ScalarTS* tmpb = &b->ts[ib];
2502         tyma  = 0;
2503         tymb  = tmpb->tym;
2504         thrid = tmpb->thrid;
2505         ib++;
2506
2507      } else if (ia != useda && ib == usedb) {
2508         /* b empty, use up a */
2509         ScalarTS* tmpa = &a->ts[ia];
2510         tyma  = tmpa->tym;
2511         thrid = tmpa->thrid;
2512         tymb  = 0;
2513         ia++;
2514
2515      } else {
2516         /* both not empty; extract lowest-ThrID'd triple */
2517         ScalarTS* tmpa = &a->ts[ia];
2518         ScalarTS* tmpb = &b->ts[ib];
2519         if (tmpa->thrid < tmpb->thrid) {
2520            /* a has the lowest unconsidered ThrID */
2521            tyma  = tmpa->tym;
2522            thrid = tmpa->thrid;
2523            tymb  = 0;
2524            ia++;
2525         }
2526         else
2527         if (tmpa->thrid > tmpb->thrid) {
2528            /* b has the lowest unconsidered ThrID */
2529            tyma  = 0;
2530            tymb  = tmpb->tym;
2531            thrid = tmpb->thrid;
2532            ib++;
2533         } else {
2534            /* they both next mention the same ThrID */
2535            tl_assert(tmpa->thrid == tmpb->thrid);
2536            tyma  = tmpa->tym;
2537            thrid = tmpa->thrid;
2538            tymb  = tmpb->tym;
2539            ia++;
2540            ib++;
2541         }
2542      }
2543
2544      /* having laboriously determined (tyma, tymb), do something
2545         useful with it. */
2546      if (tyma > tymb) {
2547         /* not LEQ at this index.  Quit, since the answer is
2548            determined already. */
2549         tl_assert(thrid >= 1024);
2550         return thrid;
2551      }
2552   }
2553
2554   return 0; /* all points are LEQ => return an invalid ThrID */
2555}
2556
2557
2558/* Compute an arbitrary structural (total) ordering on the two args,
2559   based on their VCs, so they can be looked up in a table, tree, etc.
2560   Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
2561   performance critical so there is some effort expended to make it sa
2562   fast as possible.
2563*/
2564Word VTS__cmp_structural ( VTS* a, VTS* b )
2565{
2566   /* We just need to generate an arbitrary total ordering based on
2567      a->ts and b->ts.  Preferably do it in a way which comes across likely
2568      differences relatively quickly. */
2569   Word     i;
2570   Word     useda = 0,    usedb = 0;
2571   ScalarTS *ctsa = NULL, *ctsb = NULL;
2572
2573   stats__vts__cmp_structural++;
2574
2575   tl_assert(a);
2576   tl_assert(b);
2577
2578   ctsa = &a->ts[0]; useda = a->usedTS;
2579   ctsb = &b->ts[0]; usedb = b->usedTS;
2580
2581   if (LIKELY(useda == usedb)) {
2582      ScalarTS *tmpa = NULL, *tmpb = NULL;
2583      stats__vts__cmp_structural_slow++;
2584      /* Same length vectors.  Find the first difference, if any, as
2585         fast as possible. */
2586      for (i = 0; i < useda; i++) {
2587         tmpa = &ctsa[i];
2588         tmpb = &ctsb[i];
2589         if (LIKELY(tmpa->tym == tmpb->tym
2590                    && tmpa->thrid == tmpb->thrid))
2591            continue;
2592         else
2593            break;
2594      }
2595      if (UNLIKELY(i == useda)) {
2596         /* They're identical. */
2597         return 0;
2598      } else {
2599         tl_assert(i >= 0 && i < useda);
2600         if (tmpa->tym < tmpb->tym) return -1;
2601         if (tmpa->tym > tmpb->tym) return 1;
2602         if (tmpa->thrid < tmpb->thrid) return -1;
2603         if (tmpa->thrid > tmpb->thrid) return 1;
2604         /* we just established them as non-identical, hence: */
2605      }
2606      /*NOTREACHED*/
2607      tl_assert(0);
2608   }
2609
2610   if (useda < usedb) return -1;
2611   if (useda > usedb) return 1;
2612   /*NOTREACHED*/
2613   tl_assert(0);
2614}
2615
2616
2617/* Debugging only.  Display the given VTS.
2618*/
2619static void VTS__show ( const VTS* vts )
2620{
2621   Word      i, n;
2622   tl_assert(vts);
2623
2624   VG_(printf)("[");
2625   n =  vts->usedTS;
2626   for (i = 0; i < n; i++) {
2627      const ScalarTS *st = &vts->ts[i];
2628      VG_(printf)(i < n-1 ? "%d:%llu " : "%d:%llu", st->thrid, (ULong)st->tym);
2629   }
2630   VG_(printf)("]");
2631}
2632
2633
2634/* Debugging only.  Return vts[index], so to speak.
2635*/
2636ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
2637{
2638   UWord i, n;
2639   ThrID idx_thrid = Thr__to_ThrID(idx);
2640   stats__vts__indexat_slow++;
2641   tl_assert(vts);
2642   n = vts->usedTS;
2643   for (i = 0; i < n; i++) {
2644      ScalarTS* st = &vts->ts[i];
2645      if (st->thrid == idx_thrid)
2646         return st->tym;
2647   }
2648   return 0;
2649}
2650
2651
2652/* See comment on prototype above.
2653*/
2654static void VTS__declare_thread_very_dead ( Thr* thr )
2655{
2656   if (0) VG_(printf)("VTQ:  tae %p\n", thr);
2657
2658   tl_assert(thr->llexit_done);
2659   tl_assert(thr->joinedwith_done);
2660
2661   ThrID nyu;
2662   nyu = Thr__to_ThrID(thr);
2663   VG_(addToXA)( verydead_thread_table_not_pruned, &nyu );
2664
2665   /* We can only get here if we're assured that we'll never again
2666      need to look at this thread's ::viR or ::viW.  Set them to
2667      VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
2668      mostly so that we don't wind up pruning them (as that would be
2669      nonsensical: the only interesting ScalarTS entry for a dead
2670      thread is its own index, and the pruning will remove that.). */
2671   VtsID__rcdec(thr->viR);
2672   VtsID__rcdec(thr->viW);
2673   thr->viR = VtsID_INVALID;
2674   thr->viW = VtsID_INVALID;
2675}
2676
2677
2678/////////////////////////////////////////////////////////////////
2679/////////////////////////////////////////////////////////////////
2680//                                                             //
2681// SECTION END vts primitives                                  //
2682//                                                             //
2683/////////////////////////////////////////////////////////////////
2684/////////////////////////////////////////////////////////////////
2685
2686
2687
2688/////////////////////////////////////////////////////////////////
2689/////////////////////////////////////////////////////////////////
2690//                                                             //
2691// SECTION BEGIN main library                                  //
2692//                                                             //
2693/////////////////////////////////////////////////////////////////
2694/////////////////////////////////////////////////////////////////
2695
2696
2697/////////////////////////////////////////////////////////
2698//                                                     //
2699// VTS set                                             //
2700//                                                     //
2701/////////////////////////////////////////////////////////
2702
2703static WordFM* /* WordFM VTS* void */ vts_set = NULL;
2704
2705static void vts_set_init ( void )
2706{
2707   tl_assert(!vts_set);
2708   vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
2709                         HG_(free),
2710                         (Word(*)(UWord,UWord))VTS__cmp_structural );
2711}
2712
2713/* Given a VTS, look in vts_set to see if we already have a
2714   structurally identical one.  If yes, return the pair (True, pointer
2715   to the existing one).  If no, clone this one, add the clone to the
2716   set, and return (False, pointer to the clone). */
2717static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
2718{
2719   UWord keyW, valW;
2720   stats__vts_set__focaa++;
2721   tl_assert(cand->id == VtsID_INVALID);
2722   /* lookup cand (by value) */
2723   if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
2724      /* found it */
2725      tl_assert(valW == 0);
2726      /* if this fails, cand (by ref) was already present (!) */
2727      tl_assert(keyW != (UWord)cand);
2728      *res = (VTS*)keyW;
2729      return True;
2730   } else {
2731      /* not present.  Clone, add and return address of clone. */
2732      stats__vts_set__focaa_a++;
2733      VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
2734      tl_assert(clone != cand);
2735      VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
2736      *res = clone;
2737      return False;
2738   }
2739}
2740
2741
2742/////////////////////////////////////////////////////////
2743//                                                     //
2744// VTS table                                           //
2745//                                                     //
2746/////////////////////////////////////////////////////////
2747
/* Forward declaration: defined later in this file. */
static void VtsID__invalidate_caches ( void ); /* fwds */

/* A type to hold VTS table entries.  An entry's position in the
   table is its VtsID.  Invariants:
   If .vts == NULL, then this entry is not in use, so:
   - .rc == 0
   - this entry is on the freelist (unfortunately, does not imply
     any constraints on value for u.freelink)
   If .vts != NULL, then this entry is in use:
   - .vts is findable in vts_set
   - .vts->id == this entry number
   - no specific value for .rc (even 0 is OK)
   - this entry is not on freelist, so u.freelink == VtsID_INVALID
*/
typedef
   struct {
      VTS*  vts;      /* vts, in vts_set */
      UWord rc;       /* reference count - enough for entire aspace */
      /* The two members share storage; which one is meaningful
         depends on whether the entry is in use. */
      union {
         VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
         VtsID remap;    /* used only during pruning, for used entries */
      } u;
      /* u.freelink only used when vts == NULL,
         u.remap only used when vts != NULL, during pruning. */
   }
   VtsTE;
2773
2774/* The VTS table. */
2775static XArray* /* of VtsTE */ vts_tab = NULL;
2776
2777/* An index into the VTS table, indicating the start of the list of
2778   free (available for use) entries.  If the list is empty, this is
2779   VtsID_INVALID. */
2780static VtsID vts_tab_freelist = VtsID_INVALID;
2781
2782/* Do a GC of vts_tab when the freelist becomes empty AND the size of
2783   vts_tab equals or exceeds this size.  After GC, the value here is
2784   set appropriately so as to check for the next GC point. */
2785static Word vts_next_GC_at = 1000;
2786
2787static void vts_tab_init ( void )
2788{
2789   vts_tab = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
2790                         HG_(free), sizeof(VtsTE) );
2791   vts_tab_freelist = VtsID_INVALID;
2792}
2793
2794/* Add ii to the free list, checking that it looks out-of-use. */
2795static void add_to_free_list ( VtsID ii )
2796{
2797   VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2798   tl_assert(ie->vts == NULL);
2799   tl_assert(ie->rc == 0);
2800   tl_assert(ie->u.freelink == VtsID_INVALID);
2801   ie->u.freelink = vts_tab_freelist;
2802   vts_tab_freelist = ii;
2803}
2804
2805/* Get an entry from the free list.  This will return VtsID_INVALID if
2806   the free list is empty. */
2807static VtsID get_from_free_list ( void )
2808{
2809   VtsID  ii;
2810   VtsTE* ie;
2811   if (vts_tab_freelist == VtsID_INVALID)
2812      return VtsID_INVALID;
2813   ii = vts_tab_freelist;
2814   ie = VG_(indexXA)( vts_tab, ii );
2815   tl_assert(ie->vts == NULL);
2816   tl_assert(ie->rc == 0);
2817   vts_tab_freelist = ie->u.freelink;
2818   return ii;
2819}
2820
2821/* Produce a new VtsID that can be used, either by getting it from
2822   the freelist, or, if that is empty, by expanding vts_tab. */
2823static VtsID get_new_VtsID ( void )
2824{
2825   VtsID ii;
2826   VtsTE te;
2827   ii = get_from_free_list();
2828   if (ii != VtsID_INVALID)
2829      return ii;
2830   te.vts = NULL;
2831   te.rc = 0;
2832   te.u.freelink = VtsID_INVALID;
2833   ii = (VtsID)VG_(addToXA)( vts_tab, &te );
2834   return ii;
2835}
2836
2837
2838/* Indirect callback from lib_zsm. */
2839static void VtsID__rcinc ( VtsID ii )
2840{
2841   VtsTE* ie;
2842   /* VG_(indexXA) does a range check for us */
2843   ie = VG_(indexXA)( vts_tab, ii );
2844   tl_assert(ie->vts); /* else it's not in use */
2845   tl_assert(ie->rc < ~0UL); /* else we can't continue */
2846   tl_assert(ie->vts->id == ii);
2847   ie->rc++;
2848}
2849
2850/* Indirect callback from lib_zsm. */
2851static void VtsID__rcdec ( VtsID ii )
2852{
2853   VtsTE* ie;
2854   /* VG_(indexXA) does a range check for us */
2855   ie = VG_(indexXA)( vts_tab, ii );
2856   tl_assert(ie->vts); /* else it's not in use */
2857   tl_assert(ie->rc > 0); /* else RC snafu */
2858   tl_assert(ie->vts->id == ii);
2859   ie->rc--;
2860}
2861
2862
2863/* Look up 'cand' in our collection of VTSs.  If present, return the
2864   VtsID for the pre-existing version.  If not present, clone it, add
2865   the clone to both vts_tab and vts_set, allocate a fresh VtsID for
2866   it, and return that. */
2867static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
2868{
2869   VTS* in_tab = NULL;
2870   tl_assert(cand->id == VtsID_INVALID);
2871   Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
2872   tl_assert(in_tab);
2873   if (already_have) {
2874      /* We already have a copy of 'cand'.  Use that. */
2875      VtsTE* ie;
2876      tl_assert(in_tab->id != VtsID_INVALID);
2877      ie = VG_(indexXA)( vts_tab, in_tab->id );
2878      tl_assert(ie->vts == in_tab);
2879      return in_tab->id;
2880   } else {
2881      VtsID  ii = get_new_VtsID();
2882      VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2883      ie->vts = in_tab;
2884      ie->rc = 0;
2885      ie->u.freelink = VtsID_INVALID;
2886      in_tab->id = ii;
2887      return ii;
2888   }
2889}
2890
2891
2892static void show_vts_stats ( const HChar* caller )
2893{
2894   UWord nSet, nTab, nLive;
2895   ULong totrc;
2896   UWord n, i;
2897   nSet = VG_(sizeFM)( vts_set );
2898   nTab = VG_(sizeXA)( vts_tab );
2899   totrc = 0;
2900   nLive = 0;
2901   n = VG_(sizeXA)( vts_tab );
2902   for (i = 0; i < n; i++) {
2903      VtsTE* ie = VG_(indexXA)( vts_tab, i );
2904      if (ie->vts) {
2905         nLive++;
2906         totrc += (ULong)ie->rc;
2907      } else {
2908         tl_assert(ie->rc == 0);
2909      }
2910   }
2911   VG_(printf)("  show_vts_stats %s\n", caller);
2912   VG_(printf)("    vts_tab size %4lu\n", nTab);
2913   VG_(printf)("    vts_tab live %4lu\n", nLive);
2914   VG_(printf)("    vts_set size %4lu\n", nSet);
2915   VG_(printf)("        total rc %4llu\n", totrc);
2916}
2917
2918
2919/* --- Helpers for VtsID pruning --- */
2920
2921static
2922void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2923                   /*MOD*/XArray* /* of VtsTE */ new_tab,
2924                   VtsID* ii )
2925{
2926   VtsTE *old_te, *new_te;
2927   VtsID old_id, new_id;
2928   /* We're relying here on VG_(indexXA)'s range checking to assert on
2929      any stupid values, in particular *ii == VtsID_INVALID. */
2930   old_id = *ii;
2931   old_te = VG_(indexXA)( old_tab, old_id );
2932   old_te->rc--;
2933   new_id = old_te->u.remap;
2934   new_te = VG_(indexXA)( new_tab, new_id );
2935   new_te->rc++;
2936   *ii = new_id;
2937}
2938
2939static
2940void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2941                            /*MOD*/XArray* /* of VtsTE */ new_tab,
2942                            SVal* s )
2943{
2944   SVal old_sv, new_sv;
2945   old_sv = *s;
2946   if (SVal__isC(old_sv)) {
2947      VtsID rMin, wMin;
2948      rMin = SVal__unC_Rmin(old_sv);
2949      wMin = SVal__unC_Wmin(old_sv);
2950      remap_VtsID( old_tab, new_tab, &rMin );
2951      remap_VtsID( old_tab, new_tab, &wMin );
2952      new_sv = SVal__mkC( rMin, wMin );
2953      *s = new_sv;
2954  }
2955}
2956
2957
/* Garbage collect vts_tab and, depending on HG_(clo_vts_pruning),
   also prune the VTSs that remain.

   GC phase: invalidate the VtsID caches, flush the zsm cache so that
   reference counts are up to date, then for every in-use entry whose
   .rc is zero remove its VTS from vts_set, delete the VTS and put the
   entry on the free list.  vts_next_GC_at is then set to the larger
   of (2 * number of surviving entries) and the table size.

   Pruning phase (skipped when pruning is off, when in 'auto' mode
   this is not a fifth call, or when no thread has been declared very
   dead since the last pruning): build a new table and set from
   copies of the live VTSs produced by VTS__subtract with
   verydead_thread_table_not_pruned, record the old->new mapping in
   the old entries' u.remap fields, rewrite every VtsID held in shadow
   memory, in the Thrs and in the SOs, and finally install the new
   table and set in place of the old ones.

   show_stats: if True, print table statistics before and after the
   GC phase.

   Precondition (asserted): the vts_tab free list is empty.

   NOT TO BE CALLED FROM WITHIN libzsm. */
__attribute__((noinline))
static void vts_tab__do_GC ( Bool show_stats )
{
   UWord i, nTab, nLive, nFreed;

   /* ---------- BEGIN VTS GC ---------- */
   /* check this is actually necessary. */
   tl_assert(vts_tab_freelist == VtsID_INVALID);

   /* empty the caches for partial order checks and binary joins.  We
      could do better and prune out the entries to be deleted, but it
      ain't worth the hassle. */
   VtsID__invalidate_caches();

   /* First, make the reference counts up to date. */
   zsm_flush_cache();

   nTab = VG_(sizeXA)( vts_tab );

   if (show_stats) {
      VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
      show_vts_stats("before GC");
   }

   /* Now we can inspect the entire vts_tab.  Any entries with zero
      .rc fields are now no longer in use and can be put back on the
      free list, removed from vts_set, and deleted. */
   nFreed = 0;
   for (i = 0; i < nTab; i++) {
      Bool present;
      /* oldV starts at a dummy value so the check below can tell
         that delFromFM really wrote it. */
      UWord oldK = 0, oldV = 12345;
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      if (te->vts == NULL) {
         tl_assert(te->rc == 0);
         continue; /* already on the free list (presumably) */
      }
      if (te->rc > 0)
         continue; /* in use */
      /* Ok, we got one we can free. */
      tl_assert(te->vts->id == i);
      /* first, remove it from vts_set. */
      present = VG_(delFromFM)( vts_set,
                                &oldK, &oldV, (UWord)te->vts );
      tl_assert(present); /* else it isn't in vts_set ?! */
      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
      tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
      /* now free the VTS itself */
      VTS__delete(te->vts);
      te->vts = NULL;
      /* and finally put this entry on the free list */
      tl_assert(te->u.freelink == VtsID_INVALID); /* can't already be on it */
      add_to_free_list( i );
      nFreed++;
   }

   /* Now figure out when the next GC should be.  We'll allow the
      number of VTSs to double before GCing again.  Except of course
      that since we can't (or, at least, don't) shrink vts_tab, we
      can't set the threshold value smaller than it. */
   tl_assert(nFreed <= nTab);
   nLive = nTab - nFreed;
   tl_assert(nLive >= 0 && nLive <= nTab);
   vts_next_GC_at = 2 * nLive;
   if (vts_next_GC_at < nTab)
      vts_next_GC_at = nTab;

   if (show_stats) {
      show_vts_stats("after GC");
      VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
   }

   stats__vts_tab_GC++;
   if (VG_(clo_stats)) {
      tl_assert(nTab > 0); /* guards the division below */
      VG_(message)(Vg_DebugMsg,
                   "libhb: VTS GC: #%lu  old size %lu  live %lu  (%2llu%%)\n",
                   stats__vts_tab_GC,
                   nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
   }
   /* ---------- END VTS GC ---------- */

   /* Decide whether to do VTS pruning.  We have one of three
      settings. */
   /* Counts calls across invocations, for the 'auto' setting. */
   static UInt pruning_auto_ctr = 0; /* do not make non-static */

   Bool do_pruning = False;
   switch (HG_(clo_vts_pruning)) {
      case 0: /* never */
         break;
      case 1: /* auto */
         /* prune on every fifth call that gets this far */
         do_pruning = (++pruning_auto_ctr % 5) == 0;
         break;
      case 2: /* always */
         do_pruning = True;
         break;
      default:
         tl_assert(0);
   }

   /* The rest of this routine only handles pruning, so we can
      quit at this point if it is not to be done. */
   if (!do_pruning)
      return;
   /* No need to do pruning if no thread died since the last pruning as
      no VtsTE can be pruned. */
   if (VG_(sizeXA)( verydead_thread_table_not_pruned) == 0)
      return;

   /* ---------- BEGIN VTS PRUNING ---------- */
   /* Sort and check the very dead threads that died since the last pruning.
      Sorting is used for the check and so that we can quickly look
      up the dead-thread entries as we work through the VTSs. */
   verydead_thread_table_sort_and_check (verydead_thread_table_not_pruned);

   /* We will run through the old table, and create a new table and
      set, at the same time setting the u.remap entries in the old
      table to point to the new entries.  Then, visit every VtsID in
      the system, and replace all of them with new ones, using the
      u.remap entries in the old table.  Finally, we can delete the old
      table and set. */

   XArray* /* of VtsTE */ new_tab
      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
                    HG_(free), sizeof(VtsTE) );

   /* WordFM VTS* void */
   WordFM* new_set
      = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
                    HG_(free),
                    (Word(*)(UWord,UWord))VTS__cmp_structural );

   /* Visit each old VTS.  For each one:

      * make a pruned version

      * search new_set for the pruned version, yielding either
        Nothing (not present) or the new VtsID for it.

      * if not present, allocate a new VtsID for it, insert (pruned
        VTS, new VtsID) in the tree, and set
        remap_table[old VtsID] = new VtsID.

      * if present, set remap_table[old VtsID] = new VtsID, where
        new VtsID was determined by the tree lookup.  Then free up
        the clone.
   */

   /* Counts of VTSs and of their ScalarTS entries, before and after
      pruning; used only for the statistics message at the end. */
   UWord nBeforePruning = 0, nAfterPruning = 0;
   UWord nSTSsBefore = 0, nSTSsAfter = 0;
   /* New VtsIDs are handed out densely, starting from zero. */
   VtsID new_VtsID_ctr = 0;

   for (i = 0; i < nTab; i++) {

      /* For each old VTS .. */
      VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
      VTS*   old_vts = old_te->vts;

      /* Skip it if not in use */
      if (old_te->rc == 0) {
         /* The GC phase above freed every entry with a zero rc. */
         tl_assert(old_vts == NULL);
         continue;
      }
      tl_assert(old_te->u.remap == VtsID_INVALID);
      tl_assert(old_vts != NULL);
      tl_assert(old_vts->id == i);
      tl_assert(old_vts->ts != NULL);

      /* It is in use. Make a pruned version. */
      nBeforePruning++;
      nSTSsBefore += old_vts->usedTS;
      VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
                                   old_vts, verydead_thread_table_not_pruned);
      tl_assert(new_vts->sizeTS == new_vts->usedTS);
      /* Check the marker word expected just past the last entry. */
      tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
                == 0x0ddC0ffeeBadF00dULL);

      /* Get rid of the old VTS and the tree entry.  It's a bit more
         complex to incrementally delete the VTSs now than to nuke
         them all after we're done, but the upside is that we don't
         wind up temporarily storing potentially two complete copies
         of each VTS and hence spiking memory use. */
      UWord oldK = 0, oldV = 12345;
      Bool  present = VG_(delFromFM)( vts_set,
                                      &oldK, &oldV, (UWord)old_vts );
      tl_assert(present); /* else it isn't in vts_set ?! */
      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
      tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
      /* now free the VTS itself */
      VTS__delete(old_vts);
      old_te->vts = NULL;
      old_vts = NULL;

      /* NO MENTIONS of old_vts allowed beyond this point. */

      /* Ok, we have the pruned copy in new_vts.  See if a
         structurally identical version is already present in new_set.
         If so, delete the one we just made and move on; if not, add
         it. */
      VTS*  identical_version = NULL;
      UWord valW = 12345;
      if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
                        (UWord)new_vts)) {
         // already have it
         tl_assert(valW == 0);
         tl_assert(identical_version != NULL);
         tl_assert(identical_version != new_vts);
         VTS__delete(new_vts);
         new_vts = identical_version;
         tl_assert(new_vts->id != VtsID_INVALID);
      } else {
         /* Not present: the lookup must have left both outputs alone. */
         tl_assert(valW == 12345);
         tl_assert(identical_version == NULL);
         new_vts->id = new_VtsID_ctr++;
         Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
         tl_assert(!b);
         VtsTE new_te;
         new_te.vts      = new_vts;
         new_te.rc       = 0;
         new_te.u.freelink = VtsID_INVALID;
         Word j = VG_(addToXA)( new_tab, &new_te );
         /* The new table can never be ahead of the old one, and the
            index just allocated must equal the id just assigned. */
         tl_assert(j <= i);
         tl_assert(j == new_VtsID_ctr - 1);
         // stats
         nAfterPruning++;
         nSTSsAfter += new_vts->usedTS;
      }
      old_te->u.remap = new_vts->id;

   } /* for (i = 0; i < nTab; i++) */

   /* Move very dead thread from verydead_thread_table_not_pruned to
      verydead_thread_table. Sort and check verydead_thread_table
      to verify a thread was reported very dead only once. */
   {
      UWord nBT = VG_(sizeXA)( verydead_thread_table_not_pruned);

      for (i = 0; i < nBT; i++) {
         ThrID thrid =
            *(ThrID*)VG_(indexXA)( verydead_thread_table_not_pruned, i );
         VG_(addToXA)( verydead_thread_table, &thrid );
      }
      verydead_thread_table_sort_and_check (verydead_thread_table);
      VG_(dropHeadXA) (verydead_thread_table_not_pruned, nBT);
   }

   /* At this point, we have:
      * the old VTS table, with its u.remap entries set,
        and with all .vts == NULL.
      * the old VTS tree should be empty, since it and the old VTSs
        it contained have been incrementally deleted as we worked
        through the old table.
      * the new VTS table, with all .rc == 0, all u.freelink and u.remap
        == VtsID_INVALID.
      * the new VTS tree.
   */
   tl_assert( VG_(sizeFM)(vts_set) == 0 );

   /* Now actually apply the mapping. */
   /* Visit all the VtsIDs in the entire system.  Where do we expect
      to find them?
      (a) in shadow memory -- the LineZs and LineFs
      (b) in our collection of struct _Thrs.
      (c) in our collection of struct _SOs.
      Nowhere else, AFAICS.  Not in the zsm cache, because that just
      got invalidated.

      Using the u.remap fields in vts_tab, map each old VtsID to a new
      VtsID.  For each old VtsID, dec its rc; and for each new one,
      inc it.  This sets up the new refcounts, and it also gives a
      cheap sanity check of the old ones: all old refcounts should be
      zero after this operation.
   */

   /* Do the mappings for (a) above: iterate over the Primary shadow
      mem map (WordFM Addr SecMap*). */
   UWord secmapW = 0;
   VG_(initIterFM)( map_shmem );
   while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
      UWord   j;
      SecMap* sm = (SecMap*)secmapW;
      tl_assert(sm->magic == SecMap_MAGIC);
      /* Deal with the LineZs */
      for (i = 0; i < N_SECMAP_ZLINES; i++) {
         LineZ* lineZ = &sm->linesZ[i];
         if (lineZ->dict[0] != SVal_INVALID) {
            /* The line's values live in its own four dict slots. */
            for (j = 0; j < 4; j++)
               remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
         } else {
            /* dict[0] is invalid: dict[1] instead leads to a LineF
               holding the line's values. */
            LineF* lineF = SVal2Ptr (lineZ->dict[1]);
            for (j = 0; j < N_LINE_ARANGE; j++)
               remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
         }
      }
   }
   VG_(doneIterFM)( map_shmem );

   /* Do the mappings for (b) above: visit our collection of struct
      _Thrs. */
   Thread* hgthread = get_admin_threads();
   tl_assert(hgthread);
   while (hgthread) {
      Thr* hbthr = hgthread->hbthr;
      tl_assert(hbthr);
      /* Threads that are listed in the prunable set have their viR
         and viW set to VtsID_INVALID, so we can't mess with them. */
      if (hbthr->llexit_done && hbthr->joinedwith_done) {
         tl_assert(hbthr->viR == VtsID_INVALID);
         tl_assert(hbthr->viW == VtsID_INVALID);
         hgthread = hgthread->admin;
         continue;
      }
      remap_VtsID( vts_tab, new_tab, &hbthr->viR );
      remap_VtsID( vts_tab, new_tab, &hbthr->viW );
      hgthread = hgthread->admin;
   }

   /* Do the mappings for (c) above: visit the struct _SOs. */
   SO* so = admin_SO;
   while (so) {
      /* Unlike a live Thr, an SO may legitimately hold
         VtsID_INVALID, so each field is checked first. */
      if (so->viR != VtsID_INVALID)
         remap_VtsID( vts_tab, new_tab, &so->viR );
      if (so->viW != VtsID_INVALID)
         remap_VtsID( vts_tab, new_tab, &so->viW );
      so = so->admin_next;
   }

   /* So, we're nearly done (with this incredibly complex operation).
      Check the refcounts for the old VtsIDs all fell to zero, as
      expected.  Any failure is serious. */
   for (i = 0; i < nTab; i++) {
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      tl_assert(te->vts == NULL);
      /* This is the assert proper.  Note we're also asserting
         zeroness for old entries which are unmapped.  That's OK. */
      tl_assert(te->rc == 0);
   }

   /* Install the new table and set. */
   VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
   vts_set = new_set;
   VG_(deleteXA)( vts_tab );
   vts_tab = new_tab;

   /* The freelist of vts_tab entries is empty now, because we've
      compacted all of the live entries at the low end of the
      table. */
   vts_tab_freelist = VtsID_INVALID;

   /* Sanity check vts_set and vts_tab. */

   /* Because all the live entries got slid down to the bottom of vts_tab: */
   tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));

   /* Assert that the vts_tab and vts_set entries point at each other
      in the required way */
   UWord wordK = 0, wordV = 0;
   VG_(initIterFM)( vts_set );
   while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
      tl_assert(wordK != 0);
      tl_assert(wordV == 0);
      VTS* vts = (VTS*)wordK;
      tl_assert(vts->id != VtsID_INVALID);
      VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
      tl_assert(te->vts == vts);
   }
   VG_(doneIterFM)( vts_set );

   /* Also iterate over the table, and check each entry is
      plausible. */
   /* From here on nTab is the size of the new table. */
   nTab = VG_(sizeXA)( vts_tab );
   for (i = 0; i < nTab; i++) {
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      tl_assert(te->vts);
      tl_assert(te->vts->id == i);
      tl_assert(te->rc > 0); /* 'cos we just GC'd */
      tl_assert(te->u.freelink == VtsID_INVALID); /* in use */
      /* value of te->u.remap  not relevant */
   }

   /* And we're done.  Bwahahaha. Ha. Ha. Ha. */
   stats__vts_pruning++;
   if (VG_(clo_stats)) {
      tl_assert(nTab > 0);
      /* The "? :" guards avoid dividing by zero in the averages. */
      VG_(message)(
         Vg_DebugMsg,
         "libhb: VTS PR: #%lu  before %lu (avg sz %lu)  "
            "after %lu (avg sz %lu)\n",
         stats__vts_pruning,
         nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
         nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
      );
   }
   /* ---------- END VTS PRUNING ---------- */
}
3353
3354
3355/////////////////////////////////////////////////////////
3356//                                                     //
3357// Vts IDs                                             //
3358//                                                     //
3359/////////////////////////////////////////////////////////
3360
3361//////////////////////////
/* A scratch, max-sized VTS.  The VtsID__ wrappers below pass it as
   the first argument to VTS__singleton, VTS__tick and VTS__join,
   resetting its usedTS field to zero before each such call, and then
   hand it to vts_tab__find__or__clone_and_add.  It is NULL here;
   presumably it is allocated elsewhere before first use. */
static VTS* temp_max_sized_VTS = NULL;
3365
3366//////////////////////////
/* Hit-rate statistics for the two caches below.  Each *_queries
   counter is bumped on every call to the corresponding _WRK function;
   each *_misses counter is bumped only when the cache lookup fails
   and the result has to be computed. */
static ULong stats__cmpLEQ_queries = 0;
static ULong stats__cmpLEQ_misses  = 0;
static ULong stats__join2_queries  = 0;
static ULong stats__join2_misses   = 0;
3371
3372static inline UInt ROL32 ( UInt w, Int n ) {
3373   w = (w << n) | (w >> (32-n));
3374   return w;
3375}
3376static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
3377   UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
3378   return hash % nTab;
3379}
3380
/* Cache of recent VtsID__cmpLEQ_WRK results.  Each slot remembers one
   (vi1,vi2) query and its answer; the slot for a query is selected
   with hash_VtsIDs and simply overwritten on a miss. */
#define N_CMPLEQ_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; Bool leq; }
   cmpLEQ_cache[N_CMPLEQ_CACHE];

/* Cache of recent VtsID__join2_WRK results, organised the same way:
   one (vi1,vi2) query and the resulting VtsID per slot. */
#define N_JOIN2_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; VtsID res; }
   join2_cache[N_JOIN2_CACHE];
3390
3391static void VtsID__invalidate_caches ( void ) {
3392   Int i;
3393   for (i = 0; i < N_CMPLEQ_CACHE; i++) {
3394      cmpLEQ_cache[i].vi1 = VtsID_INVALID;
3395      cmpLEQ_cache[i].vi2 = VtsID_INVALID;
3396      cmpLEQ_cache[i].leq = False;
3397   }
3398   for (i = 0; i < N_JOIN2_CACHE; i++) {
3399     join2_cache[i].vi1 = VtsID_INVALID;
3400     join2_cache[i].vi2 = VtsID_INVALID;
3401     join2_cache[i].res = VtsID_INVALID;
3402   }
3403}
3404//////////////////////////
3405
3406//static Bool VtsID__is_valid ( VtsID vi ) {
3407//   VtsTE* ve;
3408//   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
3409//      return False;
3410//   ve = VG_(indexXA)( vts_tab, vi );
3411//   if (!ve->vts)
3412//      return False;
3413//   tl_assert(ve->vts->id == vi);
3414//   return True;
3415//}
3416
3417static VTS* VtsID__to_VTS ( VtsID vi ) {
3418   VtsTE* te = VG_(indexXA)( vts_tab, vi );
3419   tl_assert(te->vts);
3420   return te->vts;
3421}
3422
3423static void VtsID__pp ( VtsID vi ) {
3424   VTS* vts = VtsID__to_VTS(vi);
3425   VTS__show( vts );
3426}
3427
3428/* compute partial ordering relation of vi1 and vi2. */
3429__attribute__((noinline))
3430static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
3431   UInt hash;
3432   Bool leq;
3433   VTS  *v1, *v2;
3434   //if (vi1 == vi2) return True;
3435   tl_assert(vi1 != vi2);
3436   ////++
3437   stats__cmpLEQ_queries++;
3438   hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
3439   if (cmpLEQ_cache[hash].vi1 == vi1
3440       && cmpLEQ_cache[hash].vi2 == vi2)
3441      return cmpLEQ_cache[hash].leq;
3442   stats__cmpLEQ_misses++;
3443   ////--
3444   v1  = VtsID__to_VTS(vi1);
3445   v2  = VtsID__to_VTS(vi2);
3446   leq = VTS__cmpLEQ( v1, v2 ) == 0;
3447   ////++
3448   cmpLEQ_cache[hash].vi1 = vi1;
3449   cmpLEQ_cache[hash].vi2 = vi2;
3450   cmpLEQ_cache[hash].leq = leq;
3451   ////--
3452   return leq;
3453}
3454static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
3455   return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
3456}
3457
3458/* compute binary join */
3459__attribute__((noinline))
3460static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
3461   UInt  hash;
3462   VtsID res;
3463   VTS   *vts1, *vts2;
3464   //if (vi1 == vi2) return vi1;
3465   tl_assert(vi1 != vi2);
3466   ////++
3467   stats__join2_queries++;
3468   hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
3469   if (join2_cache[hash].vi1 == vi1
3470       && join2_cache[hash].vi2 == vi2)
3471      return join2_cache[hash].res;
3472   stats__join2_misses++;
3473   ////--
3474   vts1 = VtsID__to_VTS(vi1);
3475   vts2 = VtsID__to_VTS(vi2);
3476   temp_max_sized_VTS->usedTS = 0;
3477   VTS__join(temp_max_sized_VTS, vts1,vts2);
3478   res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3479   ////++
3480   join2_cache[hash].vi1 = vi1;
3481   join2_cache[hash].vi2 = vi2;
3482   join2_cache[hash].res = res;
3483   ////--
3484   return res;
3485}
3486static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
3487   return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
3488}
3489
3490/* create a singleton VTS, namely [thr:1] */
3491static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
3492   temp_max_sized_VTS->usedTS = 0;
3493   VTS__singleton(temp_max_sized_VTS, thr,tym);
3494   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3495}
3496
3497/* tick operation, creates value 1 if specified index is absent */
3498static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
3499   VTS* vts = VtsID__to_VTS(vi);
3500   temp_max_sized_VTS->usedTS = 0;
3501   VTS__tick(temp_max_sized_VTS, idx,vts);
3502   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3503}
3504
3505/* index into a VTS (only for assertions) */
3506static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
3507   VTS* vts = VtsID__to_VTS(vi);
3508   return VTS__indexAt_SLOW( vts, idx );
3509}
3510
3511/* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
3512   any, really) element in vi1 which is pointwise greater-than the
3513   corresponding element in vi2.  If no such element exists, return
3514   NULL.  This needs to be fairly quick since it is called every time
3515   a race is detected. */
3516static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
3517{
3518   VTS  *vts1, *vts2;
3519   Thr*  diffthr;
3520   ThrID diffthrid;
3521   tl_assert(vi1 != vi2);
3522   vts1 = VtsID__to_VTS(vi1);
3523   vts2 = VtsID__to_VTS(vi2);
3524   tl_assert(vts1 != vts2);
3525   diffthrid = VTS__cmpLEQ(vts1, vts2);
3526   diffthr = Thr__from_ThrID(diffthrid);
3527   tl_assert(diffthr); /* else they are LEQ ! */
3528   return diffthr;
3529}
3530
3531
3532/////////////////////////////////////////////////////////
3533//                                                     //
3534// Filters                                             //
3535//                                                     //
3536/////////////////////////////////////////////////////////
3537
3538/* Forget everything we know -- clear the filter and let everything
3539   through.  This needs to be as fast as possible, since it is called
3540   every time the running thread changes, and every time a thread's
3541   vector clocks change, which can be quite frequent.  The obvious
3542   fast way to do this is simply to stuff in tags which we know are
3543   not going to match anything, since they're not aligned to the start
3544   of a line. */
3545static void Filter__clear ( Filter* fi, const HChar* who )
3546{
3547   UWord i;
3548   if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
3549   for (i = 0; i < FI_NUM_LINES; i += 8) {
3550      fi->tags[i+0] = 1; /* impossible value -- cannot match */
3551      fi->tags[i+1] = 1;
3552      fi->tags[i+2] = 1;
3553      fi->tags[i+3] = 1;
3554      fi->tags[i+4] = 1;
3555      fi->tags[i+5] = 1;
3556      fi->tags[i+6] = 1;
3557      fi->tags[i+7] = 1;
3558   }
3559   tl_assert(i == FI_NUM_LINES);
3560}
3561
3562/* Clearing an arbitrary range in the filter.  Unfortunately
3563   we have to do this due to core-supplied new/die-mem events. */
3564
3565static void Filter__clear_1byte ( Filter* fi, Addr a )
3566{
3567   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3568   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3569   FiLine* line   = &fi->lines[lineno];
3570   UWord   loff   = (a - atag) / 8;
3571   UShort  mask   = 0x3 << (2 * (a & 7));
3572   /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3573   if (LIKELY( fi->tags[lineno] == atag )) {
3574      /* hit.  clear the bits. */
3575      UShort  u16  = line->u16s[loff];
3576      line->u16s[loff] = u16 & ~mask; /* clear them */
3577   } else {
3578      /* miss.  The filter doesn't hold this address, so ignore. */
3579   }
3580}
3581
3582static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
3583{
3584   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3585   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3586   FiLine* line   = &fi->lines[lineno];
3587   UWord   loff   = (a - atag) / 8;
3588   if (LIKELY( fi->tags[lineno] == atag )) {
3589      line->u16s[loff] = 0;
3590   } else {
3591    /* miss.  The filter doesn't hold this address, so ignore. */
3592   }
3593}
3594
3595/* Only used to verify the fast Filter__clear_range */
3596__attribute__((unused))
3597static void Filter__clear_range_SLOW ( Filter* fi, Addr a, UWord len )
3598{
3599   tl_assert (CHECK_ZSM);
3600
3601   /* slowly do part preceding 8-alignment */
3602   while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
3603      Filter__clear_1byte( fi, a );
3604      a++;
3605      len--;
3606   }
3607   /* vector loop */
3608   while (len >= 8) {
3609      Filter__clear_8bytes_aligned( fi, a );
3610      a += 8;
3611      len -= 8;
3612   }
3613   /* slowly do tail */
3614   while (UNLIKELY(len > 0)) {
3615      Filter__clear_1byte( fi, a );
3616      a++;
3617      len--;
3618   }
3619}
3620
/* Fast version of Filter__clear_range_SLOW: clear the filter bits for
   the 'len' bytes starting at 'a'.  Only filter lines whose tag
   matches the address being cleared are modified; for any other line
   the filter holds no information about the range, so it is left
   alone.

   The work is done in three phases:
     1. the (possibly partial) first filter line, from 'a' up to the
        next filter-line boundary;
     2. all complete filter lines in the range;
     3. the (partial) last filter line, if anything remains.

   RETURN is a macro local to this function.  Normally it is a plain
   'return'; when CHECK_ZSM > 0 it instead jumps to the end of the
   function, where the result is compared against the slow version
   run on a copy of the filter. */
static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
{
#  if CHECK_ZSM > 0
   /* We check the below more complex algorithm with the simple one.
      This check is very expensive : we do first the slow way on a
      copy of the data, then do it the fast way. On RETURN, we check
      the two values are equal. */
   Filter fi_check = *fi;
   Filter__clear_range_SLOW(&fi_check, a, len);
#  define RETURN goto check_and_return
#  else
#  define RETURN return
#  endif

   Addr    begtag = FI_GET_TAG(a);       /* tag of range begin */

   Addr    end = a + len - 1;        /* address of the last byte in range */
   Addr    endtag = FI_GET_TAG(end); /* tag of range end. */

   UWord rlen = len; /* remaining length to clear */

   Addr    c = a; /* Current position we are clearing. */
   UWord   clineno = FI_GET_LINENO(c); /* Current lineno we are clearing */
   FiLine* cline; /* Current line we are clearing */
   UWord   cloff; /* Current offset in line we are clearing, when clearing
                     partial lines. */

   UShort u16;

   STATIC_ASSERT (FI_LINE_SZB == 32);
   // Below assumes filter lines are 32 bytes

   /* ---- Phase 1: the first filter line ---- */
   if (LIKELY(fi->tags[clineno] == begtag)) {
      /* LIKELY for the heavy caller VG_(unknown_SP_update). */
      /* First filter line matches begtag.
         If c is not at the filter line begin, the below will clear
         the filter line bytes starting from c. */
      cline = &fi->lines[clineno];
      cloff = (c - begtag) / 8;

      /* First the byte(s) needed to reach 8-alignment */
      if (UNLIKELY(!VG_IS_8_ALIGNED(c))) {
         /* hiB is the nr of bytes (higher addresses) from c to reach
            8-alignment. */
         UWord hiB = 8 - (c & 7);
         /* Compute 2-bit/byte mask representing hiB bytes [c..c+hiB[
            mask is  C000 , F000, FC00, FF00, FFC0, FFF0 or FFFC for the byte
            range    7..7   6..7  5..7  4..7  3..7  2..7    1..7 */
         UShort mask = 0xFFFF << (16 - 2*hiB);

         u16  = cline->u16s[cloff];
         if (LIKELY(rlen >= hiB)) {
            cline->u16s[cloff] = u16 & ~mask; /* clear all hiB from c */
            rlen -= hiB;
            c += hiB;
            cloff += 1;
         } else {
            /* The range ends inside this 8-byte group: keep only the
               mask bits for the rlen bytes starting at c. */
            mask = mask & ~(0xFFFF << (16 - 2*(hiB-rlen)));
            cline->u16s[cloff] = u16 & ~mask; /* clear rlen bytes from c. */
            RETURN;  // We have cleared all what we can.
         }
      }
      /* c is now 8 aligned. Clear by 8 aligned bytes,
         till c is filter-line aligned */
      while (!VG_IS_32_ALIGNED(c) && rlen >= 8) {
         cline->u16s[cloff] = 0;
         c += 8;
         rlen -= 8;
         cloff += 1;
      }
   } else {
      /* First filter line holds some other address: nothing to clear
         in it, so skip straight to the start of the next line. */
      c = begtag + FI_LINE_SZB;
      if (c > end)
         RETURN;   // We have cleared all what we can.
      rlen -= c - a;
   }
   // We have changed c, so re-establish clineno.
   clineno = FI_GET_LINENO(c);

   /* ---- Phase 2: complete filter lines ---- */
   if (rlen >= FI_LINE_SZB) {
      /* Here, c is filter line-aligned. Clear all full lines that
         overlap with the range starting at c, made of full lines */
      UWord nfull = rlen / FI_LINE_SZB;
      UWord full_len = nfull * FI_LINE_SZB;
      rlen -= full_len;
      if (nfull > FI_NUM_LINES)
         nfull = FI_NUM_LINES; // no need to check several times the same entry.

      for (UWord n = 0; n < nfull; n++) {
         /* Wipe the line only if the tag it currently holds passes
            the address_in_range test against [c, c+full_len). */
         if (UNLIKELY(address_in_range(fi->tags[clineno], c, full_len))) {
            cline = &fi->lines[clineno];
            cline->u16s[0] = 0;
            cline->u16s[1] = 0;
            cline->u16s[2] = 0;
            cline->u16s[3] = 0;
            STATIC_ASSERT (4 == sizeof(cline->u16s)/sizeof(cline->u16s[0]));
         }
         /* Step to the next line number, wrapping around at the end
            of the filter. */
         clineno++;
         if (UNLIKELY(clineno == FI_NUM_LINES))
            clineno = 0;
      }

      c += full_len;
      clineno = FI_GET_LINENO(c);
   }

   if (CHECK_ZSM) {
      tl_assert(VG_IS_8_ALIGNED(c));
      tl_assert(clineno == FI_GET_LINENO(c));
   }

   /* ---- Phase 3: the last filter line ---- */
   /* Do the last filter line, if it was not cleared as a full filter line */
   if (UNLIKELY(rlen > 0) && fi->tags[clineno] == endtag) {
      cline = &fi->lines[clineno];
      cloff = (c - endtag) / 8;
      if (CHECK_ZSM) tl_assert(FI_GET_TAG(c) == endtag);

      /* c is 8 aligned. Clear by 8 aligned bytes, till we have less than
         8 bytes. */
      while (rlen >= 8) {
         cline->u16s[cloff] = 0;
         c += 8;
         rlen -= 8;
         cloff += 1;
      }
      /* Then the remaining byte(s) */
      if (rlen > 0) {
         /* nr of bytes from c to reach end. */
         UWord loB = rlen;
         /* Compute mask representing loB bytes [c..c+loB[ :
            mask is 0003, 000F, 003F, 00FF, 03FF, 0FFF or 3FFF */
         UShort mask = 0xFFFF >> (16 - 2*loB);

         u16  = cline->u16s[cloff];
         cline->u16s[cloff] = u16 & ~mask; /* clear all loB from c */
      }
   }

#  if CHECK_ZSM > 0
   check_and_return:
   /* The fast path must have produced exactly what the slow path did
      on the copy. */
   tl_assert (VG_(memcmp)(&fi_check, fi, sizeof(fi_check)) == 0);
#  endif
#  undef RETURN
}
3766
3767/* ------ Read handlers for the filter. ------ */
3768
3769static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
3770{
3771   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3772      return False;
3773   {
3774     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3775     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3776     FiLine* line   = &fi->lines[lineno];
3777     UWord   loff   = (a - atag) / 8;
3778     UShort  mask   = 0xAAAA;
3779     if (LIKELY( fi->tags[lineno] == atag )) {
3780        /* hit.  check line and update. */
3781        UShort u16  = line->u16s[loff];
3782        Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
3783        line->u16s[loff] = u16 | mask; /* set them */
3784        return ok;
3785     } else {
3786        /* miss.  nuke existing line and re-use it. */
3787        UWord i;
3788        fi->tags[lineno] = atag;
3789        for (i = 0; i < FI_LINE_SZB / 8; i++)
3790           line->u16s[i] = 0;
3791        line->u16s[loff] = mask;
3792        return False;
3793     }
3794   }
3795}
3796
3797static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
3798{
3799   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3800      return False;
3801   {
3802     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3803     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3804     FiLine* line   = &fi->lines[lineno];
3805     UWord   loff   = (a - atag) / 8;
3806     UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
3807     if (LIKELY( fi->tags[lineno] == atag )) {
3808        /* hit.  check line and update. */
3809        UShort  u16  = line->u16s[loff];
3810        Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
3811        line->u16s[loff] = u16 | mask; /* set them */
3812        return ok;
3813     } else {
3814        /* miss.  nuke existing line and re-use it. */
3815        UWord   i;
3816        fi->tags[lineno] = atag;
3817        for (i = 0; i < FI_LINE_SZB / 8; i++)
3818           line->u16s[i] = 0;
3819        line->u16s[loff] = mask;
3820        return False;
3821     }
3822   }
3823}
3824
3825static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
3826{
3827   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3828      return False;
3829   {
3830     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3831     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3832     FiLine* line   = &fi->lines[lineno];
3833     UWord   loff   = (a - atag) / 8;
3834     UShort  mask   = 0xA << (2 * (a & 6));
3835     /* mask is A000, 0A00, 00A0 or 000A */
3836     if (LIKELY( fi->tags[lineno] == atag )) {
3837        /* hit.  check line and update. */
3838        UShort  u16  = line->u16s[loff];
3839        Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
3840        line->u16s[loff] = u16 | mask; /* set them */
3841        return ok;
3842     } else {
3843        /* miss.  nuke existing line and re-use it. */
3844        UWord   i;
3845        fi->tags[lineno] = atag;
3846        for (i = 0; i < FI_LINE_SZB / 8; i++)
3847           line->u16s[i] = 0;
3848        line->u16s[loff] = mask;
3849        return False;
3850     }
3851   }
3852}
3853
3854static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
3855{
3856   {
3857     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3858     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3859     FiLine* line   = &fi->lines[lineno];
3860     UWord   loff   = (a - atag) / 8;
3861     UShort  mask   = 0x2 << (2 * (a & 7));
3862     /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
3863     if (LIKELY( fi->tags[lineno] == atag )) {
3864        /* hit.  check line and update. */
3865        UShort  u16  = line->u16s[loff];
3866        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
3867        line->u16s[loff] = u16 | mask; /* set them */
3868        return ok;
3869     } else {
3870        /* miss.  nuke existing line and re-use it. */
3871        UWord   i;
3872        fi->tags[lineno] = atag;
3873        for (i = 0; i < FI_LINE_SZB / 8; i++)
3874           line->u16s[i] = 0;
3875        line->u16s[loff] = mask;
3876        return False;
3877     }
3878   }
3879}
3880
3881
3882/* ------ Write handlers for the filter. ------ */
3883
3884static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
3885{
3886   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3887      return False;
3888   {
3889     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3890     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3891     FiLine* line   = &fi->lines[lineno];
3892     UWord   loff   = (a - atag) / 8;
3893     UShort  mask   = 0xFFFF;
3894     if (LIKELY( fi->tags[lineno] == atag )) {
3895        /* hit.  check line and update. */
3896        UShort u16  = line->u16s[loff];
3897        Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
3898        line->u16s[loff] = u16 | mask; /* set them */
3899        return ok;
3900     } else {
3901        /* miss.  nuke existing line and re-use it. */
3902        UWord i;
3903        fi->tags[lineno] = atag;
3904        for (i = 0; i < FI_LINE_SZB / 8; i++)
3905           line->u16s[i] = 0;
3906        line->u16s[loff] = mask;
3907        return False;
3908     }
3909   }
3910}
3911
3912static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
3913{
3914   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3915      return False;
3916   {
3917     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3918     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3919     FiLine* line   = &fi->lines[lineno];
3920     UWord   loff   = (a - atag) / 8;
3921     UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
3922     if (LIKELY( fi->tags[lineno] == atag )) {
3923        /* hit.  check line and update. */
3924        UShort  u16  = line->u16s[loff];
3925        Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
3926        line->u16s[loff] = u16 | mask; /* set them */
3927        return ok;
3928     } else {
3929        /* miss.  nuke existing line and re-use it. */
3930        UWord   i;
3931        fi->tags[lineno] = atag;
3932        for (i = 0; i < FI_LINE_SZB / 8; i++)
3933           line->u16s[i] = 0;
3934        line->u16s[loff] = mask;
3935        return False;
3936     }
3937   }
3938}
3939
3940static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
3941{
3942   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3943      return False;
3944   {
3945     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3946     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3947     FiLine* line   = &fi->lines[lineno];
3948     UWord   loff   = (a - atag) / 8;
3949     UShort  mask   = 0xF << (2 * (a & 6));
3950     /* mask is F000, 0F00, 00F0 or 000F */
3951     if (LIKELY( fi->tags[lineno] == atag )) {
3952        /* hit.  check line and update. */
3953        UShort  u16  = line->u16s[loff];
3954        Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
3955        line->u16s[loff] = u16 | mask; /* set them */
3956        return ok;
3957     } else {
3958        /* miss.  nuke existing line and re-use it. */
3959        UWord   i;
3960        fi->tags[lineno] = atag;
3961        for (i = 0; i < FI_LINE_SZB / 8; i++)
3962           line->u16s[i] = 0;
3963        line->u16s[loff] = mask;
3964        return False;
3965     }
3966   }
3967}
3968
3969static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
3970{
3971   {
3972     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3973     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3974     FiLine* line   = &fi->lines[lineno];
3975     UWord   loff   = (a - atag) / 8;
3976     UShort  mask   = 0x3 << (2 * (a & 7));
3977     /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3978     if (LIKELY( fi->tags[lineno] == atag )) {
3979        /* hit.  check line and update. */
3980        UShort  u16  = line->u16s[loff];
3981        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
3982        line->u16s[loff] = u16 | mask; /* set them */
3983        return ok;
3984     } else {
3985        /* miss.  nuke existing line and re-use it. */
3986        UWord   i;
3987        fi->tags[lineno] = atag;
3988        for (i = 0; i < FI_LINE_SZB / 8; i++)
3989           line->u16s[i] = 0;
3990        line->u16s[loff] = mask;
3991        return False;
3992     }
3993   }
3994}
3995
3996
3997/////////////////////////////////////////////////////////
3998//                                                     //
3999// Threads                                             //
4000//                                                     //
4001/////////////////////////////////////////////////////////
4002
/* Maps ThrID values to their Thr*s (which contain ThrID values that
   should point back to the relevant slot in the array).  Lowest
   numbered slot (0) is for thrid = 1024, (1) is for 1025, etc.
   The array is created by Thr__new the first time it is needed. */
static XArray* /* of Thr* */ thrid_to_thr_map = NULL;

/* And a counter to dole out ThrID values.  For rationale/background,
   see comments on definition of ScalarTS (far) above. */
static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
4011
4012static ThrID Thr__to_ThrID ( Thr* thr ) {
4013   return thr->thrid;
4014}
4015static Thr* Thr__from_ThrID ( UInt thrid ) {
4016   Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
4017   tl_assert(thr->thrid == thrid);
4018   return thr;
4019}
4020
4021static Thr* Thr__new ( void )
4022{
4023   Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
4024   thr->viR = VtsID_INVALID;
4025   thr->viW = VtsID_INVALID;
4026   thr->llexit_done = False;
4027   thr->joinedwith_done = False;
4028   thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
4029   if (HG_(clo_history_level) == 1)
4030      thr->local_Kws_n_stacks
4031         = VG_(newXA)( HG_(zalloc),
4032                       "libhb.Thr__new.3 (local_Kws_and_stacks)",
4033                       HG_(free), sizeof(ULong_n_EC) );
4034
4035   /* Add this Thr* <-> ThrID binding to the mapping, and
4036      cross-check */
4037   if (!thrid_to_thr_map) {
4038      thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
4039                                     HG_(free), sizeof(Thr*) );
4040   }
4041
4042   if (thrid_counter >= ThrID_MAX_VALID) {
4043      /* We're hosed.  We have to stop. */
4044      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
4045   }
4046
4047   thr->thrid = thrid_counter++;
4048   Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
4049   tl_assert(ix + 1024 == thr->thrid);
4050
4051   return thr;
4052}
4053
4054static void note_local_Kw_n_stack_for ( Thr* thr )
4055{
4056   Word       nPresent;
4057   ULong_n_EC pair;
4058   tl_assert(thr);
4059
4060   // We only collect this info at history level 1 (approx)
4061   if (HG_(clo_history_level) != 1)
4062      return;
4063
4064   /* This is the scalar Kw for thr. */
4065   pair.ull = VtsID__indexAt( thr->viW, thr );
4066   pair.ec  = main_get_EC( thr );
4067   tl_assert(pair.ec);
4068   tl_assert(thr->local_Kws_n_stacks);
4069
4070   /* check that we're not adding duplicates */
4071   nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4072
4073   /* Throw away old stacks, if necessary.  We can't accumulate stuff
4074      indefinitely. */
4075   if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
4076      VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
4077      nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4078      if (0)
4079         VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
4080                     thr, pair.ull, pair.ec );
4081   }
4082
4083   if (nPresent > 0) {
4084      ULong_n_EC* prevPair
4085         = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
4086      tl_assert( prevPair->ull <= pair.ull );
4087   }
4088
4089   if (nPresent == 0)
4090      pair.ec = NULL;
4091
4092   VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
4093
4094   if (0)
4095      VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
4096                  thr, pair.ull, pair.ec );
4097   if (0)
4098      VG_(pp_ExeContext)(pair.ec);
4099}
4100
4101static Int cmp__ULong_n_EC__by_ULong ( const ULong_n_EC* pair1,
4102                                       const ULong_n_EC* pair2 )
4103{
4104   if (pair1->ull < pair2->ull) return -1;
4105   if (pair1->ull > pair2->ull) return 1;
4106   return 0;
4107}
4108
4109
4110/////////////////////////////////////////////////////////
4111//                                                     //
4112// Shadow Values                                       //
4113//                                                     //
4114/////////////////////////////////////////////////////////
4115
4116// type SVal, SVal_INVALID and SVal_NOACCESS are defined by
4117// hb_zsm.h.  We have to do everything else here.
4118
4119/* SVal is 64 bit unsigned int.
4120
4121      <---------30--------->    <---------30--------->
4122   00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
4123   10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
4124   11 0--------------------0 00 0--------------------0   A: SVal_INVALID
4125
4126*/
/* Selects the top two bits (63:62) of an SVal, which hold its tag:
   00 for C values, 10 and 11 for the A values shown above. */
#define SVAL_TAGMASK (3ULL << 62)
4128
4129static inline Bool SVal__isC ( SVal s ) {
4130   return (0ULL << 62) == (s & SVAL_TAGMASK);
4131}
4132static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
4133   //tl_assert(VtsID__is_valid(rmini));
4134   //tl_assert(VtsID__is_valid(wmini));
4135   return (((ULong)rmini) << 32) | ((ULong)wmini);
4136}
4137static inline VtsID SVal__unC_Rmin ( SVal s ) {
4138   tl_assert(SVal__isC(s));
4139   return (VtsID)(s >> 32);
4140}
4141static inline VtsID SVal__unC_Wmin ( SVal s ) {
4142   tl_assert(SVal__isC(s));
4143   return (VtsID)(s & 0xFFFFFFFFULL);
4144}
4145
4146static inline Bool SVal__isA ( SVal s ) {
4147   return (2ULL << 62) == (s & SVAL_TAGMASK);
4148}
4149__attribute__((unused))
4150static inline SVal SVal__mkA ( void ) {
4151   return 2ULL << 62;
4152}
4153
4154/* Direct callback from lib_zsm. */
4155static inline void SVal__rcinc ( SVal s ) {
4156   if (SVal__isC(s)) {
4157      VtsID__rcinc( SVal__unC_Rmin(s) );
4158      VtsID__rcinc( SVal__unC_Wmin(s) );
4159   }
4160}
4161
4162/* Direct callback from lib_zsm. */
4163static inline void SVal__rcdec ( SVal s ) {
4164   if (SVal__isC(s)) {
4165      VtsID__rcdec( SVal__unC_Rmin(s) );
4166      VtsID__rcdec( SVal__unC_Wmin(s) );
4167   }
4168}
4169
4170static inline void *SVal2Ptr (SVal s)
4171{
4172   return (void*)(UWord)s;
4173}
4174
4175static inline SVal Ptr2SVal (void* ptr)
4176{
4177   return (SVal)(UWord)ptr;
4178}
4179
4180
4181
4182/////////////////////////////////////////////////////////
4183//                                                     //
4184// Change-event map2                                   //
4185//                                                     //
4186/////////////////////////////////////////////////////////
4187
4188/* This is in two parts:
4189
4190   1. A hash table of RCECs.  This is a set of reference-counted stack
4191      traces.  When the reference count of a stack trace becomes zero,
4192      it is removed from the set and freed up.  The intent is to have
4193      a set of stack traces which can be referred to from (2), but to
4194      only represent each one once.  The set is indexed/searched by
4195      ordering on the stack trace vectors.
4196
   2. A hash table of OldRefs.  These store information about each old
      ref that we need to record.  The hash table key is the address of
      the location for which the information is recorded.  For LRU
      purposes, each OldRef in the hash table is also on a doubly
      linked list maintaining the order in which the OldRefs were most
      recently accessed.
      Each OldRef also maintains the stamp at which it was last accessed.
      With these stamps, we can quickly check which of two OldRefs is
      the 'newest', without having to scan the full LRU list of OldRefs.
4206
4207      The important part of an OldRef is, however, its acc component.
4208      This binds a TSW triple (thread, size, R/W) to an RCEC.
4209
4210      We allocate a maximum of VG_(clo_conflict_cache_size) OldRef.
4211      Then we do exact LRU discarding.  For each discarded OldRef we must
4212      of course decrement the reference count on the RCEC it
4213      refers to, in order that entries from (1) eventually get
4214      discarded too.
4215*/
4216
/* Statistics counters for the change-event map.  Apart from
   stats__ctxt_rcdec_calls, which is bumped on every call to
   ctxt__rcdec, they are updated by code further down the file. */

/* presumably: outcomes of lookups in the event map -- TODO confirm */
static UWord stats__evm__lookup_found = 0;
static UWord stats__evm__lookup_notfound = 0;

static UWord stats__ctxt_eq_tsw_eq_rcec = 0;
static UWord stats__ctxt_eq_tsw_neq_rcec = 0;
static UWord stats__ctxt_neq_tsw_neq_rcec = 0;
static UWord stats__ctxt_rcdec_calls = 0;      /* # calls to ctxt__rcdec */
static UWord stats__ctxt_rcec_gc_discards = 0;

/* presumably: current and maximum number of entries in the RCEC
   table -- TODO confirm */
static UWord stats__ctxt_tab_curr = 0;
static UWord stats__ctxt_tab_max  = 0;

/* presumably: number of RCEC table queries, and of comparisons done
   while answering them -- TODO confirm */
static UWord stats__ctxt_tab_qs   = 0;
static UWord stats__ctxt_tab_cmps = 0;
4231
4232
4233///////////////////////////////////////////////////////
4234//// Part (1): A hash table of RCECs
4235///
4236
/* Number of stack frames stored in each RCEC. */
#define N_FRAMES 8

// (UInt) `echo "Reference Counted Execution Context" | md5sum`
/* Stored in RCEC.magic and asserted on wherever an RCEC is handled. */
#define RCEC_MAGIC 0xab88abb2UL

//#define N_RCEC_TAB 98317 /* prime */
/* Number of hash buckets in contextTab. */
#define N_RCEC_TAB 196613 /* prime */
4244
/* A reference-counted execution context: a fixed-depth stack trace
   living on one of the contextTab hash chains. */
typedef
   struct _RCEC {
      UWord magic;  /* sanity check only */
      struct _RCEC* next;         /* next RCEC on the same hash chain */
      UWord rc;                   /* reference count, maintained by
                                     ctxt__rcinc / ctxt__rcdec */
      UWord rcX; /* used for crosschecking */
      UWord frames_hash;          /* hash of all the frames */
      UWord frames[N_FRAMES];     /* the stack trace itself */
   }
   RCEC;
4255
4256//////////// BEGIN RCEC pool allocator
4257static PoolAlloc* rcec_pool_allocator;
4258static RCEC* alloc_RCEC ( void ) {
4259   return VG_(allocEltPA) ( rcec_pool_allocator );
4260}
4261
4262static void free_RCEC ( RCEC* rcec ) {
4263   tl_assert(rcec->magic == RCEC_MAGIC);
4264   VG_(freeEltPA)( rcec_pool_allocator, rcec );
4265}
4266//////////// END RCEC pool allocator
4267
/* hash table of RCEC*s: an array of N_RCEC_TAB bucket heads, each the
   start of a chain linked through RCEC.next.  Allocated in
   event_map_init. */
static RCEC** contextTab = NULL;

/* Count of allocated RCEC having ref count > 0.  Kept up to date by
   ctxt__rcinc and ctxt__rcdec. */
static UWord RCEC_referenced = 0;
4272
4273/* Gives an arbitrary total order on RCEC .frames fields */
4274static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
4275   Word i;
4276   tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
4277   tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
4278   if (ec1->frames_hash < ec2->frames_hash) return -1;
4279   if (ec1->frames_hash > ec2->frames_hash) return  1;
4280   for (i = 0; i < N_FRAMES; i++) {
4281      if (ec1->frames[i] < ec2->frames[i]) return -1;
4282      if (ec1->frames[i] > ec2->frames[i]) return  1;
4283   }
4284   return 0;
4285}
4286
4287
4288/* Dec the ref of this RCEC. */
4289static void ctxt__rcdec ( RCEC* ec )
4290{
4291   stats__ctxt_rcdec_calls++;
4292   tl_assert(ec && ec->magic == RCEC_MAGIC);
4293   tl_assert(ec->rc > 0);
4294   ec->rc--;
4295   if (ec->rc == 0)
4296      RCEC_referenced--;
4297}
4298
4299static void ctxt__rcinc ( RCEC* ec )
4300{
4301   tl_assert(ec && ec->magic == RCEC_MAGIC);
4302   if (ec->rc == 0)
4303      RCEC_referenced++;
4304   ec->rc++;
4305}
4306
4307
4308/* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
4309   move it one step closer to the front of the list, so as to make
4310   subsequent searches for it cheaper. */
4311static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
4312{
4313   RCEC *ec0, *ec1, *ec2;
4314   if (ec == *headp)
4315      tl_assert(0); /* already at head of list */
4316   tl_assert(ec != NULL);
4317   ec0 = *headp;
4318   ec1 = NULL;
4319   ec2 = NULL;
4320   while (True) {
4321      if (ec0 == NULL || ec0 == ec) break;
4322      ec2 = ec1;
4323      ec1 = ec0;
4324      ec0 = ec0->next;
4325   }
4326   tl_assert(ec0 == ec);
4327   if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
4328      RCEC* tmp;
4329      /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
4330         predecessor.  Swap ec0 and ec1, that is, move ec0 one step
4331         closer to the start of the list. */
4332      tl_assert(ec2->next == ec1);
4333      tl_assert(ec1->next == ec0);
4334      tmp = ec0->next;
4335      ec2->next = ec0;
4336      ec0->next = ec1;
4337      ec1->next = tmp;
4338   }
4339   else
4340   if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
4341      /* it's second in the list. */
4342      tl_assert(*headp == ec1);
4343      tl_assert(ec1->next == ec0);
4344      ec1->next = ec0->next;
4345      ec0->next = ec1;
4346      *headp = ec0;
4347   }
4348}
4349
4350
4351/* Find the given RCEC in the tree, and return a pointer to it.  Or,
4352   if not present, add the given one to the tree (by making a copy of
4353   it, so the caller can immediately deallocate the original) and
4354   return a pointer to the copy.  The caller can safely have 'example'
4355   on its stack, since we will always return a pointer to a copy of
4356   it, not to the original.  Note that the inserted node will have .rc
4357   of zero and so the caller must immediately increment it. */
4358__attribute__((noinline))
4359static RCEC* ctxt__find_or_add ( RCEC* example )
4360{
4361   UWord hent;
4362   RCEC* copy;
4363   tl_assert(example && example->magic == RCEC_MAGIC);
4364   tl_assert(example->rc == 0);
4365
4366   /* Search the hash table to see if we already have it. */
4367   stats__ctxt_tab_qs++;
4368   hent = example->frames_hash % N_RCEC_TAB;
4369   copy = contextTab[hent];
4370   while (1) {
4371      if (!copy) break;
4372      tl_assert(copy->magic == RCEC_MAGIC);
4373      stats__ctxt_tab_cmps++;
4374      if (0 == RCEC__cmp_by_frames(copy, example)) break;
4375      copy = copy->next;
4376   }
4377
4378   if (copy) {
4379      tl_assert(copy != example);
4380      /* optimisation: if it's not at the head of its list, move 1
4381         step fwds, to make future searches cheaper */
4382      if (copy != contextTab[hent]) {
4383         move_RCEC_one_step_forward( &contextTab[hent], copy );
4384      }
4385   } else {
4386      copy = alloc_RCEC();
4387      tl_assert(copy != example);
4388      *copy = *example;
4389      copy->next = contextTab[hent];
4390      contextTab[hent] = copy;
4391      stats__ctxt_tab_curr++;
4392      if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
4393         stats__ctxt_tab_max = stats__ctxt_tab_curr;
4394   }
4395   return copy;
4396}
4397
4398static inline UWord ROLW ( UWord w, Int n )
4399{
4400   Int bpw = 8 * sizeof(UWord);
4401   w = (w << n) | (w >> (bpw-n));
4402   return w;
4403}
4404
4405__attribute__((noinline))
4406static RCEC* get_RCEC ( Thr* thr )
4407{
4408   UWord hash, i;
4409   RCEC  example;
4410   example.magic = RCEC_MAGIC;
4411   example.rc = 0;
4412   example.rcX = 0;
4413   example.next = NULL;
4414   main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
4415   hash = 0;
4416   for (i = 0; i < N_FRAMES; i++) {
4417      hash ^= example.frames[i];
4418      hash = ROLW(hash, 19);
4419   }
4420   example.frames_hash = hash;
4421   return ctxt__find_or_add( &example );
4422}
4423
4424///////////////////////////////////////////////////////
4425//// Part (2):
4426///  A hashtable guest-addr -> OldRef, that refers to (1)
4427///  Note: we use the guest address as key. This means that the entries
4428///  for multiple threads accessing the same address will land in the same
4429///  bucket. It might be nice to have a better distribution of the
///  OldRef in the hashtable by using as key the guest address ^ tsw.
4431///  The problem is that when a race is reported on a ga, we need to retrieve
4432///  efficiently the accesses to ga by other threads, only using the ga.
4433///  Measurements on firefox have shown that the chain length is reasonable.
4434
/* Records an access: a thread, a context (size & writeness) and the
   number of held locks. The size (1,2,4,8) is stored as is in szB.
   Note that szB uses more bits than needed to store a size up to 8.
   This allows to use a TSW as a fully initialised UInt e.g. in
   cmp_oldref_tsw. If needed, a more compact representation of szB
   can be done (e.g. use only 4 bits, or use only 2 bits and encode the
   size (1,2,4,8) as 00 = 1, 01 = 2, 10 = 4, 11 = 8). */
/* The three bitfield widths below add up to exactly 32 bits. */
typedef
   struct {
      UInt      thrid  : SCALARTS_N_THRBITS;          /* id of the accessing thread */
      UInt      szB    : 32 - SCALARTS_N_THRBITS - 1; /* access size in bytes */
      UInt      isW    : 1;                           /* 1 for a write, 0 for a read */
   } TSW; // Thread+Size+Writeness
/* One recorded access: who made it and how (tsw), the lockset copied
   from the thread's locksetW when it was recorded (locksHeldW), and
   the stack trace at that point (rcec, on which this record holds a
   reference). */
typedef
   struct {
      TSW       tsw;
      WordSetID locksHeldW;
      RCEC*     rcec;
   }
   Thr_n_RCEC;
4455
/* An entry of the conflicting-access cache.  Each OldRef sits both in
   the oldrefHT hash table (keyed by ga) and on the doubly linked
   LRU list delimited by the lru/mru sentinels.
   NOTE(review): ht_next and ga presumably have to remain the first two
   fields, in this order, to match the node layout the VG_(HT_*)
   functions expect -- confirm before reordering. */
typedef
   struct OldRef {
      struct OldRef *ht_next; // to link hash table nodes together.
      UWord  ga; // hash_table key, == address for which we record an access.
      struct OldRef *prev; // to refs older than this one
      struct OldRef *next; // to refs newer that this one
      UWord stamp; // allows to order (by time of access) 2 OldRef
      Thr_n_RCEC acc; // the access recorded for address ga
   }
   OldRef;
4466
4467/* Returns the or->tsw as an UInt */
4468static inline UInt oldref_tsw (const OldRef* or)
4469{
4470   return *(const UInt*)(&or->acc.tsw);
4471}
4472
4473/* Compare the tsw component for 2 OldRef.
4474   Used for OldRef hashtable (which already verifies equality of the
4475   'key' part. */
4476static Word cmp_oldref_tsw (const void* node1, const void* node2 )
4477{
4478   const UInt tsw1 = oldref_tsw(node1);
4479   const UInt tsw2 = oldref_tsw(node2);
4480
4481   if (tsw1 < tsw2) return -1;
4482   if (tsw1 > tsw2) return  1;
4483   return 0;
4484}
4485
4486
4487//////////// BEGIN OldRef pool allocator
/* Pool from which OldRefs are allocated; created in event_map_init. */
static PoolAlloc* oldref_pool_allocator;
// Note: We only allocate elements in this pool allocator, we never free them.
// We stop allocating elements at HG_(clo_conflict_cache_size).
4491//////////// END OldRef pool allocator
4492
static OldRef mru;
static OldRef lru;
// A doubly linked list, chaining all OldRef in a mru/lru order.
// mru/lru are sentinel nodes: lru.next is the least recently used
// OldRef and mru.prev the most recently used one. On an empty list
// the two sentinels point at each other (see event_map_init).
// Whenever an oldref is re-used, its position is changed as the most recently
// used (i.e. pointed to by mru.prev).
// When a new oldref is needed, it is allocated from the pool
//  if we have not yet reached --conflict-cache-size.
// Otherwise, if all oldref have already been allocated,
// the least recently used (i.e. pointed to by lru.next) is re-used.
// When an OldRef is used, it is moved as the most recently used entry
// (i.e. pointed to by mru.prev).
4505
4506// Removes r from the double linked list
4507// Note: we do not need to test for special cases such as
4508// NULL next or prev pointers, because we have sentinel nodes
4509// at both sides of the list. So, a node is always forward and
4510// backward linked.
4511static inline void OldRef_unchain(OldRef *r)
4512{
4513   r->next->prev = r->prev;
4514   r->prev->next = r->next;
4515}
4516
4517// Insert new as the newest OldRef
4518// Similarly to OldRef_unchain, no need to test for NULL
4519// pointers, as e.g. mru.prev is always guaranteed to point
4520// to a non NULL node (lru when the list is empty).
4521static inline void OldRef_newest(OldRef *new)
4522{
4523   new->next = &mru;
4524   new->prev = mru.prev;
4525   mru.prev = new;
4526   new->prev->next = new;
4527}
4528
4529
static VgHashTable* oldrefHT    = NULL; /* Hash table of OldRef*, keyed by OldRef.ga */
static UWord     oldrefHTN    = 0;    /* # elems in oldrefHT */
/* Note: the nr of ref in the oldrefHT will always be equal to
   the nr of elements that were allocated from the OldRef pool allocator
   as we never free an OldRef : we just re-use them. */
4535
4536
4537/* allocates a new OldRef or re-use the lru one if all allowed OldRef
4538   have already been allocated. */
4539static OldRef* alloc_or_reuse_OldRef ( void )
4540{
4541   if (oldrefHTN < HG_(clo_conflict_cache_size)) {
4542      oldrefHTN++;
4543      return VG_(allocEltPA) ( oldref_pool_allocator );
4544   } else {
4545      OldRef *oldref_ht;
4546      OldRef *oldref = lru.next;
4547
4548      OldRef_unchain(oldref);
4549      oldref_ht = VG_(HT_gen_remove) (oldrefHT, oldref, cmp_oldref_tsw);
4550      tl_assert (oldref == oldref_ht);
4551      ctxt__rcdec( oldref->acc.rcec );
4552      return oldref;
4553   }
4554}
4555
4556
4557inline static UInt min_UInt ( UInt a, UInt b ) {
4558   return a < b ? a : b;
4559}
4560
4561/* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
4562   first interval is lower, 1 if the first interval is higher, and 0
4563   if there is any overlap.  Redundant paranoia with casting is there
4564   following what looked distinctly like a bug in gcc-4.1.2, in which
4565   some of the comparisons were done signedly instead of
4566   unsignedly. */
4567/* Copied from exp-ptrcheck/sg_main.c */
4568static inline Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
4569                                            Addr a2, SizeT n2 ) {
4570   UWord a1w = (UWord)a1;
4571   UWord n1w = (UWord)n1;
4572   UWord a2w = (UWord)a2;
4573   UWord n2w = (UWord)n2;
4574   tl_assert(n1w > 0 && n2w > 0);
4575   if (a1w + n1w <= a2w) return -1L;
4576   if (a2w + n2w <= a1w) return 1L;
4577   return 0;
4578}
4579
4580static UWord event_map_stamp = 0; // Used to stamp each OldRef when touched.
4581
4582static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
4583{
4584   OldRef  example;
4585   OldRef* ref;
4586   RCEC*   rcec;
4587
4588   tl_assert(thr);
4589   ThrID thrid = thr->thrid;
4590   tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
4591
4592   WordSetID locksHeldW = thr->hgthread->locksetW;
4593
4594   rcec = get_RCEC( thr );
4595
4596   tl_assert (szB == 4 || szB == 8 ||szB == 1 || szB == 2);
4597   // Check for most frequent cases first
4598   // Note: we could support a szB up to 1 << (32 - SCALARTS_N_THRBITS - 1)
4599
4600   /* Look in the oldrefHT to see if we already have a record for this
4601      address/thr/sz/isW. */
4602   example.ga = a;
4603   example.acc.tsw = (TSW) {.thrid = thrid,
4604                            .szB = szB,
4605                            .isW = (UInt)(isW & 1)};
4606   ref = VG_(HT_gen_lookup) (oldrefHT, &example, cmp_oldref_tsw);
4607
4608   if (ref) {
4609      /* We already have a record for this address and this (thrid, R/W,
4610         size) triple. */
4611      tl_assert (ref->ga == a);
4612
4613      /* thread 'thr' has an entry.  Update its RCEC, if it differs. */
4614      if (rcec == ref->acc.rcec)
4615         stats__ctxt_eq_tsw_eq_rcec++;
4616      else {
4617         stats__ctxt_eq_tsw_neq_rcec++;
4618         ctxt__rcdec( ref->acc.rcec );
4619         ctxt__rcinc(rcec);
4620         ref->acc.rcec       = rcec;
4621      }
4622      tl_assert(ref->acc.tsw.thrid == thrid);
4623      /* Update the stamp, RCEC and the W-held lockset. */
4624      ref->stamp = event_map_stamp;
4625      ref->acc.locksHeldW = locksHeldW;
4626
4627      OldRef_unchain(ref);
4628      OldRef_newest(ref);
4629
4630   } else {
4631      /* We don't have a record for this address+triple.  Create a new one. */
4632      stats__ctxt_neq_tsw_neq_rcec++;
4633      ref = alloc_or_reuse_OldRef();
4634      ref->ga = a;
4635      ref->acc.tsw = (TSW) {.thrid  = thrid,
4636                            .szB    = szB,
4637                            .isW    = (UInt)(isW & 1)};
4638      ref->stamp = event_map_stamp;
4639      ref->acc.locksHeldW = locksHeldW;
4640      ref->acc.rcec       = rcec;
4641      ctxt__rcinc(rcec);
4642
4643      VG_(HT_add_node) ( oldrefHT, ref );
4644      OldRef_newest (ref);
4645   }
4646   event_map_stamp++;
4647}
4648
4649
4650/* Extract info from the conflicting-access machinery.
4651   Returns the most recent conflicting access with thr/[a, a+szB[/isW. */
4652Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
4653                              /*OUT*/Thr**        resThr,
4654                              /*OUT*/SizeT*       resSzB,
4655                              /*OUT*/Bool*        resIsW,
4656                              /*OUT*/WordSetID*   locksHeldW,
4657                              Thr* thr, Addr a, SizeT szB, Bool isW )
4658{
4659   Word    i, j;
4660   OldRef *ref = NULL;
4661   SizeT  ref_szB = 0;
4662
4663   OldRef *cand_ref;
4664   SizeT  cand_ref_szB;
4665   Addr   cand_a;
4666
4667   Addr toCheck[15];
4668   Int  nToCheck = 0;
4669
4670   tl_assert(thr);
4671   tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
4672
4673   ThrID thrid = thr->thrid;
4674
4675   toCheck[nToCheck++] = a;
4676   for (i = -7; i < (Word)szB; i++) {
4677      if (i != 0)
4678         toCheck[nToCheck++] = a + i;
4679   }
4680   tl_assert(nToCheck <= 15);
4681
4682   /* Now see if we can find a suitable matching event for
4683      any of the addresses in toCheck[0 .. nToCheck-1]. */
4684   for (j = 0; j < nToCheck; j++) {
4685
4686      cand_a = toCheck[j];
4687      //      VG_(printf)("test %ld %p\n", j, cand_a);
4688
4689      /* Find the first HT element for this address.
4690         We might have several of these. They will be linked via ht_next.
4691         We however need to check various elements as the list contains
4692         all elements that map to the same bucket. */
4693      for (cand_ref = VG_(HT_lookup)( oldrefHT, cand_a );
4694           cand_ref; cand_ref = cand_ref->ht_next) {
4695         if (cand_ref->ga != cand_a)
4696            /* OldRef for another address in this HT bucket. Ignore. */
4697            continue;
4698
4699         if (cand_ref->acc.tsw.thrid == thrid)
4700            /* This is an access by the same thread, but we're only
4701               interested in accesses from other threads.  Ignore. */
4702            continue;
4703
4704         if ((!cand_ref->acc.tsw.isW) && (!isW))
4705            /* We don't want to report a read racing against another
4706               read; that's stupid.  So in this case move on. */
4707            continue;
4708
4709         cand_ref_szB        = cand_ref->acc.tsw.szB;
4710         if (cmp_nonempty_intervals(a, szB, cand_a, cand_ref_szB) != 0)
4711            /* No overlap with the access we're asking about.  Ignore. */
4712            continue;
4713
4714         /* We have a match. Keep this match if it is newer than
4715            the previous match. Note that stamp are Unsigned Words, and
4716            for long running applications, event_map_stamp might have cycled.
4717            So, 'roll' each stamp using event_map_stamp to have the
4718            stamps in the good order, in case event_map_stamp recycled. */
4719         if (!ref
4720             || (ref->stamp - event_map_stamp)
4721                   < (cand_ref->stamp - event_map_stamp)) {
4722            ref = cand_ref;
4723            ref_szB = cand_ref_szB;
4724         }
4725      }
4726
4727      if (ref) {
4728         /* return with success */
4729         Int n, maxNFrames;
4730         RCEC*     ref_rcec = ref->acc.rcec;
4731         tl_assert(ref->acc.tsw.thrid);
4732         tl_assert(ref_rcec);
4733         tl_assert(ref_rcec->magic == RCEC_MAGIC);
4734         tl_assert(ref_szB >= 1);
4735         /* Count how many non-zero frames we have. */
4736         maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
4737         for (n = 0; n < maxNFrames; n++) {
4738            if (0 == ref_rcec->frames[n]) break;
4739         }
4740         *resEC      = VG_(make_ExeContext_from_StackTrace)(ref_rcec->frames,
4741                                                            n);
4742         *resThr     = Thr__from_ThrID(ref->acc.tsw.thrid);
4743         *resSzB     = ref_szB;
4744         *resIsW     = ref->acc.tsw.isW;
4745         *locksHeldW = ref->acc.locksHeldW;
4746         stats__evm__lookup_found++;
4747         return True;
4748      }
4749
4750      /* consider next address in toCheck[] */
4751   } /* for (j = 0; j < nToCheck; j++) */
4752
4753   /* really didn't find anything. */
4754   stats__evm__lookup_notfound++;
4755   return False;
4756}
4757
4758
4759void libhb_event_map_access_history ( Addr a, SizeT szB, Access_t fn )
4760{
4761   OldRef *ref = lru.next;
4762   SizeT ref_szB;
4763   Int n;
4764
4765   while (ref != &mru) {
4766      ref_szB = ref->acc.tsw.szB;
4767      if (cmp_nonempty_intervals(a, szB, ref->ga, ref_szB) == 0) {
4768         RCEC* ref_rcec = ref->acc.rcec;
4769         for (n = 0; n < N_FRAMES; n++) {
4770            if (0 == ref_rcec->frames[n]) {
4771               break;
4772            }
4773         }
4774         (*fn)(ref_rcec->frames, n,
4775               Thr__from_ThrID(ref->acc.tsw.thrid),
4776               ref->ga,
4777               ref_szB,
4778               ref->acc.tsw.isW,
4779               ref->acc.locksHeldW);
4780      }
4781      tl_assert (ref->next == &mru
4782                 || ((ref->stamp - event_map_stamp)
4783                        < ref->next->stamp - event_map_stamp));
4784      ref = ref->next;
4785   }
4786}
4787
4788static void event_map_init ( void )
4789{
4790   Word i;
4791
4792   /* Context (RCEC) pool allocator */
4793   rcec_pool_allocator = VG_(newPA) (
4794                             sizeof(RCEC),
4795                             1000 /* RCECs per pool */,
4796                             HG_(zalloc),
4797                             "libhb.event_map_init.1 (RCEC pools)",
4798                             HG_(free)
4799                          );
4800
4801   /* Context table */
4802   tl_assert(!contextTab);
4803   contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
4804                             N_RCEC_TAB * sizeof(RCEC*) );
4805   for (i = 0; i < N_RCEC_TAB; i++)
4806      contextTab[i] = NULL;
4807
4808   /* Oldref pool allocator */
4809   oldref_pool_allocator = VG_(newPA)(
4810                               sizeof(OldRef),
4811                               1000 /* OldRefs per pool */,
4812                               HG_(zalloc),
4813                               "libhb.event_map_init.3 (OldRef pools)",
4814                               HG_(free)
4815                            );
4816
4817   /* Oldref hashtable */
4818   tl_assert(!oldrefHT);
4819   oldrefHT = VG_(HT_construct) ("libhb.event_map_init.4 (oldref hashtable)");
4820
4821   oldrefHTN = 0;
4822   mru.prev = &lru;
4823   mru.next = NULL;
4824   lru.prev = NULL;
4825   lru.next = &mru;
4826   mru.acc = (Thr_n_RCEC) {.tsw = {.thrid = 0,
4827                                   .szB = 0,
4828                                   .isW = 0},
4829                           .locksHeldW = 0,
4830                           .rcec = NULL};
4831   lru.acc = mru.acc;
4832}
4833
4834static void event_map__check_reference_counts ( void )
4835{
4836   RCEC*   rcec;
4837   OldRef* oldref;
4838   Word    i;
4839   UWord   nEnts = 0;
4840
4841   /* Set the 'check' reference counts to zero.  Also, optionally
4842      check that the real reference counts are non-zero.  We allow
4843      these to fall to zero before a GC, but the GC must get rid of
4844      all those that are zero, hence none should be zero after a
4845      GC. */
4846   for (i = 0; i < N_RCEC_TAB; i++) {
4847      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4848         nEnts++;
4849         tl_assert(rcec);
4850         tl_assert(rcec->magic == RCEC_MAGIC);
4851         rcec->rcX = 0;
4852      }
4853   }
4854
4855   /* check that the stats are sane */
4856   tl_assert(nEnts == stats__ctxt_tab_curr);
4857   tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
4858
4859   /* visit all the referencing points, inc check ref counts */
4860   VG_(HT_ResetIter)( oldrefHT );
4861   oldref = VG_(HT_Next)( oldrefHT );
4862   while (oldref) {
4863      tl_assert (oldref->acc.tsw.thrid);
4864      tl_assert (oldref->acc.rcec);
4865      tl_assert (oldref->acc.rcec->magic == RCEC_MAGIC);
4866      oldref->acc.rcec->rcX++;
4867      oldref = VG_(HT_Next)( oldrefHT );
4868   }
4869
4870   /* compare check ref counts with actual */
4871   for (i = 0; i < N_RCEC_TAB; i++) {
4872      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4873         tl_assert(rcec->rc == rcec->rcX);
4874      }
4875   }
4876}
4877
4878__attribute__((noinline))
4879static void do_RCEC_GC ( void )
4880{
4881   UInt i;
4882
4883   if (VG_(clo_stats)) {
4884      static UInt ctr = 1;
4885      VG_(message)(Vg_DebugMsg,
4886                  "libhb: RCEC GC: #%u  %lu slots,"
4887                   " %lu cur ents(ref'd %lu),"
4888                   " %lu max ents\n",
4889                   ctr++,
4890                   (UWord)N_RCEC_TAB,
4891                   stats__ctxt_tab_curr, RCEC_referenced,
4892                   stats__ctxt_tab_max );
4893   }
4894   tl_assert (stats__ctxt_tab_curr > RCEC_referenced);
4895
4896   /* Throw away all RCECs with zero reference counts */
4897   for (i = 0; i < N_RCEC_TAB; i++) {
4898      RCEC** pp = &contextTab[i];
4899      RCEC*  p  = *pp;
4900      while (p) {
4901         if (p->rc == 0) {
4902            *pp = p->next;
4903            free_RCEC(p);
4904            p = *pp;
4905            tl_assert(stats__ctxt_tab_curr > 0);
4906            stats__ctxt_rcec_gc_discards++;
4907            stats__ctxt_tab_curr--;
4908         } else {
4909            pp = &p->next;
4910            p = p->next;
4911         }
4912      }
4913   }
4914
4915   tl_assert (stats__ctxt_tab_curr == RCEC_referenced);
4916}
4917
4918/////////////////////////////////////////////////////////
4919//                                                     //
4920// Core MSM                                            //
4921//                                                     //
4922/////////////////////////////////////////////////////////
4923
4924/* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
4925   Nov 08, and again after [...],
4926   June 09. */
4927
/* Statistics for the core memory state machine.  stats__msmcread is
   bumped on every call to msmcread.
   NOTE(review): the other three presumably count msmcwrite calls and
   the reads/writes that changed the shadow value -- confirm against
   msmcread/msmcwrite. */
static ULong stats__msmcread         = 0;
static ULong stats__msmcread_change  = 0;
static ULong stats__msmcwrite        = 0;
static ULong stats__msmcwrite_change = 0;
4932
4933/* Some notes on the H1 history mechanism:
4934
4935   Transition rules are:
4936
4937   read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
4938   write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
4939
4940   After any access by a thread T to a location L, L's constraint pair
4941   (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
4942
   After a race by thread T conflicting with some previous access by
   some other thread U, for a location with constraint (before
   processing the later access) (Cr,Cw), then Cw[U] is the segment in
   which the previous access lies.
4947
4948   Hence in record_race_info, we pass in Cfailed and Kfailed, which
4949   are compared so as to find out which thread(s) this access
4950   conflicts with.  Once that is established, we also require the
4951   pre-update Cw for the location, so we can index into it for those
4952   threads, to get the scalar clock values for the point at which the
4953   former accesses were made.  (In fact we only bother to do any of
4954   this for an arbitrarily chosen one of the conflicting threads, as
4955   that's simpler, it avoids flooding the user with vast amounts of
4956   mostly useless information, and because the program is wrong if it
4957   contains any races at all -- so we don't really need to show all
4958   conflicting access pairs initially, so long as we only show none if
4959   none exist).
4960
4961   ---
4962
4963   That requires the auxiliary proof that
4964
4965      (Cr `join` Kw)[T] == Kw[T]
4966
4967   Why should that be true?  Because for any thread T, Kw[T] >= the
4968   scalar clock value for T known by any other thread.  In other
4969   words, because T's value for its own scalar clock is at least as up
4970   to date as the value for it known by any other thread (that is true
4971   for both the R- and W- scalar clocks).  Hence no other thread will
4972   be able to feed in a value for that element (indirectly via a
4973   constraint) which will exceed Kw[T], and hence the join cannot
4974   cause that particular element to advance.
4975*/
4976
4977__attribute__((noinline))
4978static void record_race_info ( Thr* acc_thr,
4979                               Addr acc_addr, SizeT szB, Bool isWrite,
4980                               VtsID Cfailed,
4981                               VtsID Kfailed,
4982                               VtsID Cw )
4983{
4984   /* Call here to report a race.  We just hand it onwards to
4985      HG_(record_error_Race).  If that in turn discovers that the
4986      error is going to be collected, then, at history_level 2, that
4987      queries the conflicting-event map.  The alternative would be to
4988      query it right here.  But that causes a lot of pointless queries
4989      for errors which will shortly be discarded as duplicates, and
4990      can become a performance overhead; so we defer the query until
4991      we know the error is not a duplicate. */
4992
4993   /* Stacks for the bounds of the (or one of the) conflicting
4994      segment(s).  These are only set at history_level 1. */
4995   ExeContext* hist1_seg_start = NULL;
4996   ExeContext* hist1_seg_end   = NULL;
4997   Thread*     hist1_conf_thr  = NULL;
4998
4999   tl_assert(acc_thr);
5000   tl_assert(acc_thr->hgthread);
5001   tl_assert(acc_thr->hgthread->hbthr == acc_thr);
5002   tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
5003
5004   if (HG_(clo_history_level) == 1) {
5005      Bool found;
5006      Word firstIx, lastIx;
5007      ULong_n_EC key;
5008
5009      /* At history_level 1, we must round up the relevant stack-pair
5010         for the conflicting segment right now.  This is because
5011         deferring it is complex; we can't (easily) put Kfailed and
5012         Cfailed into the XError and wait for later without
5013         getting tied up in difficulties with VtsID reference
5014         counting.  So just do it now. */
5015      Thr*  confThr;
5016      ULong confTym = 0;
5017      /* Which thread are we in conflict with?  There may be more than
5018         one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
5019         (in fact it's the one with the lowest Thr* value). */
5020      confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
5021      /* This must exist!  since if it was NULL then there's no
5022         conflict (semantics of return value of
5023         VtsID__findFirst_notLEQ), and msmc{read,write}, which has
5024         called us, just checked exactly this -- that there was in
5025         fact a race. */
5026      tl_assert(confThr);
5027
5028      /* Get the scalar clock value that the conflicting thread
5029         introduced into the constraint.  A careful examination of the
5030         base machine rules shows that this must be the same as the
5031         conflicting thread's scalar clock when it created this
5032         constraint.  Hence we know the scalar clock of the
5033         conflicting thread when the conflicting access was made. */
5034      confTym = VtsID__indexAt( Cfailed, confThr );
5035
5036      /* Using this scalar clock, index into the conflicting thread's
5037         collection of stack traces made each time its vector clock
5038         (hence its scalar clock) changed.  This gives the stack
5039         traces at the start and end of the conflicting segment (well,
5040         as per comment just above, of one of the conflicting
5041         segments, if there are more than one). */
5042      key.ull = confTym;
5043      key.ec  = NULL;
5044      /* tl_assert(confThr); -- asserted just above */
5045      tl_assert(confThr->local_Kws_n_stacks);
5046      firstIx = lastIx = 0;
5047      found = VG_(lookupXA_UNSAFE)(
5048                 confThr->local_Kws_n_stacks,
5049                 &key, &firstIx, &lastIx,
5050                 (XACmpFn_t)cmp__ULong_n_EC__by_ULong
5051              );
5052      if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
5053                         "confTym %llu found %d (%ld,%ld)\n",
5054                         Cfailed, Kfailed, Cw,
5055                         confThr, confTym, found, firstIx, lastIx);
5056      /* We can't indefinitely collect stack traces at VTS
5057         transitions, since we'd eventually run out of memory.  Hence
5058         note_local_Kw_n_stack_for will eventually throw away old
5059         ones, which in turn means we might fail to find index value
5060         confTym in the array. */
5061      if (found) {
5062         ULong_n_EC *pair_start, *pair_end;
5063         pair_start
5064            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
5065         hist1_seg_start = pair_start->ec;
5066         if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
5067            pair_end
5068               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
5069                                            lastIx+1 );
5070            /* from properties of VG_(lookupXA) and the comparison fn used: */
5071            tl_assert(pair_start->ull < pair_end->ull);
5072            hist1_seg_end = pair_end->ec;
5073            /* Could do a bit better here.  It may be that pair_end
5074               doesn't have a stack, but the following entries in the
               array have the same scalar Kw and do have a stack.  So
5076               we should search a bit further along the array than
5077               lastIx+1 if hist1_seg_end is NULL. */
5078         } else {
5079            if (!confThr->llexit_done)
5080               hist1_seg_end = main_get_EC( confThr );
5081         }
5082         // seg_start could be NULL iff this is the first stack in the thread
5083         //if (seg_start) VG_(pp_ExeContext)(seg_start);
5084         //if (seg_end)   VG_(pp_ExeContext)(seg_end);
5085         hist1_conf_thr = confThr->hgthread;
5086      }
5087   }
5088
5089   HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
5090                           szB, isWrite,
5091                           hist1_conf_thr, hist1_seg_start, hist1_seg_end );
5092}
5093
5094static Bool is_sane_SVal_C ( SVal sv ) {
5095   Bool leq;
5096   if (!SVal__isC(sv)) return True;
5097   leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
5098   return leq;
5099}
5100
5101
5102/* Compute new state following a read */
5103static inline SVal msmcread ( SVal svOld,
5104                              /* The following are only needed for
5105                                 creating error reports. */
5106                              Thr* acc_thr,
5107                              Addr acc_addr, SizeT szB )
5108{
5109   SVal svNew = SVal_INVALID;
5110   stats__msmcread++;
5111
5112   /* Redundant sanity check on the constraints */
5113   if (CHECK_MSM) {
5114      tl_assert(is_sane_SVal_C(svOld));
5115   }
5116
5117   if (LIKELY(SVal__isC(svOld))) {
5118      VtsID tviR  = acc_thr->viR;
5119      VtsID tviW  = acc_thr->viW;
5120      VtsID rmini = SVal__unC_Rmin(svOld);
5121      VtsID wmini = SVal__unC_Wmin(svOld);
5122      Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
5123      if (LIKELY(leq)) {
5124         /* no race */
5125         /* Note: RWLOCK subtlety: use tviW, not tviR */
5126         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5127         goto out;
5128      } else {
5129         /* assert on sanity of constraints. */
5130         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5131         tl_assert(leqxx);
5132         // same as in non-race case
5133         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5134         record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
5135                           rmini, /* Cfailed */
5136                           tviR,  /* Kfailed */
5137                           wmini  /* Cw */ );
5138         goto out;
5139      }
5140   }
5141   if (SVal__isA(svOld)) {
5142      /* reading no-access memory (sigh); leave unchanged */
5143      /* check for no pollution */
5144      tl_assert(svOld == SVal_NOACCESS);
5145      svNew = SVal_NOACCESS;
5146      goto out;
5147   }
5148   if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
5149   tl_assert(0);
5150
5151  out:
5152   if (CHECK_MSM) {
5153      tl_assert(is_sane_SVal_C(svNew));
5154   }
5155   if (UNLIKELY(svNew != svOld)) {
5156      tl_assert(svNew != SVal_INVALID);
5157      if (HG_(clo_history_level) >= 2
5158          && SVal__isC(svOld) && SVal__isC(svNew)) {
5159         event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
5160         stats__msmcread_change++;
5161      }
5162   }
5163   return svNew;
5164}
5165
5166
5167/* Compute new state following a write */
5168static inline SVal msmcwrite ( SVal svOld,
5169                              /* The following are only needed for
5170                                 creating error reports. */
5171                              Thr* acc_thr,
5172                              Addr acc_addr, SizeT szB )
5173{
5174   SVal svNew = SVal_INVALID;
5175   stats__msmcwrite++;
5176
5177   /* Redundant sanity check on the constraints */
5178   if (CHECK_MSM) {
5179      tl_assert(is_sane_SVal_C(svOld));
5180   }
5181
5182   if (LIKELY(SVal__isC(svOld))) {
5183      VtsID tviW  = acc_thr->viW;
5184      VtsID wmini = SVal__unC_Wmin(svOld);
5185      Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
5186      if (LIKELY(leq)) {
5187         /* no race */
5188         svNew = SVal__mkC( tviW, tviW );
5189         goto out;
5190      } else {
5191         VtsID rmini = SVal__unC_Rmin(svOld);
5192         /* assert on sanity of constraints. */
5193         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5194         tl_assert(leqxx);
5195         // same as in non-race case
5196         // proof: in the non-race case, we have
5197         //    rmini <= wmini (invar on constraints)
5198         //    tviW <= tviR (invar on thread clocks)
5199         //    wmini <= tviW (from run-time check)
5200         // hence from transitivity of <= we have
5201         //    rmini <= wmini <= tviW
5202         // and so join(rmini,tviW) == tviW
5203         // and    join(wmini,tviW) == tviW
5204         // qed.
5205         svNew = SVal__mkC( VtsID__join2(rmini, tviW),
5206                            VtsID__join2(wmini, tviW) );
5207         record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
5208                           wmini, /* Cfailed */
5209                           tviW,  /* Kfailed */
5210                           wmini  /* Cw */ );
5211         goto out;
5212      }
5213   }
5214   if (SVal__isA(svOld)) {
5215      /* writing no-access memory (sigh); leave unchanged */
5216      /* check for no pollution */
5217      tl_assert(svOld == SVal_NOACCESS);
5218      svNew = SVal_NOACCESS;
5219      goto out;
5220   }
5221   if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
5222   tl_assert(0);
5223
5224  out:
5225   if (CHECK_MSM) {
5226      tl_assert(is_sane_SVal_C(svNew));
5227   }
5228   if (UNLIKELY(svNew != svOld)) {
5229      tl_assert(svNew != SVal_INVALID);
5230      if (HG_(clo_history_level) >= 2
5231          && SVal__isC(svOld) && SVal__isC(svNew)) {
5232         event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
5233         stats__msmcwrite_change++;
5234      }
5235   }
5236   return svNew;
5237}
5238
5239
5240/////////////////////////////////////////////////////////
5241//                                                     //
5242// Apply core MSM to specific memory locations         //
5243//                                                     //
5244/////////////////////////////////////////////////////////
5245
5246/*------------- ZSM accesses: 8 bit sapply ------------- */
5247
5248static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
5249   CacheLine* cl;
5250   UWord      cloff, tno, toff;
5251   SVal       svOld, svNew;
5252   UShort     descr;
5253   stats__cline_cread08s++;
5254   cl    = get_cacheline(a);
5255   cloff = get_cacheline_offset(a);
5256   tno   = get_treeno(a);
5257   toff  = get_tree_offset(a); /* == 0 .. 7 */
5258   descr = cl->descrs[tno];
5259   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5260      SVal* tree = &cl->svals[tno << 3];
5261      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5262      if (CHECK_ZSM)
5263         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5264   }
5265   svOld = cl->svals[cloff];
5266   svNew = msmcread( svOld, thr,a,1 );
5267   if (CHECK_ZSM)
5268      tl_assert(svNew != SVal_INVALID);
5269   cl->svals[cloff] = svNew;
5270}
5271
5272static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
5273   CacheLine* cl;
5274   UWord      cloff, tno, toff;
5275   SVal       svOld, svNew;
5276   UShort     descr;
5277   stats__cline_cwrite08s++;
5278   cl    = get_cacheline(a);
5279   cloff = get_cacheline_offset(a);
5280   tno   = get_treeno(a);
5281   toff  = get_tree_offset(a); /* == 0 .. 7 */
5282   descr = cl->descrs[tno];
5283   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5284      SVal* tree = &cl->svals[tno << 3];
5285      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5286      if (CHECK_ZSM)
5287         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5288   }
5289   svOld = cl->svals[cloff];
5290   svNew = msmcwrite( svOld, thr,a,1 );
5291   if (CHECK_ZSM)
5292      tl_assert(svNew != SVal_INVALID);
5293   cl->svals[cloff] = svNew;
5294}
5295
5296/*------------- ZSM accesses: 16 bit sapply ------------- */
5297
5298static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
5299   CacheLine* cl;
5300   UWord      cloff, tno, toff;
5301   SVal       svOld, svNew;
5302   UShort     descr;
5303   stats__cline_cread16s++;
5304   if (UNLIKELY(!aligned16(a))) goto slowcase;
5305   cl    = get_cacheline(a);
5306   cloff = get_cacheline_offset(a);
5307   tno   = get_treeno(a);
5308   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5309   descr = cl->descrs[tno];
5310   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5311      if (valid_value_is_below_me_16(descr, toff)) {
5312         goto slowcase;
5313      } else {
5314         SVal* tree = &cl->svals[tno << 3];
5315         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5316      }
5317      if (CHECK_ZSM)
5318         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5319   }
5320   svOld = cl->svals[cloff];
5321   svNew = msmcread( svOld, thr,a,2 );
5322   if (CHECK_ZSM)
5323      tl_assert(svNew != SVal_INVALID);
5324   cl->svals[cloff] = svNew;
5325   return;
5326  slowcase: /* misaligned, or must go further down the tree */
5327   stats__cline_16to8splits++;
5328   zsm_sapply08__msmcread( thr, a + 0 );
5329   zsm_sapply08__msmcread( thr, a + 1 );
5330}
5331
5332static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
5333   CacheLine* cl;
5334   UWord      cloff, tno, toff;
5335   SVal       svOld, svNew;
5336   UShort     descr;
5337   stats__cline_cwrite16s++;
5338   if (UNLIKELY(!aligned16(a))) goto slowcase;
5339   cl    = get_cacheline(a);
5340   cloff = get_cacheline_offset(a);
5341   tno   = get_treeno(a);
5342   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5343   descr = cl->descrs[tno];
5344   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5345      if (valid_value_is_below_me_16(descr, toff)) {
5346         goto slowcase;
5347      } else {
5348         SVal* tree = &cl->svals[tno << 3];
5349         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5350      }
5351      if (CHECK_ZSM)
5352         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5353   }
5354   svOld = cl->svals[cloff];
5355   svNew = msmcwrite( svOld, thr,a,2 );
5356   if (CHECK_ZSM)
5357      tl_assert(svNew != SVal_INVALID);
5358   cl->svals[cloff] = svNew;
5359   return;
5360  slowcase: /* misaligned, or must go further down the tree */
5361   stats__cline_16to8splits++;
5362   zsm_sapply08__msmcwrite( thr, a + 0 );
5363   zsm_sapply08__msmcwrite( thr, a + 1 );
5364}
5365
5366/*------------- ZSM accesses: 32 bit sapply ------------- */
5367
5368static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
5369   CacheLine* cl;
5370   UWord      cloff, tno, toff;
5371   SVal       svOld, svNew;
5372   UShort     descr;
5373   stats__cline_cread32s++;
5374   if (UNLIKELY(!aligned32(a))) goto slowcase;
5375   cl    = get_cacheline(a);
5376   cloff = get_cacheline_offset(a);
5377   tno   = get_treeno(a);
5378   toff  = get_tree_offset(a); /* == 0 or 4 */
5379   descr = cl->descrs[tno];
5380   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5381      if (valid_value_is_above_me_32(descr, toff)) {
5382         SVal* tree = &cl->svals[tno << 3];
5383         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5384      } else {
5385         goto slowcase;
5386      }
5387      if (CHECK_ZSM)
5388         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5389   }
5390   svOld = cl->svals[cloff];
5391   svNew = msmcread( svOld, thr,a,4 );
5392   if (CHECK_ZSM)
5393      tl_assert(svNew != SVal_INVALID);
5394   cl->svals[cloff] = svNew;
5395   return;
5396  slowcase: /* misaligned, or must go further down the tree */
5397   stats__cline_32to16splits++;
5398   zsm_sapply16__msmcread( thr, a + 0 );
5399   zsm_sapply16__msmcread( thr, a + 2 );
5400}
5401
5402static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
5403   CacheLine* cl;
5404   UWord      cloff, tno, toff;
5405   SVal       svOld, svNew;
5406   UShort     descr;
5407   stats__cline_cwrite32s++;
5408   if (UNLIKELY(!aligned32(a))) goto slowcase;
5409   cl    = get_cacheline(a);
5410   cloff = get_cacheline_offset(a);
5411   tno   = get_treeno(a);
5412   toff  = get_tree_offset(a); /* == 0 or 4 */
5413   descr = cl->descrs[tno];
5414   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5415      if (valid_value_is_above_me_32(descr, toff)) {
5416         SVal* tree = &cl->svals[tno << 3];
5417         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5418      } else {
5419         goto slowcase;
5420      }
5421      if (CHECK_ZSM)
5422         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5423   }
5424   svOld = cl->svals[cloff];
5425   svNew = msmcwrite( svOld, thr,a,4 );
5426   if (CHECK_ZSM)
5427      tl_assert(svNew != SVal_INVALID);
5428   cl->svals[cloff] = svNew;
5429   return;
5430  slowcase: /* misaligned, or must go further down the tree */
5431   stats__cline_32to16splits++;
5432   zsm_sapply16__msmcwrite( thr, a + 0 );
5433   zsm_sapply16__msmcwrite( thr, a + 2 );
5434}
5435
5436/*------------- ZSM accesses: 64 bit sapply ------------- */
5437
5438static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
5439   CacheLine* cl;
5440   UWord      cloff, tno;
5441   //UWord      toff;
5442   SVal       svOld, svNew;
5443   UShort     descr;
5444   stats__cline_cread64s++;
5445   if (UNLIKELY(!aligned64(a))) goto slowcase;
5446   cl    = get_cacheline(a);
5447   cloff = get_cacheline_offset(a);
5448   tno   = get_treeno(a);
5449   //toff  = get_tree_offset(a); /* == 0, unused */
5450   descr = cl->descrs[tno];
5451   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5452      goto slowcase;
5453   }
5454   svOld = cl->svals[cloff];
5455   svNew = msmcread( svOld, thr,a,8 );
5456   if (CHECK_ZSM)
5457      tl_assert(svNew != SVal_INVALID);
5458   cl->svals[cloff] = svNew;
5459   return;
5460  slowcase: /* misaligned, or must go further down the tree */
5461   stats__cline_64to32splits++;
5462   zsm_sapply32__msmcread( thr, a + 0 );
5463   zsm_sapply32__msmcread( thr, a + 4 );
5464}
5465
5466static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
5467   CacheLine* cl;
5468   UWord      cloff, tno;
5469   //UWord      toff;
5470   SVal       svOld, svNew;
5471   UShort     descr;
5472   stats__cline_cwrite64s++;
5473   if (UNLIKELY(!aligned64(a))) goto slowcase;
5474   cl    = get_cacheline(a);
5475   cloff = get_cacheline_offset(a);
5476   tno   = get_treeno(a);
5477   //toff  = get_tree_offset(a); /* == 0, unused */
5478   descr = cl->descrs[tno];
5479   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5480      goto slowcase;
5481   }
5482   svOld = cl->svals[cloff];
5483   svNew = msmcwrite( svOld, thr,a,8 );
5484   if (CHECK_ZSM)
5485      tl_assert(svNew != SVal_INVALID);
5486   cl->svals[cloff] = svNew;
5487   return;
5488  slowcase: /* misaligned, or must go further down the tree */
5489   stats__cline_64to32splits++;
5490   zsm_sapply32__msmcwrite( thr, a + 0 );
5491   zsm_sapply32__msmcwrite( thr, a + 4 );
5492}
5493
5494/*--------------- ZSM accesses: 8 bit swrite --------------- */
5495
5496static
5497void zsm_swrite08 ( Addr a, SVal svNew ) {
5498   CacheLine* cl;
5499   UWord      cloff, tno, toff;
5500   UShort     descr;
5501   stats__cline_swrite08s++;
5502   cl    = get_cacheline(a);
5503   cloff = get_cacheline_offset(a);
5504   tno   = get_treeno(a);
5505   toff  = get_tree_offset(a); /* == 0 .. 7 */
5506   descr = cl->descrs[tno];
5507   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5508      SVal* tree = &cl->svals[tno << 3];
5509      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5510      if (CHECK_ZSM)
5511         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5512   }
5513   tl_assert(svNew != SVal_INVALID);
5514   cl->svals[cloff] = svNew;
5515}
5516
5517/*--------------- ZSM accesses: 16 bit swrite --------------- */
5518
5519static
5520void zsm_swrite16 ( Addr a, SVal svNew ) {
5521   CacheLine* cl;
5522   UWord      cloff, tno, toff;
5523   UShort     descr;
5524   stats__cline_swrite16s++;
5525   if (UNLIKELY(!aligned16(a))) goto slowcase;
5526   cl    = get_cacheline(a);
5527   cloff = get_cacheline_offset(a);
5528   tno   = get_treeno(a);
5529   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5530   descr = cl->descrs[tno];
5531   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5532      if (valid_value_is_below_me_16(descr, toff)) {
5533         /* Writing at this level.  Need to fix up 'descr'. */
5534         cl->descrs[tno] = pullup_descr_to_16(descr, toff);
5535         /* At this point, the tree does not match cl->descr[tno] any
5536            more.  The assignments below will fix it up. */
5537      } else {
5538         /* We can't indiscriminately write on the w16 node as in the
5539            w64 case, as that might make the node inconsistent with
5540            its parent.  So first, pull down to this level. */
5541         SVal* tree = &cl->svals[tno << 3];
5542         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5543      if (CHECK_ZSM)
5544         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5545      }
5546   }
5547   tl_assert(svNew != SVal_INVALID);
5548   cl->svals[cloff + 0] = svNew;
5549   cl->svals[cloff + 1] = SVal_INVALID;
5550   return;
5551  slowcase: /* misaligned */
5552   stats__cline_16to8splits++;
5553   zsm_swrite08( a + 0, svNew );
5554   zsm_swrite08( a + 1, svNew );
5555}
5556
5557/*--------------- ZSM accesses: 32 bit swrite --------------- */
5558
5559static
5560void zsm_swrite32 ( Addr a, SVal svNew ) {
5561   CacheLine* cl;
5562   UWord      cloff, tno, toff;
5563   UShort     descr;
5564   stats__cline_swrite32s++;
5565   if (UNLIKELY(!aligned32(a))) goto slowcase;
5566   cl    = get_cacheline(a);
5567   cloff = get_cacheline_offset(a);
5568   tno   = get_treeno(a);
5569   toff  = get_tree_offset(a); /* == 0 or 4 */
5570   descr = cl->descrs[tno];
5571   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5572      if (valid_value_is_above_me_32(descr, toff)) {
5573         /* We can't indiscriminately write on the w32 node as in the
5574            w64 case, as that might make the node inconsistent with
5575            its parent.  So first, pull down to this level. */
5576         SVal* tree = &cl->svals[tno << 3];
5577         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5578         if (CHECK_ZSM)
5579            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5580      } else {
5581         /* Writing at this level.  Need to fix up 'descr'. */
5582         cl->descrs[tno] = pullup_descr_to_32(descr, toff);
5583         /* At this point, the tree does not match cl->descr[tno] any
5584            more.  The assignments below will fix it up. */
5585      }
5586   }
5587   tl_assert(svNew != SVal_INVALID);
5588   cl->svals[cloff + 0] = svNew;
5589   cl->svals[cloff + 1] = SVal_INVALID;
5590   cl->svals[cloff + 2] = SVal_INVALID;
5591   cl->svals[cloff + 3] = SVal_INVALID;
5592   return;
5593  slowcase: /* misaligned */
5594   stats__cline_32to16splits++;
5595   zsm_swrite16( a + 0, svNew );
5596   zsm_swrite16( a + 2, svNew );
5597}
5598
5599/*--------------- ZSM accesses: 64 bit swrite --------------- */
5600
5601static
5602void zsm_swrite64 ( Addr a, SVal svNew ) {
5603   CacheLine* cl;
5604   UWord      cloff, tno;
5605   //UWord    toff;
5606   stats__cline_swrite64s++;
5607   if (UNLIKELY(!aligned64(a))) goto slowcase;
5608   cl    = get_cacheline(a);
5609   cloff = get_cacheline_offset(a);
5610   tno   = get_treeno(a);
5611   //toff  = get_tree_offset(a); /* == 0, unused */
5612   cl->descrs[tno] = TREE_DESCR_64;
5613   tl_assert(svNew != SVal_INVALID);
5614   cl->svals[cloff + 0] = svNew;
5615   cl->svals[cloff + 1] = SVal_INVALID;
5616   cl->svals[cloff + 2] = SVal_INVALID;
5617   cl->svals[cloff + 3] = SVal_INVALID;
5618   cl->svals[cloff + 4] = SVal_INVALID;
5619   cl->svals[cloff + 5] = SVal_INVALID;
5620   cl->svals[cloff + 6] = SVal_INVALID;
5621   cl->svals[cloff + 7] = SVal_INVALID;
5622   return;
5623  slowcase: /* misaligned */
5624   stats__cline_64to32splits++;
5625   zsm_swrite32( a + 0, svNew );
5626   zsm_swrite32( a + 4, svNew );
5627}
5628
5629/*------------- ZSM accesses: 8 bit sread/scopy ------------- */
5630
5631static
5632SVal zsm_sread08 ( Addr a ) {
5633   CacheLine* cl;
5634   UWord      cloff, tno, toff;
5635   UShort     descr;
5636   stats__cline_sread08s++;
5637   cl    = get_cacheline(a);
5638   cloff = get_cacheline_offset(a);
5639   tno   = get_treeno(a);
5640   toff  = get_tree_offset(a); /* == 0 .. 7 */
5641   descr = cl->descrs[tno];
5642   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5643      SVal* tree = &cl->svals[tno << 3];
5644      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5645   }
5646   return cl->svals[cloff];
5647}
5648
5649static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
5650   SVal       sv;
5651   stats__cline_scopy08s++;
5652   sv = zsm_sread08( src );
5653   zsm_swrite08( dst, sv );
5654}
5655
5656
5657/* Block-copy states (needed for implementing realloc()).  Note this
5658   doesn't change the filtering arrangements.  The caller of
5659   zsm_scopy_range needs to attend to that. */
5660
5661static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
5662{
5663   SizeT i;
5664   if (len == 0)
5665      return;
5666
5667   /* assert for non-overlappingness */
5668   tl_assert(src+len <= dst || dst+len <= src);
5669
5670   /* To be simple, just copy byte by byte.  But so as not to wreck
5671      performance for later accesses to dst[0 .. len-1], normalise
5672      destination lines as we finish with them, and also normalise the
5673      line containing the first and last address. */
5674   for (i = 0; i < len; i++) {
5675      Bool normalise
5676         = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
5677           || i == 0       /* first in range */
5678           || i == len-1;  /* last in range */
5679      zsm_scopy08( src+i, dst+i, normalise );
5680   }
5681}
5682
5683
5684/* For setting address ranges to a given value.  Has considerable
5685   sophistication so as to avoid generating large numbers of pointless
5686   cache loads/writebacks for large ranges. */
5687
5688/* Do small ranges in-cache, in the obvious way. */
5689static
5690void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
5691{
5692   /* fast track a couple of common cases */
5693   if (len == 4 && aligned32(a)) {
5694      zsm_swrite32( a, svNew );
5695      return;
5696   }
5697   if (len == 8 && aligned64(a)) {
5698      zsm_swrite64( a, svNew );
5699      return;
5700   }
5701
5702   /* be completely general (but as efficient as possible) */
5703   if (len == 0) return;
5704
5705   if (!aligned16(a) && len >= 1) {
5706      zsm_swrite08( a, svNew );
5707      a += 1;
5708      len -= 1;
5709      tl_assert(aligned16(a));
5710   }
5711   if (len == 0) return;
5712
5713   if (!aligned32(a) && len >= 2) {
5714      zsm_swrite16( a, svNew );
5715      a += 2;
5716      len -= 2;
5717      tl_assert(aligned32(a));
5718   }
5719   if (len == 0) return;
5720
5721   if (!aligned64(a) && len >= 4) {
5722      zsm_swrite32( a, svNew );
5723      a += 4;
5724      len -= 4;
5725      tl_assert(aligned64(a));
5726   }
5727   if (len == 0) return;
5728
5729   if (len >= 8) {
5730      tl_assert(aligned64(a));
5731      while (len >= 8) {
5732         zsm_swrite64( a, svNew );
5733         a += 8;
5734         len -= 8;
5735      }
5736      tl_assert(aligned64(a));
5737   }
5738   if (len == 0) return;
5739
5740   if (len >= 4)
5741      tl_assert(aligned32(a));
5742   if (len >= 4) {
5743      zsm_swrite32( a, svNew );
5744      a += 4;
5745      len -= 4;
5746   }
5747   if (len == 0) return;
5748
5749   if (len >= 2)
5750      tl_assert(aligned16(a));
5751   if (len >= 2) {
5752      zsm_swrite16( a, svNew );
5753      a += 2;
5754      len -= 2;
5755   }
5756   if (len == 0) return;
5757
5758   if (len >= 1) {
5759      zsm_swrite08( a, svNew );
5760      //a += 1;
5761      len -= 1;
5762   }
5763   tl_assert(len == 0);
5764}
5765
5766
5767/* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
5768   for larger ranges, try to operate directly on the out-of-cache
5769   representation, rather than dragging lines into the cache,
5770   overwriting them, and forcing them out.  This turns out to be an
5771   important performance optimisation.
5772
5773   Note that this doesn't change the filtering arrangements.  The
5774   caller of zsm_sset_range needs to attend to that. */
5775
5776static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
5777{
5778   tl_assert(svNew != SVal_INVALID);
5779   stats__cache_make_New_arange += (ULong)len;
5780
5781   if (0 && len > 500)
5782      VG_(printf)("make New      ( %#lx, %lu )\n", a, len );
5783
5784   if (0) {
5785      static UWord n_New_in_cache = 0;
5786      static UWord n_New_not_in_cache = 0;
5787      /* tag is 'a' with the in-line offset masked out,
5788         eg a[31]..a[4] 0000 */
5789      Addr       tag = a & ~(N_LINE_ARANGE - 1);
5790      UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
5791      if (LIKELY(tag == cache_shmem.tags0[wix])) {
5792         n_New_in_cache++;
5793      } else {
5794         n_New_not_in_cache++;
5795      }
5796      if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
5797         VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
5798                     n_New_in_cache, n_New_not_in_cache );
5799   }
5800
5801   if (LIKELY(len < 2 * N_LINE_ARANGE)) {
5802      zsm_sset_range_SMALL( a, len, svNew );
5803   } else {
5804      Addr  before_start  = a;
5805      Addr  aligned_start = cacheline_ROUNDUP(a);
5806      Addr  after_start   = cacheline_ROUNDDN(a + len);
5807      UWord before_len    = aligned_start - before_start;
5808      UWord aligned_len   = after_start - aligned_start;
5809      UWord after_len     = a + len - after_start;
5810      tl_assert(before_start <= aligned_start);
5811      tl_assert(aligned_start <= after_start);
5812      tl_assert(before_len < N_LINE_ARANGE);
5813      tl_assert(after_len < N_LINE_ARANGE);
5814      tl_assert(get_cacheline_offset(aligned_start) == 0);
5815      if (get_cacheline_offset(a) == 0) {
5816         tl_assert(before_len == 0);
5817         tl_assert(a == aligned_start);
5818      }
5819      if (get_cacheline_offset(a+len) == 0) {
5820         tl_assert(after_len == 0);
5821         tl_assert(after_start == a+len);
5822      }
5823      if (before_len > 0) {
5824         zsm_sset_range_SMALL( before_start, before_len, svNew );
5825      }
5826      if (after_len > 0) {
5827         zsm_sset_range_SMALL( after_start, after_len, svNew );
5828      }
5829      stats__cache_make_New_inZrep += (ULong)aligned_len;
5830
5831      while (1) {
5832         Addr tag;
5833         UWord wix;
5834         if (aligned_start >= after_start)
5835            break;
5836         tl_assert(get_cacheline_offset(aligned_start) == 0);
5837         tag = aligned_start & ~(N_LINE_ARANGE - 1);
5838         wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
5839         if (tag == cache_shmem.tags0[wix]) {
5840            UWord i;
5841            for (i = 0; i < N_LINE_ARANGE / 8; i++)
5842               zsm_swrite64( aligned_start + i * 8, svNew );
5843         } else {
5844            UWord i;
5845            Word zix;
5846            SecMap* sm;
5847            LineZ* lineZ;
5848            /* This line is not in the cache.  Do not force it in; instead
5849               modify it in-place. */
5850            /* find the Z line to write in and rcdec it or the
5851               associated F line. */
5852            find_Z_for_writing( &sm, &zix, tag );
5853            tl_assert(sm);
5854            tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
5855            lineZ = &sm->linesZ[zix];
5856            lineZ->dict[0] = svNew;
5857            lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
5858            for (i = 0; i < N_LINE_ARANGE/4; i++)
5859               lineZ->ix2s[i] = 0; /* all refer to dict[0] */
5860            rcinc_LineZ(lineZ);
5861         }
5862         aligned_start += N_LINE_ARANGE;
5863         aligned_len -= N_LINE_ARANGE;
5864      }
5865      tl_assert(aligned_start == after_start);
5866      tl_assert(aligned_len == 0);
5867   }
5868}
5869
5870
5871/////////////////////////////////////////////////////////
5872//                                                     //
5873// Front-filtering accesses                            //
5874//                                                     //
5875/////////////////////////////////////////////////////////
5876
/* Counters for the front filter.  stats__f_ac is bumped once per
   call to any zsm_sapplyNN_f__msmc{read,write} function below;
   stats__f_sk is bumped each time the thread's filter says the access
   can be skipped. */
static UWord stats__f_ac = 0;
static UWord stats__f_sk = 0;

/* STATS__F_SHOW is normally empty.  Change the "#if 0" to "#if 1" to
   have it print both counters whenever the low 24 bits of
   stats__f_ac are all zero. */
#if 0
#  define STATS__F_SHOW \
     do { \
        if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
           VG_(printf)("filters: ac %lu sk %lu\n",   \
           stats__f_ac, stats__f_sk); \
     } while (0)
#else
#  define STATS__F_SHOW /* */
#endif
5890
5891void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
5892   stats__f_ac++;
5893   STATS__F_SHOW;
5894   if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
5895      stats__f_sk++;
5896      return;
5897   }
5898   zsm_sapply08__msmcwrite(thr, a);
5899}
5900
5901void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
5902   stats__f_ac++;
5903   STATS__F_SHOW;
5904   if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
5905      stats__f_sk++;
5906      return;
5907   }
5908   zsm_sapply16__msmcwrite(thr, a);
5909}
5910
5911void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
5912   stats__f_ac++;
5913   STATS__F_SHOW;
5914   if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
5915      stats__f_sk++;
5916      return;
5917   }
5918   zsm_sapply32__msmcwrite(thr, a);
5919}
5920
5921void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
5922   stats__f_ac++;
5923   STATS__F_SHOW;
5924   if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
5925      stats__f_sk++;
5926      return;
5927   }
5928   zsm_sapply64__msmcwrite(thr, a);
5929}
5930
5931void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
5932{
5933   /* fast track a couple of common cases */
5934   if (len == 4 && aligned32(a)) {
5935      zsm_sapply32_f__msmcwrite( thr, a );
5936      return;
5937   }
5938   if (len == 8 && aligned64(a)) {
5939      zsm_sapply64_f__msmcwrite( thr, a );
5940      return;
5941   }
5942
5943   /* be completely general (but as efficient as possible) */
5944   if (len == 0) return;
5945
5946   if (!aligned16(a) && len >= 1) {
5947      zsm_sapply08_f__msmcwrite( thr, a );
5948      a += 1;
5949      len -= 1;
5950      tl_assert(aligned16(a));
5951   }
5952   if (len == 0) return;
5953
5954   if (!aligned32(a) && len >= 2) {
5955      zsm_sapply16_f__msmcwrite( thr, a );
5956      a += 2;
5957      len -= 2;
5958      tl_assert(aligned32(a));
5959   }
5960   if (len == 0) return;
5961
5962   if (!aligned64(a) && len >= 4) {
5963      zsm_sapply32_f__msmcwrite( thr, a );
5964      a += 4;
5965      len -= 4;
5966      tl_assert(aligned64(a));
5967   }
5968   if (len == 0) return;
5969
5970   if (len >= 8) {
5971      tl_assert(aligned64(a));
5972      while (len >= 8) {
5973         zsm_sapply64_f__msmcwrite( thr, a );
5974         a += 8;
5975         len -= 8;
5976      }
5977      tl_assert(aligned64(a));
5978   }
5979   if (len == 0) return;
5980
5981   if (len >= 4)
5982      tl_assert(aligned32(a));
5983   if (len >= 4) {
5984      zsm_sapply32_f__msmcwrite( thr, a );
5985      a += 4;
5986      len -= 4;
5987   }
5988   if (len == 0) return;
5989
5990   if (len >= 2)
5991      tl_assert(aligned16(a));
5992   if (len >= 2) {
5993      zsm_sapply16_f__msmcwrite( thr, a );
5994      a += 2;
5995      len -= 2;
5996   }
5997   if (len == 0) return;
5998
5999   if (len >= 1) {
6000      zsm_sapply08_f__msmcwrite( thr, a );
6001      //a += 1;
6002      len -= 1;
6003   }
6004   tl_assert(len == 0);
6005}
6006
6007void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
6008   stats__f_ac++;
6009   STATS__F_SHOW;
6010   if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
6011      stats__f_sk++;
6012      return;
6013   }
6014   zsm_sapply08__msmcread(thr, a);
6015}
6016
6017void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
6018   stats__f_ac++;
6019   STATS__F_SHOW;
6020   if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
6021      stats__f_sk++;
6022      return;
6023   }
6024   zsm_sapply16__msmcread(thr, a);
6025}
6026
6027void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
6028   stats__f_ac++;
6029   STATS__F_SHOW;
6030   if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
6031      stats__f_sk++;
6032      return;
6033   }
6034   zsm_sapply32__msmcread(thr, a);
6035}
6036
6037void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
6038   stats__f_ac++;
6039   STATS__F_SHOW;
6040   if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
6041      stats__f_sk++;
6042      return;
6043   }
6044   zsm_sapply64__msmcread(thr, a);
6045}
6046
6047void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
6048{
6049   /* fast track a couple of common cases */
6050   if (len == 4 && aligned32(a)) {
6051      zsm_sapply32_f__msmcread( thr, a );
6052      return;
6053   }
6054   if (len == 8 && aligned64(a)) {
6055      zsm_sapply64_f__msmcread( thr, a );
6056      return;
6057   }
6058
6059   /* be completely general (but as efficient as possible) */
6060   if (len == 0) return;
6061
6062   if (!aligned16(a) && len >= 1) {
6063      zsm_sapply08_f__msmcread( thr, a );
6064      a += 1;
6065      len -= 1;
6066      tl_assert(aligned16(a));
6067   }
6068   if (len == 0) return;
6069
6070   if (!aligned32(a) && len >= 2) {
6071      zsm_sapply16_f__msmcread( thr, a );
6072      a += 2;
6073      len -= 2;
6074      tl_assert(aligned32(a));
6075   }
6076   if (len == 0) return;
6077
6078   if (!aligned64(a) && len >= 4) {
6079      zsm_sapply32_f__msmcread( thr, a );
6080      a += 4;
6081      len -= 4;
6082      tl_assert(aligned64(a));
6083   }
6084   if (len == 0) return;
6085
6086   if (len >= 8) {
6087      tl_assert(aligned64(a));
6088      while (len >= 8) {
6089         zsm_sapply64_f__msmcread( thr, a );
6090         a += 8;
6091         len -= 8;
6092      }
6093      tl_assert(aligned64(a));
6094   }
6095   if (len == 0) return;
6096
6097   if (len >= 4)
6098      tl_assert(aligned32(a));
6099   if (len >= 4) {
6100      zsm_sapply32_f__msmcread( thr, a );
6101      a += 4;
6102      len -= 4;
6103   }
6104   if (len == 0) return;
6105
6106   if (len >= 2)
6107      tl_assert(aligned16(a));
6108   if (len >= 2) {
6109      zsm_sapply16_f__msmcread( thr, a );
6110      a += 2;
6111      len -= 2;
6112   }
6113   if (len == 0) return;
6114
6115   if (len >= 1) {
6116      zsm_sapply08_f__msmcread( thr, a );
6117      //a += 1;
6118      len -= 1;
6119   }
6120   tl_assert(len == 0);
6121}
6122
6123void libhb_Thr_resumes ( Thr* thr )
6124{
6125   if (0) VG_(printf)("resume %p\n", thr);
6126   tl_assert(thr);
6127   tl_assert(!thr->llexit_done);
6128   Filter__clear(thr->filter, "libhb_Thr_resumes");
6129   /* A kludge, but .. if this thread doesn't have any marker stacks
6130      at all, get one right now.  This is easier than figuring out
6131      exactly when at thread startup we can and can't take a stack
6132      snapshot. */
6133   if (HG_(clo_history_level) == 1) {
6134      tl_assert(thr->local_Kws_n_stacks);
6135      if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
6136         note_local_Kw_n_stack_for(thr);
6137   }
6138}
6139
6140
6141/////////////////////////////////////////////////////////
6142//                                                     //
6143// Synchronisation objects                             //
6144//                                                     //
6145/////////////////////////////////////////////////////////
6146
/* Head of the doubly linked list of all the SOs.  SO__Alloc pushes
   each new SO onto the front of this list and SO__Dealloc unlinks it
   again; an empty list is represented by NULL. */
SO* admin_SO = NULL;
6149
6150static SO* SO__Alloc ( void )
6151{
6152   SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
6153   so->viR   = VtsID_INVALID;
6154   so->viW   = VtsID_INVALID;
6155   so->magic = SO_MAGIC;
6156   /* Add to double linked list */
6157   if (admin_SO) {
6158      tl_assert(admin_SO->admin_prev == NULL);
6159      admin_SO->admin_prev = so;
6160      so->admin_next = admin_SO;
6161   } else {
6162      so->admin_next = NULL;
6163   }
6164   so->admin_prev = NULL;
6165   admin_SO = so;
6166   /* */
6167   return so;
6168}
6169
6170static void SO__Dealloc ( SO* so )
6171{
6172   tl_assert(so);
6173   tl_assert(so->magic == SO_MAGIC);
6174   if (so->viR == VtsID_INVALID) {
6175      tl_assert(so->viW == VtsID_INVALID);
6176   } else {
6177      tl_assert(so->viW != VtsID_INVALID);
6178      VtsID__rcdec(so->viR);
6179      VtsID__rcdec(so->viW);
6180   }
6181   so->magic = 0;
6182   /* Del from double linked list */
6183   if (so->admin_prev)
6184      so->admin_prev->admin_next = so->admin_next;
6185   if (so->admin_next)
6186      so->admin_next->admin_prev = so->admin_prev;
6187   if (so == admin_SO)
6188      admin_SO = so->admin_next;
6189   /* */
6190   HG_(free)( so );
6191}
6192
6193
6194/////////////////////////////////////////////////////////
6195//                                                     //
6196// Top Level API                                       //
6197//                                                     //
6198/////////////////////////////////////////////////////////
6199
6200static void show_thread_state ( const HChar* str, Thr* t )
6201{
6202   if (1) return;
6203   if (t->viR == t->viW) {
6204      VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
6205      VtsID__pp( t->viR );
6206      VG_(printf)("%s","\n");
6207   } else {
6208      VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
6209      VtsID__pp( t->viR );
6210      VG_(printf)(" viW %u==", t->viW);
6211      VtsID__pp( t->viW );
6212      VG_(printf)("%s","\n");
6213   }
6214}
6215
6216
6217Thr* libhb_init (
6218        void        (*get_stacktrace)( Thr*, Addr*, UWord ),
6219        ExeContext* (*get_EC)( Thr* )
6220     )
6221{
6222   Thr*  thr;
6223   VtsID vi;
6224
6225   // We will have to have to store a large number of these,
6226   // so make sure they're the size we expect them to be.
6227   STATIC_ASSERT(sizeof(ScalarTS) == 8);
6228
6229   /* because first 1024 unusable */
6230   STATIC_ASSERT(SCALARTS_N_THRBITS >= 11);
6231   /* so as to fit in a UInt w/ 5 bits to spare (see defn of
6232      Thr_n_RCEC and TSW). */
6233   STATIC_ASSERT(SCALARTS_N_THRBITS <= 27);
6234
6235   /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
6236      (32-bit).  It's not correctness-critical, but there are a lot of
6237      them, so it's important from a space viewpoint.  Unfortunately
6238      we simply can't pack it into 2 words on a 32-bit target. */
6239   STATIC_ASSERT(   (sizeof(UWord) == 8 && sizeof(Thr_n_RCEC) == 16)
6240                 || (sizeof(UWord) == 4 && sizeof(Thr_n_RCEC) == 12));
6241   STATIC_ASSERT(sizeof(TSW) == sizeof(UInt));
6242
6243   /* Word sets really are 32 bits.  Even on a 64 bit target. */
6244   STATIC_ASSERT(sizeof(WordSetID) == 4);
6245   STATIC_ASSERT(sizeof(WordSet) == sizeof(WordSetID));
6246
6247   tl_assert(get_stacktrace);
6248   tl_assert(get_EC);
6249   main_get_stacktrace   = get_stacktrace;
6250   main_get_EC           = get_EC;
6251
6252   // No need to initialise hg_wordfm.
6253   // No need to initialise hg_wordset.
6254
6255   /* Allocated once and never deallocated.  Used as a temporary in
6256      VTS singleton, tick and join operations. */
6257   temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
6258   temp_max_sized_VTS->id = VtsID_INVALID;
6259   verydead_thread_tables_init();
6260   vts_set_init();
6261   vts_tab_init();
6262   event_map_init();
6263   VtsID__invalidate_caches();
6264
6265   // initialise shadow memory
6266   zsm_init( );
6267
6268   thr = Thr__new();
6269   vi  = VtsID__mk_Singleton( thr, 1 );
6270   thr->viR = vi;
6271   thr->viW = vi;
6272   VtsID__rcinc(thr->viR);
6273   VtsID__rcinc(thr->viW);
6274
6275   show_thread_state("  root", thr);
6276   return thr;
6277}
6278
6279
6280Thr* libhb_create ( Thr* parent )
6281{
6282   /* The child's VTSs are copies of the parent's VTSs, but ticked at
6283      the child's index.  Since the child's index is guaranteed
6284      unique, it has never been seen before, so the implicit value
6285      before the tick is zero and after that is one. */
6286   Thr* child = Thr__new();
6287
6288   child->viR = VtsID__tick( parent->viR, child );
6289   child->viW = VtsID__tick( parent->viW, child );
6290   Filter__clear(child->filter, "libhb_create(child)");
6291   VtsID__rcinc(child->viR);
6292   VtsID__rcinc(child->viW);
6293   /* We need to do note_local_Kw_n_stack_for( child ), but it's too
6294      early for that - it may not have a valid TId yet.  So, let
6295      libhb_Thr_resumes pick it up the first time the thread runs. */
6296
6297   tl_assert(VtsID__indexAt( child->viR, child ) == 1);
6298   tl_assert(VtsID__indexAt( child->viW, child ) == 1);
6299
6300   /* and the parent has to move along too */
6301   VtsID__rcdec(parent->viR);
6302   VtsID__rcdec(parent->viW);
6303   parent->viR = VtsID__tick( parent->viR, parent );
6304   parent->viW = VtsID__tick( parent->viW, parent );
6305   Filter__clear(parent->filter, "libhb_create(parent)");
6306   VtsID__rcinc(parent->viR);
6307   VtsID__rcinc(parent->viW);
6308   note_local_Kw_n_stack_for( parent );
6309
6310   show_thread_state(" child", child);
6311   show_thread_state("parent", parent);
6312
6313   return child;
6314}
6315
/* Shut down the library, and print stats (in fact that's _all_
   this is for).

   When show_stats is False this function does nothing at all.  When it
   is True, it prints a block of statistics bracketed by BEGIN/END
   marker lines, and along the way asserts a few consistency
   properties between the counters and the underlying containers. */
void libhb_shutdown ( Bool show_stats )
{
   if (show_stats) {
      VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");

      /* ---- shadow memory: SecMaps and their lines ---- */
      VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
                  stats__secmaps_allocd,
                  stats__secmap_ga_space_covered);
      VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
                  stats__secmap_linesZ_allocd,
                  stats__secmap_linesZ_bytes);
      VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)"
                  " (%'10lu used)\n",
                  VG_(sizePA) (LineF_pool_allocator),
                  VG_(sizePA) (LineF_pool_allocator) * sizeof(LineF),
                  shmem__SecMap_used_linesF());
      VG_(printf)(" secmaps: %'10lu in map (can be scanGCed %'5lu)"
                  " #%lu scanGC \n",
                  stats__secmaps_in_map_shmem,
                  shmem__SecMap_do_GC(False /* really do GC */),
                  stats__secmaps_scanGC);
      /* The counter must agree with the actual size of map_shmem. */
      tl_assert (VG_(sizeFM) (map_shmem) == stats__secmaps_in_map_shmem);
      VG_(printf)(" secmaps: %'10lu in freelist,"
                  " total (scanGCed %'lu, ssetGCed %'lu)\n",
                  SecMap_freelist_length(),
                  stats__secmaps_scanGCed,
                  stats__secmaps_ssetGCed);
      VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
                  stats__secmaps_search, stats__secmaps_search_slow);

      /* ---- cache counters ---- */
      VG_(printf)("%s","\n");
      VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
                  stats__cache_totrefs, stats__cache_totmisses );
      VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
                  stats__cache_Z_fetches, stats__cache_F_fetches );
      VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
                  stats__cache_Z_wbacks, stats__cache_F_wbacks );
      VG_(printf)("   cache: %'14lu flushes_invals\n",
                  stats__cache_flushes_invals );
      VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
                  stats__cache_make_New_arange,
                  stats__cache_make_New_inZrep);

      /* ---- cache line counters, by access size ---- */
      VG_(printf)("%s","\n");
      VG_(printf)("   cline: %'10lu normalises\n",
                  stats__cline_normalises );
      VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_cread64s,
                  stats__cline_cread32s,
                  stats__cline_cread16s,
                  stats__cline_cread08s );
      VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_cwrite64s,
                  stats__cline_cwrite32s,
                  stats__cline_cwrite16s,
                  stats__cline_cwrite08s );
      VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_swrite64s,
                  stats__cline_swrite32s,
                  stats__cline_swrite16s,
                  stats__cline_swrite08s );
      VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
                  stats__cline_sread08s, stats__cline_scopy08s );
      VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu"
                  "    2to1 %'12lu\n",
                  stats__cline_64to32splits, stats__cline_32to16splits,
                  stats__cline_16to8splits );
      VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu"
                  "    2to1 %'12lu\n",
                  stats__cline_64to32pulldown, stats__cline_32to16pulldown,
                  stats__cline_16to8pulldown );
      /* Disabled: the 'if (0)' guards only the single printf below. */
      if (0)
      VG_(printf)("   cline: sizeof(CacheLineZ) %ld,"
                  " covers %ld bytes of arange\n",
                  (Word)sizeof(LineZ),
                  (Word)N_LINE_ARANGE);

      VG_(printf)("%s","\n");

      /* ---- msm read/write counts and cmpLEQ/join2 query counts ---- */
      VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
                  stats__msmcread, stats__msmcread_change);
      VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
                  stats__msmcwrite, stats__msmcwrite_change);
      VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
                  stats__cmpLEQ_queries, stats__cmpLEQ_misses);
      VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
                  stats__join2_queries, stats__join2_misses);

      /* ---- VTS operation counts ---- */
      VG_(printf)("%s","\n");
      VG_(printf)("   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
                  stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
      VG_(printf)("   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
                  stats__vts__cmp_structural, stats__vts__cmp_structural_slow);
      VG_(printf)("   libhb: VTSset: find__or__clone_and_add %'lu"
                  " (%'lu allocd)\n",
                   stats__vts_set__focaa, stats__vts_set__focaa_a );
      VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
                   stats__vts__indexat_slow );

      /* ---- sizes of vts_tab and vts_set ---- */
      VG_(printf)("%s","\n");
      VG_(printf)(
         "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
         VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
      );
      VG_(printf)("   libhb: #%lu vts_tab GC    #%lu vts pruning\n",
                  stats__vts_tab_GC, stats__vts_pruning);
      VG_(printf)( "   libhb: %lu entries in vts_set\n",
                   VG_(sizeFM)( vts_set ) );

      /* ---- thread census: walk the list returned by
         get_admin_threads() and put each thread's hbthr into exactly
         one of four classes, according to its llexit_done and
         joinedwith_done flags ---- */
      VG_(printf)("%s","\n");
      {
         UInt live = 0;
         UInt llexit_done = 0;
         UInt joinedwith_done = 0;
         UInt llexit_and_joinedwith_done = 0;

         Thread* hgthread = get_admin_threads();
         tl_assert(hgthread);
         while (hgthread) {
            Thr* hbthr = hgthread->hbthr;
            tl_assert(hbthr);
            if (hbthr->llexit_done && hbthr->joinedwith_done)
               llexit_and_joinedwith_done++;
            else if (hbthr->llexit_done)
               llexit_done++;
            else if (hbthr->joinedwith_done)
               joinedwith_done++;
            else
               live++;
            hgthread = hgthread->admin;
         }
         VG_(printf)("   libhb: threads live: %u exit_and_joinedwith %u"
                     " exit %u joinedwith %u\n",
                     live, llexit_and_joinedwith_done,
                     llexit_done, joinedwith_done);
         VG_(printf)("   libhb: %d verydead_threads, "
                     "%d verydead_threads_not_pruned\n",
                     (int) VG_(sizeXA)( verydead_thread_table),
                     (int) VG_(sizeXA)( verydead_thread_table_not_pruned));
         /* Every thread that has both exited and been joined with must
            be in exactly one of the two verydead tables. */
         tl_assert (VG_(sizeXA)( verydead_thread_table)
                    + VG_(sizeXA)( verydead_thread_table_not_pruned)
                    == llexit_and_joinedwith_done);
      }

      /* ---- oldref hash table and context table ---- */
      VG_(printf)("%s","\n");
      VG_(printf)( "   libhb: oldrefHTN %lu (%'d bytes)\n",
                   oldrefHTN, (int)(oldrefHTN * sizeof(OldRef)));
      /* The counter must agree with the node count of oldrefHT. */
      tl_assert (oldrefHTN == VG_(HT_count_nodes) (oldrefHT));
      VG_(printf)( "   libhb: oldref lookup found=%lu notfound=%lu\n",
                   stats__evm__lookup_found, stats__evm__lookup_notfound);
      if (VG_(clo_verbosity) > 1)
         VG_(HT_print_stats) (oldrefHT, cmp_oldref_tsw);
      VG_(printf)( "   libhb: oldref bind tsw/rcec "
                   "==/==:%'lu ==/!=:%'lu !=/!=:%'lu\n",
                   stats__ctxt_eq_tsw_eq_rcec, stats__ctxt_eq_tsw_neq_rcec,
                   stats__ctxt_neq_tsw_neq_rcec);
      VG_(printf)( "   libhb: ctxt__rcdec calls %'lu. rcec gc discards %'lu\n",
                   stats__ctxt_rcdec_calls, stats__ctxt_rcec_gc_discards);
      VG_(printf)( "   libhb: contextTab: %lu slots,"
                   " %lu cur ents(ref'd %lu),"
                   " %lu max ents\n",
                   (UWord)N_RCEC_TAB,
                   stats__ctxt_tab_curr, RCEC_referenced,
                   stats__ctxt_tab_max );
      /* Histogram of contextTab chain lengths.  chains[n] counts the
         slots whose chain has exactly n entries, for n < MAXCHAIN;
         non0chain counts the slots with a non-empty chain and is the
         divisor for the average (1 is used instead if it is zero). */
      {
#        define  MAXCHAIN 10
         UInt chains[MAXCHAIN+1]; // [MAXCHAIN] gets all chains >= MAXCHAIN
         UInt non0chain = 0;
         UInt n;
         UInt i;
         RCEC *p;

         for (i = 0; i <= MAXCHAIN; i++) chains[i] = 0;
         for (i = 0; i < N_RCEC_TAB; i++) {
            n = 0;
            for (p = contextTab[i]; p; p = p->next)
               n++;
            if (n < MAXCHAIN)
               chains[n]++;
            else
               chains[MAXCHAIN]++;
            if (n > 0)
               non0chain++;
         }
         VG_(printf)( "   libhb: contextTab chain of [length]=nchain."
                      " Avg chain len %3.1f\n"
                      "        ",
                      (Double)stats__ctxt_tab_curr
                      / (Double)(non0chain ? non0chain : 1));
         /* Only non-empty buckets are shown; the last bucket is
            printed with a trailing "+". */
         for (i = 0; i <= MAXCHAIN; i++) {
            if (chains[i] != 0)
                VG_(printf)( "[%u%s]=%u ",
                             i, i == MAXCHAIN ? "+" : "",
                             chains[i]);
         }
         VG_(printf)( "\n");
#        undef MAXCHAIN
      }
      VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
                   stats__ctxt_tab_qs,
                   stats__ctxt_tab_cmps );
      /* Compiled out: structure-size dump, kept for debugging. */
#if 0
      VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
      VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
      VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
      VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
      VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
      VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
      VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
      VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
      VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
      VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
      VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
      VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
      VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
      VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));

      VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
      VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
      VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
      VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
#endif

      VG_(printf)("%s","<<< END libhb stats >>>\n");
      VG_(printf)("%s","\n");

   }
}
6545
6546/* Receive notification that a thread has low level exited.  The
6547   significance here is that we do not expect to see any more memory
6548   references from it. */
6549void libhb_async_exit ( Thr* thr )
6550{
6551   tl_assert(thr);
6552   tl_assert(!thr->llexit_done);
6553   thr->llexit_done = True;
6554
6555   /* free up Filter and local_Kws_n_stacks (well, actually not the
6556      latter ..) */
6557   tl_assert(thr->filter);
6558   HG_(free)(thr->filter);
6559   thr->filter = NULL;
6560
6561   /* Tell the VTS mechanism this thread has exited, so it can
6562      participate in VTS pruning.  Note this can only happen if the
6563      thread has both ll_exited and has been joined with. */
6564   if (thr->joinedwith_done)
6565      VTS__declare_thread_very_dead(thr);
6566
6567   /* Another space-accuracy tradeoff.  Do we want to be able to show
6568      H1 history for conflicts in threads which have since exited?  If
6569      yes, then we better not free up thr->local_Kws_n_stacks.  The
6570      downside is a potential per-thread leak of up to
6571      N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
6572      XArray average overcommit factor is (1.5 I'd guess). */
6573   // hence:
6574   // VG_(deleteXA)(thr->local_Kws_n_stacks);
6575   // thr->local_Kws_n_stacks = NULL;
6576}
6577
6578/* Receive notification that a thread has been joined with.  The
6579   significance here is that we do not expect to see any further
6580   references to its vector clocks (Thr::viR and Thr::viW). */
6581void libhb_joinedwith_done ( Thr* thr )
6582{
6583   tl_assert(thr);
6584   /* Caller must ensure that this is only ever called once per Thr. */
6585   tl_assert(!thr->joinedwith_done);
6586   thr->joinedwith_done = True;
6587   if (thr->llexit_done)
6588      VTS__declare_thread_very_dead(thr);
6589}
6590
6591
6592/* Both Segs and SOs point to VTSs.  However, there is no sharing, so
6593   a Seg that points at a VTS is its one-and-only owner, and ditto for
6594   a SO that points at a VTS. */
6595
6596SO* libhb_so_alloc ( void )
6597{
6598   return SO__Alloc();
6599}
6600
6601void libhb_so_dealloc ( SO* so )
6602{
6603   tl_assert(so);
6604   tl_assert(so->magic == SO_MAGIC);
6605   SO__Dealloc(so);
6606}
6607
6608/* See comments in libhb.h for details on the meaning of
6609   strong vs weak sends and strong vs weak receives. */
6610void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
6611{
6612   /* Copy the VTSs from 'thr' into the sync object, and then move
6613      the thread along one step. */
6614
6615   tl_assert(so);
6616   tl_assert(so->magic == SO_MAGIC);
6617
6618   /* stay sane .. a thread's read-clock must always lead or be the
6619      same as its write-clock */
6620   { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
6621     tl_assert(leq);
6622   }
6623
6624   /* since we're overwriting the VtsIDs in the SO, we need to drop
6625      any references made by the previous contents thereof */
6626   if (so->viR == VtsID_INVALID) {
6627      tl_assert(so->viW == VtsID_INVALID);
6628      so->viR = thr->viR;
6629      so->viW = thr->viW;
6630      VtsID__rcinc(so->viR);
6631      VtsID__rcinc(so->viW);
6632   } else {
6633      /* In a strong send, we dump any previous VC in the SO and
6634         install the sending thread's VC instead.  For a weak send we
6635         must join2 with what's already there. */
6636      tl_assert(so->viW != VtsID_INVALID);
6637      VtsID__rcdec(so->viR);
6638      VtsID__rcdec(so->viW);
6639      so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
6640      so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
6641      VtsID__rcinc(so->viR);
6642      VtsID__rcinc(so->viW);
6643   }
6644
6645   /* move both parent clocks along */
6646   VtsID__rcdec(thr->viR);
6647   VtsID__rcdec(thr->viW);
6648   thr->viR = VtsID__tick( thr->viR, thr );
6649   thr->viW = VtsID__tick( thr->viW, thr );
6650   if (!thr->llexit_done) {
6651      Filter__clear(thr->filter, "libhb_so_send");
6652      note_local_Kw_n_stack_for(thr);
6653   }
6654   VtsID__rcinc(thr->viR);
6655   VtsID__rcinc(thr->viW);
6656
6657   if (strong_send)
6658      show_thread_state("s-send", thr);
6659   else
6660      show_thread_state("w-send", thr);
6661}
6662
6663void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
6664{
6665   tl_assert(so);
6666   tl_assert(so->magic == SO_MAGIC);
6667
6668   if (so->viR != VtsID_INVALID) {
6669      tl_assert(so->viW != VtsID_INVALID);
6670
6671      /* Weak receive (basically, an R-acquisition of a R-W lock).
6672         This advances the read-clock of the receiver, but not the
6673         write-clock. */
6674      VtsID__rcdec(thr->viR);
6675      thr->viR = VtsID__join2( thr->viR, so->viR );
6676      VtsID__rcinc(thr->viR);
6677
6678      /* At one point (r10589) it seemed safest to tick the clocks for
6679         the receiving thread after the join.  But on reflection, I
6680         wonder if that might cause it to 'overtake' constraints,
6681         which could lead to missing races.  So, back out that part of
6682         r10589. */
6683      //VtsID__rcdec(thr->viR);
6684      //thr->viR = VtsID__tick( thr->viR, thr );
6685      //VtsID__rcinc(thr->viR);
6686
6687      /* For a strong receive, we also advance the receiver's write
6688         clock, which means the receive as a whole is essentially
6689         equivalent to a W-acquisition of a R-W lock. */
6690      if (strong_recv) {
6691         VtsID__rcdec(thr->viW);
6692         thr->viW = VtsID__join2( thr->viW, so->viW );
6693         VtsID__rcinc(thr->viW);
6694
6695         /* See comment just above, re r10589. */
6696         //VtsID__rcdec(thr->viW);
6697         //thr->viW = VtsID__tick( thr->viW, thr );
6698         //VtsID__rcinc(thr->viW);
6699      }
6700
6701      if (thr->filter)
6702         Filter__clear(thr->filter, "libhb_so_recv");
6703      note_local_Kw_n_stack_for(thr);
6704
6705      if (strong_recv)
6706         show_thread_state("s-recv", thr);
6707      else
6708         show_thread_state("w-recv", thr);
6709
6710   } else {
6711      tl_assert(so->viW == VtsID_INVALID);
6712      /* Deal with degenerate case: 'so' has no vts, so there has been
6713         no message posted to it.  Just ignore this case. */
6714      show_thread_state("d-recv", thr);
6715   }
6716}
6717
6718Bool libhb_so_everSent ( SO* so )
6719{
6720   if (so->viR == VtsID_INVALID) {
6721      tl_assert(so->viW == VtsID_INVALID);
6722      return False;
6723   } else {
6724      tl_assert(so->viW != VtsID_INVALID);
6725      return True;
6726   }
6727}
6728
6729#define XXX1 0 // 0x67a106c
6730#define XXX2 0
6731
6732static inline Bool TRACEME(Addr a, SizeT szB) {
6733   if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
6734   if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
6735   return False;
6736}
6737static void trace ( Thr* thr, Addr a, SizeT szB, const HChar* s )
6738{
6739  SVal sv = zsm_sread08(a);
6740  VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
6741  show_thread_state("", thr);
6742  VG_(printf)("%s","\n");
6743}
6744
6745void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
6746{
6747   SVal sv = SVal__mkC(thr->viW, thr->viW);
6748   tl_assert(is_sane_SVal_C(sv));
6749   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
6750   zsm_sset_range( a, szB, sv );
6751   Filter__clear_range( thr->filter, a, szB );
6752   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
6753}
6754
6755void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
6756{
6757   /* do nothing */
6758}
6759
6760
6761/* Set the lines zix_start till zix_end to NOACCESS. */
6762static void zsm_secmap_line_range_noaccess (SecMap *sm,
6763                                            UInt zix_start, UInt zix_end)
6764{
6765   for (UInt lz = zix_start; lz <= zix_end; lz++) {
6766      LineZ* lineZ;
6767      lineZ = &sm->linesZ[lz];
6768      if (lineZ->dict[0] != SVal_INVALID) {
6769         rcdec_LineZ(lineZ);
6770         lineZ->dict[0] = SVal_NOACCESS;
6771         lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
6772      } else {
6773         clear_LineF_of_Z(lineZ);
6774      }
6775      for (UInt i = 0; i < N_LINE_ARANGE/4; i++)
6776         lineZ->ix2s[i] = 0; /* all refer to dict[0] */
6777   }
6778}
6779
6780/* Set the given range to SVal_NOACCESS in-place in the secmap.
6781   a must be cacheline aligned. len must be a multiple of a cacheline
6782   and must be < N_SECMAP_ARANGE. */
6783static void zsm_sset_range_noaccess_in_secmap(Addr a, SizeT len)
6784{
6785   tl_assert (is_valid_scache_tag (a));
6786   tl_assert (0 == (len & (N_LINE_ARANGE - 1)));
6787   tl_assert (len < N_SECMAP_ARANGE);
6788
6789   SecMap *sm1 = shmem__find_SecMap (a);
6790   SecMap *sm2 = shmem__find_SecMap (a + len - 1);
6791   UWord zix_start = shmem__get_SecMap_offset(a          ) >> N_LINE_BITS;
6792   UWord zix_end   = shmem__get_SecMap_offset(a + len - 1) >> N_LINE_BITS;
6793
6794   if (sm1) {
6795      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm1));
6796      zsm_secmap_line_range_noaccess (sm1, zix_start,
6797                                      sm1 == sm2 ? zix_end : N_SECMAP_ZLINES-1);
6798   }
6799   if (sm2 && sm1 != sm2) {
6800      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm2));
6801      zsm_secmap_line_range_noaccess (sm2, 0, zix_end);
6802   }
6803}
6804
6805/* Set the given address range to SVal_NOACCESS.
6806   The SecMaps fully set to SVal_NOACCESS will be pushed in SecMap_freelist. */
6807static void zsm_sset_range_noaccess (Addr addr, SizeT len)
6808{
6809   /*
6810       BPC = Before, Partial Cacheline, = addr
6811             (i.e. starting inside a cacheline/inside a SecMap)
6812       BFC = Before, Full Cacheline(s), but not full SecMap
6813             (i.e. starting inside a SecMap)
6814       FSM = Full SecMap(s)
6815             (i.e. starting a SecMap)
6816       AFC = After, Full Cacheline(s), but not full SecMap
6817             (i.e. first address after the full SecMap(s))
6818       APC = After, Partial Cacheline, i.e. first address after the
6819             full CacheLines).
6820       ARE = After Range End = addr+len = first address not part of the range.
6821
6822       If addr     starts a Cacheline, then BPC == BFC.
6823       If addr     starts a SecMap,    then BPC == BFC == FSM.
6824       If addr+len starts a SecMap,    then APC == ARE == AFC
6825       If addr+len starts a Cacheline, then APC == ARE
6826   */
6827   Addr ARE = addr + len;
6828   Addr BPC = addr;
6829   Addr BFC = ROUNDUP(BPC, N_LINE_ARANGE);
6830   Addr FSM = ROUNDUP(BPC, N_SECMAP_ARANGE);
6831   Addr AFC = ROUNDDN(ARE, N_SECMAP_ARANGE);
6832   Addr APC = ROUNDDN(ARE, N_LINE_ARANGE);
6833   SizeT Plen = len; // Plen will be split between the following:
6834   SizeT BPClen;
6835   SizeT BFClen;
6836   SizeT FSMlen;
6837   SizeT AFClen;
6838   SizeT APClen;
6839
6840   /* Consumes from Plen the nr of bytes between from and to.
6841      from and to must be aligned on a multiple of round.
6842      The length consumed will be a multiple of round, with
6843      a maximum of Plen. */
6844#  define PlenCONSUME(from, to, round, consumed) \
6845   do {                                          \
6846   if (from < to) {                              \
6847      if (to - from < Plen)                      \
6848         consumed = to - from;                   \
6849      else                                       \
6850         consumed = ROUNDDN(Plen, round);        \
6851   } else {                                      \
6852      consumed = 0;                              \
6853   }                                             \
6854   Plen -= consumed; } while (0)
6855
6856   PlenCONSUME(BPC, BFC, 1,               BPClen);
6857   PlenCONSUME(BFC, FSM, N_LINE_ARANGE,   BFClen);
6858   PlenCONSUME(FSM, AFC, N_SECMAP_ARANGE, FSMlen);
6859   PlenCONSUME(AFC, APC, N_LINE_ARANGE,   AFClen);
6860   PlenCONSUME(APC, ARE, 1,               APClen);
6861
6862   if (0)
6863      VG_(printf) ("addr %p[%lu] ARE %p"
6864                   " BPC %p[%lu] BFC %p[%lu] FSM %p[%lu]"
6865                   " AFC %p[%lu] APC %p[%lu]\n",
6866                   (void*)addr, len, (void*)ARE,
6867                   (void*)BPC, BPClen, (void*)BFC, BFClen, (void*)FSM, FSMlen,
6868                   (void*)AFC, AFClen, (void*)APC, APClen);
6869
6870   tl_assert (Plen == 0);
6871
6872   /* Set to NOACCESS pieces before and after not covered by entire SecMaps. */
6873
6874   /* First we set the partial cachelines. This is done through the cache. */
6875   if (BPClen > 0)
6876      zsm_sset_range_SMALL (BPC, BPClen, SVal_NOACCESS);
6877   if (APClen > 0)
6878      zsm_sset_range_SMALL (APC, APClen, SVal_NOACCESS);
6879
6880   /* After this, we will not use the cache anymore. We will directly work
6881      in-place on the z shadow memory in SecMap(s).
6882      So, we invalidate the cachelines for the whole range we are setting
6883      to NOACCESS below. */
6884   shmem__invalidate_scache_range (BFC, APC - BFC);
6885
6886   if (BFClen > 0)
6887      zsm_sset_range_noaccess_in_secmap (BFC, BFClen);
6888   if (AFClen > 0)
6889      zsm_sset_range_noaccess_in_secmap (AFC, AFClen);
6890
6891   if (FSMlen > 0) {
6892      /* Set to NOACCESS all the SecMaps, pushing the SecMaps to the
6893         free list. */
6894      Addr  sm_start = FSM;
6895      while (sm_start < AFC) {
6896         SecMap *sm = shmem__find_SecMap (sm_start);
6897         if (sm) {
6898            Addr gaKey;
6899            SecMap *fm_sm;
6900
6901            if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
6902            for (UInt lz = 0; lz < N_SECMAP_ZLINES; lz++) {
6903               LineZ *lineZ = &sm->linesZ[lz];
6904               if (LIKELY(lineZ->dict[0] != SVal_INVALID))
6905                  rcdec_LineZ(lineZ);
6906               else
6907                  clear_LineF_of_Z(lineZ);
6908            }
6909            if (!VG_(delFromFM)(map_shmem, &gaKey, (UWord*)&fm_sm, sm_start))
6910               tl_assert (0);
6911            stats__secmaps_in_map_shmem--;
6912            tl_assert (gaKey == sm_start);
6913            tl_assert (sm == fm_sm);
6914            stats__secmaps_ssetGCed++;
6915            push_SecMap_on_freelist (sm);
6916         }
6917         sm_start += N_SECMAP_ARANGE;
6918      }
6919      tl_assert (sm_start == AFC);
6920
6921      /* The above loop might have kept copies of freed SecMap in the smCache.
6922         => clear them. */
6923      if (address_in_range(smCache[0].gaKey, FSM, FSMlen)) {
6924         smCache[0].gaKey = 1;
6925         smCache[0].sm = NULL;
6926      }
6927      if (address_in_range(smCache[1].gaKey, FSM, FSMlen)) {
6928         smCache[1].gaKey = 1;
6929         smCache[1].sm = NULL;
6930      }
6931      if (address_in_range(smCache[2].gaKey, FSM, FSMlen)) {
6932         smCache[2].gaKey = 1;
6933         smCache[2].sm = NULL;
6934      }
6935      STATIC_ASSERT (3 == sizeof(smCache)/sizeof(SMCacheEnt));
6936   }
6937}
6938
6939void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
6940{
6941   /* This really does put the requested range in NoAccess.  It's
6942      expensive though. */
6943   SVal sv = SVal_NOACCESS;
6944   tl_assert(is_sane_SVal_C(sv));
6945   if (LIKELY(szB < 2 * N_LINE_ARANGE))
6946      zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
6947   else
6948      zsm_sset_range_noaccess (a, szB);
6949   Filter__clear_range( thr->filter, a, szB );
6950}
6951
6952/* Works byte at a time. Can be optimised if needed. */
6953UWord libhb_srange_get_abits (Addr a, UChar *abits, SizeT len)
6954{
6955   UWord anr = 0; // nr of bytes addressable.
6956
6957   /* Get the accessibility of each byte. Pay attention to not
6958      create SecMap or LineZ when checking if a byte is addressable.
6959
6960      Note: this is used for client request. Performance deemed not critical.
6961      So for simplicity, we work byte per byte.
6962      Performance could be improved  by working with full cachelines
6963      or with full SecMap, when reaching a cacheline or secmap boundary. */
6964   for (SizeT i = 0; i < len; i++) {
6965      SVal       sv = SVal_INVALID;
6966      Addr       b = a + i;
6967      Addr       tag = b & ~(N_LINE_ARANGE - 1);
6968      UWord      wix = (b >> N_LINE_BITS) & (N_WAY_NENT - 1);
6969      UWord      cloff = get_cacheline_offset(b);
6970
6971      /* Note: we do not use get_cacheline(b) to avoid creating cachelines
6972         and/or SecMap for non addressable bytes. */
6973      if (tag == cache_shmem.tags0[wix]) {
6974         CacheLine copy = cache_shmem.lyns0[wix];
6975         /* We work on a copy of the cacheline, as we do not want to
6976            record the client request as a real read.
6977            The below is somewhat similar to zsm_sapply08__msmcread but
6978            avoids side effects on the cache. */
6979         UWord toff = get_tree_offset(b); /* == 0 .. 7 */
6980         UWord tno  = get_treeno(b);
6981         UShort descr = copy.descrs[tno];
6982         if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
6983            SVal* tree = &copy.svals[tno << 3];
6984            copy.descrs[tno] = pulldown_to_8(tree, toff, descr);
6985         }
6986         sv = copy.svals[cloff];
6987      } else {
6988         /* Byte not found in the cacheline. Search for a SecMap. */
6989         SecMap *sm = shmem__find_SecMap(b);
6990         LineZ *lineZ;
6991         if (sm == NULL)
6992            sv = SVal_NOACCESS;
6993         else {
6994            UWord zix = shmem__get_SecMap_offset(b) >> N_LINE_BITS;
6995            lineZ = &sm->linesZ[zix];
6996            if (lineZ->dict[0] == SVal_INVALID) {
6997               LineF *lineF = SVal2Ptr(lineZ->dict[1]);
6998               sv = lineF->w64s[cloff];
6999            } else {
7000               UWord ix = read_twobit_array( lineZ->ix2s, cloff );
7001               sv = lineZ->dict[ix];
7002            }
7003         }
7004      }
7005
7006      tl_assert (sv != SVal_INVALID);
7007      if (sv == SVal_NOACCESS) {
7008         if (abits)
7009            abits[i] = 0x00;
7010      } else {
7011         if (abits)
7012            abits[i] = 0xff;
7013         anr++;
7014      }
7015   }
7016
7017   return anr;
7018}
7019
7020
7021void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
7022{
7023   SVal sv = SVal_NOACCESS;
7024   tl_assert(is_sane_SVal_C(sv));
7025   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
7026   if (LIKELY(szB < 2 * N_LINE_ARANGE))
7027      zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
7028   else
7029      zsm_sset_range_noaccess (a, szB);
7030   Filter__clear_range( thr->filter, a, szB );
7031   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
7032}
7033
7034Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
7035   tl_assert(thr);
7036   return thr->hgthread;
7037}
7038
7039void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
7040   tl_assert(thr);
7041   thr->hgthread = hgthread;
7042}
7043
7044void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
7045{
7046   zsm_scopy_range(src, dst, len);
7047   Filter__clear_range( thr->filter, dst, len );
7048}
7049
7050void libhb_maybe_GC ( void )
7051{
7052   /* GC the unreferenced (zero rc) RCECs when
7053         (1) reaching a significant nr of RCECs (to avoid scanning a contextTab
7054             with mostly NULL ptr)
7055     and (2) approaching the max nr of RCEC (as we have in any case
7056             at least that amount of RCEC in the pool allocator)
7057             Note: the margin allows to avoid a small but constant increase
7058             of the max nr of RCEC due to the fact that libhb_maybe_GC is
7059             not called when the current nr of RCEC exactly reaches the max.
7060     and (3) the nr of referenced RCECs is less than 75% than total nr RCECs.
7061     Avoid growing too much the nr of RCEC keeps the memory use low,
7062     and avoids to have too many elements in the (fixed) contextTab hashtable.
7063   */
7064   if (UNLIKELY(stats__ctxt_tab_curr > N_RCEC_TAB/2
7065                && stats__ctxt_tab_curr + 1000 >= stats__ctxt_tab_max
7066                && (stats__ctxt_tab_curr * 3)/4 > RCEC_referenced))
7067      do_RCEC_GC();
7068
7069   /* If there are still no entries available (all the table entries are full),
7070      and we hit the threshold point, then do a GC */
7071   Bool vts_tab_GC = vts_tab_freelist == VtsID_INVALID
7072      && VG_(sizeXA)( vts_tab ) >= vts_next_GC_at;
7073   if (UNLIKELY (vts_tab_GC))
7074      vts_tab__do_GC( False/*don't show stats*/ );
7075
7076   /* scan GC the SecMaps when
7077          (1) no SecMap in the freelist
7078      and (2) the current nr of live secmaps exceeds the threshold. */
7079   if (UNLIKELY(SecMap_freelist == NULL
7080                && stats__secmaps_in_map_shmem >= next_SecMap_GC_at)) {
7081      // If we did a vts tab GC, then no need to flush the cache again.
7082      if (!vts_tab_GC)
7083         zsm_flush_cache();
7084      shmem__SecMap_do_GC(True);
7085   }
7086
7087   /* Check the reference counts (expensive) */
7088   if (CHECK_CEM)
7089      event_map__check_reference_counts();
7090}
7091
7092
7093/////////////////////////////////////////////////////////////////
7094/////////////////////////////////////////////////////////////////
7095//                                                             //
7096// SECTION END main library                                    //
7097//                                                             //
7098/////////////////////////////////////////////////////////////////
7099/////////////////////////////////////////////////////////////////
7100
7101/*--------------------------------------------------------------------*/
7102/*--- end                                             libhb_main.c ---*/
7103/*--------------------------------------------------------------------*/
7104