1
2/*--------------------------------------------------------------------*/
3/*--- LibHB: a library for implementing and checking               ---*/
4/*--- the happens-before relationship in concurrent programs.      ---*/
5/*---                                                 libhb_main.c ---*/
6/*--------------------------------------------------------------------*/
7
8/*
9   This file is part of LibHB, a library for implementing and checking
10   the happens-before relationship in concurrent programs.
11
12   Copyright (C) 2008-2013 OpenWorks Ltd
13      info@open-works.co.uk
14
15   This program is free software; you can redistribute it and/or
16   modify it under the terms of the GNU General Public License as
17   published by the Free Software Foundation; either version 2 of the
18   License, or (at your option) any later version.
19
20   This program is distributed in the hope that it will be useful, but
21   WITHOUT ANY WARRANTY; without even the implied warranty of
22   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23   General Public License for more details.
24
25   You should have received a copy of the GNU General Public License
26   along with this program; if not, write to the Free Software
27   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28   02111-1307, USA.
29
30   The GNU General Public License is contained in the file COPYING.
31*/
32
33#include "pub_tool_basics.h"
34#include "pub_tool_poolalloc.h"
35#include "pub_tool_libcassert.h"
36#include "pub_tool_libcbase.h"
37#include "pub_tool_libcprint.h"
38#include "pub_tool_mallocfree.h"
39#include "pub_tool_wordfm.h"
40#include "pub_tool_sparsewa.h"
41#include "pub_tool_xarray.h"
42#include "pub_tool_oset.h"
43#include "pub_tool_threadstate.h"
44#include "pub_tool_aspacemgr.h"
45#include "pub_tool_execontext.h"
46#include "pub_tool_errormgr.h"
47#include "pub_tool_options.h"        // VG_(clo_stats)
48#include "hg_basics.h"
49#include "hg_wordset.h"
50#include "hg_lock_n_thread.h"
51#include "hg_errors.h"
52
53#include "libhb.h"
54
55
56/////////////////////////////////////////////////////////////////
57/////////////////////////////////////////////////////////////////
58//                                                             //
59// Debugging #defines                                          //
60//                                                             //
61/////////////////////////////////////////////////////////////////
62/////////////////////////////////////////////////////////////////
63
64/* Check the sanity of shadow values in the core memory state
65   machine.  Change #if 0 to #if 1 to enable this. */
66#if 0
67#  define CHECK_MSM 1
68#else
69#  define CHECK_MSM 0
70#endif
71
72
73/* Check sanity (reference counts, etc) in the conflicting access
74   machinery.  Change #if 0 to #if 1 to enable this. */
75#if 0
76#  define CHECK_CEM 1
77#else
78#  define CHECK_CEM 0
79#endif
80
81
82/* Check sanity in the compressed shadow memory machinery,
83   particularly in its caching innards.  Unfortunately there's no
84   almost-zero-cost way to make them selectable at run time.  Hence
85   set the #if 0 to #if 1 and rebuild if you want them. */
86#if 0
87#  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
88#  define inline __attribute__((noinline))
89   /* probably want to ditch -fomit-frame-pointer too */
90#else
91#  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
92#endif
93
94
95/////////////////////////////////////////////////////////////////
96/////////////////////////////////////////////////////////////////
97//                                                             //
98// data decls: VtsID                                           //
99//                                                             //
100/////////////////////////////////////////////////////////////////
101/////////////////////////////////////////////////////////////////
102
103/* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
104   bits, since they have to be packed into the lowest 30 bits of an
105   SVal. */
106typedef  UInt  VtsID;
107#define VtsID_INVALID 0xFFFFFFFF
108
109
110
111/////////////////////////////////////////////////////////////////
112/////////////////////////////////////////////////////////////////
113//                                                             //
114// data decls: SVal                                            //
115//                                                             //
116/////////////////////////////////////////////////////////////////
117/////////////////////////////////////////////////////////////////
118
119typedef  ULong  SVal;
120
121/* This value has special significance to the implementation, and callers
122   may not store it in the shadow memory. */
123#define SVal_INVALID (3ULL << 62)
124
125/* This is the default value for shadow memory.  Initially the shadow
126   memory contains no accessible areas and so all reads produce this
   value.  TODO: make this caller-definable. */
128#define SVal_NOACCESS (2ULL << 62)
129
130
131
132/////////////////////////////////////////////////////////////////
133/////////////////////////////////////////////////////////////////
134//                                                             //
135// data decls: ScalarTS                                        //
136//                                                             //
137/////////////////////////////////////////////////////////////////
138/////////////////////////////////////////////////////////////////
139
140/* Scalar Timestamp.  We have to store a lot of these, so there is
141   some effort to make them as small as possible.  Logically they are
142   a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
143   We pack it into 64 bits by representing the Thr* using a ThrID, a
   small integer (18 bits), and a 46-bit integer for the timestamp
   number.  The 46/18 split is arbitrary, but has the effect that
146   Helgrind can only handle programs that create 2^18 or fewer threads
147   over their entire lifetime, and have no more than 2^46 timestamp
148   ticks (synchronisation operations on the same thread).
149
150   This doesn't seem like much of a limitation.  2^46 ticks is
   7.04e+13, and if each tick (optimistically) takes the machine 1000
152   cycles to process, then the minimum time to process that many ticks
153   at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
154   but VTS ticks, which isn't realistic.
155
156   NB1: SCALARTS_N_THRBITS must be 29 or lower.  The obvious limit is
157   32 since a ThrID is a UInt.  29 comes from the fact that
158   'Thr_n_RCEC', which records information about old accesses, packs
159   not only a ThrID but also 2+1 other bits (access size and
160   writeness) in a UInt, hence limiting size to 32-(2+1) == 29.
161
162   NB2: thrid values are issued upwards from 1024, and values less
163   than that aren't valid.  This isn't per se necessary (any order
164   will do, so long as they are unique), but it does help ensure they
165   are less likely to get confused with the various other kinds of
166   small-integer thread ids drifting around (eg, TId).  See also NB5.
167
168   NB3: this probably also relies on the fact that Thr's are never
169   deallocated -- they exist forever.  Hence the 1-1 mapping from
170   Thr's to thrid values (set up in Thr__new) persists forever.
171
172   NB4: temp_max_sized_VTS is allocated at startup and never freed.
   It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
174   ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
175   making the memory use for this go sky-high.  With
176   SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
177   like an OK tradeoff.  If more than 256k threads need to be
178   supported, we could change SCALARTS_N_THRBITS to 20, which would
179   facilitate supporting 1 million threads at the cost of 8MB storage
180   for temp_max_sized_VTS.
181
182   NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
183   ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
184   must never be a valid ThrID.  Given NB2 that's OK.
185*/
186#define SCALARTS_N_THRBITS 18  /* valid range: 11 to 29 inclusive */
187
188#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
189typedef
190   struct {
191      ThrID thrid : SCALARTS_N_THRBITS;
192      ULong tym   : SCALARTS_N_TYMBITS;
193   }
194   ScalarTS;
195
196#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
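/* Purely illustrative sketch (not part of the library, kept out of the
   build with #if 0): how a (ThrID, timestamp) pair is packed into a
   single 64-bit ScalarTS, and which ranges are legal given the
   bitfield widths above.  The helper name mk_ScalarTS_example is
   hypothetical. */
#if 0
static ScalarTS mk_ScalarTS_example ( ThrID thrid, ULong tym )
{
   ScalarTS ts;
   /* thrids are issued upwards from 1024 (see NB2) and must fit in
      SCALARTS_N_THRBITS bits. */
   tl_assert(thrid >= 1024 && thrid <= ThrID_MAX_VALID);
   /* timestamps must fit in the remaining SCALARTS_N_TYMBITS bits. */
   tl_assert(tym < (1ULL << SCALARTS_N_TYMBITS));
   ts.thrid = thrid;
   ts.tym   = tym;
   return ts;
}
#endif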
197
198
199
200/////////////////////////////////////////////////////////////////
201/////////////////////////////////////////////////////////////////
202//                                                             //
203// data decls: Filter                                          //
204//                                                             //
205/////////////////////////////////////////////////////////////////
206/////////////////////////////////////////////////////////////////
207
208// baseline: 5, 9
209#define FI_LINE_SZB_LOG2  5
210#define FI_NUM_LINES_LOG2 10
211
212#define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
213#define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)
214
215#define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
216#define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)
217
218#define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
219                             & (Addr)(FI_NUM_LINES-1) )
220
221
/* In the lines, each group of 8 bytes is treated individually and is
   mapped to a UShort.  Regardless of the endianness of the underlying
   machine, bits 1 and 0 pertain to the lowest address and bits 15 and
   14 to the highest address.

   Of each bit pair, the higher numbered bit is set if an R (read) has
   been seen and the lower numbered bit if a W (write) has been seen,
   so the actual layout is:
229
230   15 14             ...  01 00
231
232   R  W  for addr+7  ...  R  W  for addr+0
233
234   So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
235*/
236
237/* tags are separated from lines.  tags are Addrs and are
238   the base address of the line. */
239typedef
240   struct {
241      UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
242   }
243   FiLine;
244
245typedef
246   struct {
247      Addr   tags[FI_NUM_LINES];
248      FiLine lines[FI_NUM_LINES];
249   }
250   Filter;
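/* Purely illustrative sketch (kept out of the build with #if 0): how an
   address maps onto the filter structures above.  FI_GET_LINENO picks
   the line, each UShort in the line covers 8 bytes, and within that
   UShort the byte at offset k owns bit 2*k+1 (R) and bit 2*k (W), as
   described in the layout comment.  A real lookup would first check
   fi->tags[lineno] == FI_GET_TAG(a); the helper name here is
   hypothetical. */
#if 0
static UShort fi_example_rw_bits_for_addr ( Filter* fi, Addr a, Bool isW )
{
   UWord   lineno = FI_GET_LINENO(a);
   FiLine* line   = &fi->lines[lineno];
   UWord   u16ix  = (a & (FI_LINE_SZB - 1)) >> 3; /* which UShort in the line */
   UWord   byteix = a & 7;                        /* which byte within that UShort */
   UShort  bitW   = (UShort)(1 << (2 * byteix));  /* W bit for this byte */
   UShort  bitR   = (UShort)(bitW << 1);          /* R bit for this byte */
   return line->u16s[u16ix] & (isW ? bitW : bitR);
}
#endif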
251
252
253
254/////////////////////////////////////////////////////////////////
255/////////////////////////////////////////////////////////////////
256//                                                             //
257// data decls: Thr, ULong_n_EC                                 //
258//                                                             //
259/////////////////////////////////////////////////////////////////
260/////////////////////////////////////////////////////////////////
261
262// Records stacks for H1 history mechanism (DRD-style)
263typedef
264   struct { ULong ull; ExeContext* ec; }
265   ULong_n_EC;
266
267
268/* How many of the above records to collect for each thread?  Older
269   ones are dumped when we run out of space.  62.5k requires 1MB per
270   thread, since each ULong_n_EC record is 16 bytes long.  When more
271   than N_KWs_N_STACKs_PER_THREAD are present, the older half are
272   deleted to make space.  Hence in the worst case we will be able to
273   produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
274   Kw transitions (segments in this thread).  For the current setting
275   that gives a guaranteed stack for at least the last 31.25k
276   segments. */
277#define N_KWs_N_STACKs_PER_THREAD 62500
278
279
280struct _Thr {
281   /* Current VTSs for this thread.  They change as we go along.  viR
282      is the VTS to be used for reads, viW for writes.  Usually they
283      are the same, but can differ when we deal with reader-writer
284      locks.  It is always the case that
         VtsID__cmpLEQ(viW,viR) == True;
      that is, viW must be the same as, or lagging behind, viR. */
287   VtsID viR;
288   VtsID viW;
289
290   /* Is initially False, and is set to True after the thread really
291      has done a low-level exit.  When True, we expect to never see
292      any more memory references done by this thread. */
293   Bool llexit_done;
294
295   /* Is initially False, and is set to True after the thread has been
296      joined with (reaped by some other thread).  After this point, we
297      do not expect to see any uses of .viR or .viW, so it is safe to
298      set them to VtsID_INVALID. */
299   Bool joinedwith_done;
300
301   /* A small integer giving a unique identity to this Thr.  See
302      comments on the definition of ScalarTS for details. */
303   ThrID thrid : SCALARTS_N_THRBITS;
304
305   /* A filter that removes references for which we believe that
306      msmcread/msmcwrite will not change the state, nor report a
307      race. */
308   Filter* filter;
309
310   /* A pointer back to the top level Thread structure.  There is a
311      1-1 mapping between Thread and Thr structures -- each Thr points
312      at its corresponding Thread, and vice versa.  Really, Thr and
313      Thread should be merged into a single structure. */
314   Thread* hgthread;
315
   /* The ULongs (scalar Kws) in this array accumulate in strictly
317      increasing order, without duplicates.  This is important because
318      we need to be able to find a given scalar Kw in this array
319      later, by binary search. */
320   XArray* /* ULong_n_EC */ local_Kws_n_stacks;
321};
322
323
324
325/////////////////////////////////////////////////////////////////
326/////////////////////////////////////////////////////////////////
327//                                                             //
328// data decls: SO                                              //
329//                                                             //
330/////////////////////////////////////////////////////////////////
331/////////////////////////////////////////////////////////////////
332
333// (UInt) `echo "Synchronisation object" | md5sum`
334#define SO_MAGIC 0x56b3c5b0U
335
336struct _SO {
337   struct _SO* admin_prev;
338   struct _SO* admin_next;
339   VtsID viR; /* r-clock of sender */
340   VtsID viW; /* w-clock of sender */
341   UInt  magic;
342};
343
344
345
346/////////////////////////////////////////////////////////////////
347/////////////////////////////////////////////////////////////////
348//                                                             //
349// Forward declarations                                        //
350//                                                             //
351/////////////////////////////////////////////////////////////////
352/////////////////////////////////////////////////////////////////
353
/* Globals needed by other parts of the library.  These are set
   once at startup and then never changed. */
357static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
358static ExeContext* (*main_get_EC)( Thr* ) = NULL;
359
360/* misc fn and data fwdses */
361static void VtsID__rcinc ( VtsID ii );
362static void VtsID__rcdec ( VtsID ii );
363
364static inline Bool SVal__isC ( SVal s );
365static inline VtsID SVal__unC_Rmin ( SVal s );
366static inline VtsID SVal__unC_Wmin ( SVal s );
367static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
368
/* A doubly-linked list of all the SOs. */
370SO* admin_SO;
371
372
373
374/////////////////////////////////////////////////////////////////
375/////////////////////////////////////////////////////////////////
376//                                                             //
377// SECTION BEGIN compressed shadow memory                      //
378//                                                             //
379/////////////////////////////////////////////////////////////////
380/////////////////////////////////////////////////////////////////
381
382#ifndef __HB_ZSM_H
383#define __HB_ZSM_H
384
385/* Initialise the library.  Once initialised, it will (or may) call
386   rcinc and rcdec in response to all the calls below, in order to
387   allow the user to do reference counting on the SVals stored herein.
388   It is important to understand, however, that due to internal
389   caching, the reference counts are in general inaccurate, and can be
390   both above or below the true reference count for an item.  In
391   particular, the library may indicate that the reference count for
392   an item is zero, when in fact it is not.
393
394   To make the reference counting exact and therefore non-pointless,
395   call zsm_flush_cache.  Immediately after it returns, the reference
396   counts for all items, as deduced by the caller by observing calls
397   to rcinc and rcdec, will be correct, and so any items with a zero
398   reference count may be freed (or at least considered to be
399   unreferenced by this library).
400*/
401static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );
402
403static void zsm_sset_range  ( Addr, SizeT, SVal );
404static void zsm_scopy_range ( Addr, Addr, SizeT );
405static void zsm_flush_cache ( void );
406
407#endif /* ! __HB_ZSM_H */
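/* Purely illustrative sketch (kept out of the build with #if 0) of the
   intended calling pattern for the interface above: the client hands
   rcinc/rcdec callbacks to zsm_init, and may trust the observed
   reference counts only once zsm_flush_cache has returned.  The names
   example_rcinc/example_rcdec/example_zsm_usage and the address used
   are hypothetical. */
#if 0
static void example_rcinc ( SVal s ) { /* bump refcounts owned by 's' */ }
static void example_rcdec ( SVal s ) { /* drop refcounts owned by 's' */ }

static void example_zsm_usage ( void )
{
   zsm_init( example_rcinc, example_rcdec );
   /* Mark a 4096-byte range as holding SVal_NOACCESS. */
   zsm_sset_range( (Addr)0x5000, 4096, SVal_NOACCESS );
   /* Only after this call are the counts observed via the callbacks exact. */
   zsm_flush_cache();
}
#endif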
408
409
/* Round a up to the next multiple of N.  N must be a power of 2 */
#define ROUNDUP(a, N)   (((a) + (N) - 1) & ~((N)-1))
/* Round a down to the nearest multiple of N at or below a.  N must be
   a power of 2 */
#define ROUNDDN(a, N)   ((a) & ~((N)-1))
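/* For example, with N == 64: ROUNDUP(0x1003, 64) == 0x1040 and
   ROUNDDN(0x1003, 64) == 0x1000. */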
414
415
416
417/* ------ User-supplied RC functions ------ */
418static void(*rcinc)(SVal) = NULL;
419static void(*rcdec)(SVal) = NULL;
420
421
422/* ------ CacheLine ------ */
423
424#define N_LINE_BITS      6 /* must be >= 3 */
425#define N_LINE_ARANGE    (1 << N_LINE_BITS)
426#define N_LINE_TREES     (N_LINE_ARANGE >> 3)
427
428typedef
429   struct {
430      UShort descrs[N_LINE_TREES];
431      SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
432   }
433   CacheLine;
434
435#define TREE_DESCR_16_0 (1<<0)
436#define TREE_DESCR_32_0 (1<<1)
437#define TREE_DESCR_16_1 (1<<2)
438#define TREE_DESCR_64   (1<<3)
439#define TREE_DESCR_16_2 (1<<4)
440#define TREE_DESCR_32_1 (1<<5)
441#define TREE_DESCR_16_3 (1<<6)
442#define TREE_DESCR_8_0  (1<<7)
443#define TREE_DESCR_8_1  (1<<8)
444#define TREE_DESCR_8_2  (1<<9)
445#define TREE_DESCR_8_3  (1<<10)
446#define TREE_DESCR_8_4  (1<<11)
447#define TREE_DESCR_8_5  (1<<12)
448#define TREE_DESCR_8_6  (1<<13)
449#define TREE_DESCR_8_7  (1<<14)
450#define TREE_DESCR_DTY  (1<<15)
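/* For example, a tree holding eight independent byte-sized SVals has
   descriptor (TREE_DESCR_8_7 | ... | TREE_DESCR_8_0), while a tree
   holding a single 64-bit SVal has descriptor TREE_DESCR_64; see
   normalise_tree and descr_to_validbits below. */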
451
452typedef
453   struct {
454      SVal  dict[4]; /* can represent up to 4 diff values in the line */
455      UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
456                                      dict indexes */
457      /* if dict[0] == SVal_INVALID then dict[1] is the index of the
458         LineF to use, and dict[2..] are also SVal_INVALID. */
459   }
460   LineZ; /* compressed rep for a cache line */
461
462typedef
463   struct {
464      Bool inUse;
465      SVal w64s[N_LINE_ARANGE];
466   }
467   LineF; /* full rep for a cache line */
468
469/* Shadow memory.
470   Primary map is a WordFM Addr SecMap*.
471   SecMaps cover some page-size-ish section of address space and hold
472     a compressed representation.
473   CacheLine-sized chunks of SecMaps are copied into a Cache, being
474   decompressed when moved into the cache and recompressed on the
475   way out.  Because of this, the cache must operate as a writeback
476   cache, not a writethrough one.
477
478   Each SecMap must hold a power-of-2 number of CacheLines.  Hence
   N_SECMAP_BITS must be >= N_LINE_BITS.
480*/
481#define N_SECMAP_BITS   13
482#define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
483
484// # CacheLines held by a SecMap
485#define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
486
487/* The data in the SecMap is held in the array of LineZs.  Each LineZ
488   either carries the required data directly, in a compressed
489   representation, or it holds (in .dict[0]) an index to the LineF in
490   .linesF that holds the full representation.
491
492   Currently-unused LineF's have their .inUse bit set to zero.
   Since each in-use LineF is referred to by exactly one LineZ,
494   the number of .linesZ[] that refer to .linesF should equal
495   the number of .linesF[] that have .inUse == True.
496
497   RC obligations: the RCs presented to the user include exactly
498   the values in:
499   * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
500   * F reps that are in use (.inUse == True)
501
502   Hence the following actions at the following transitions are required:
503
504   F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
505   F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
506   Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
507   Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
508*/
509typedef
510   struct {
511      UInt   magic;
512      LineZ  linesZ[N_SECMAP_ZLINES];
513      LineF* linesF;
514      UInt   linesF_size;
515   }
516   SecMap;
517
518#define SecMap_MAGIC   0x571e58cbU
519
520static inline Bool is_sane_SecMap ( SecMap* sm ) {
521   return sm != NULL && sm->magic == SecMap_MAGIC;
522}
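/* Purely illustrative sketch (kept out of the build with #if 0) of the
   Z/F consistency invariant described above: the number of LineZs that
   defer to a LineF should equal the number of in-use LineFs.  The
   helper name is hypothetical. */
#if 0
static Bool secmap_example_ZF_invariant_holds ( SecMap* sm )
{
   UWord i, nZrefsToF = 0, nFinUse = 0;
   for (i = 0; i < N_SECMAP_ZLINES; i++)
      if (sm->linesZ[i].dict[0] == SVal_INVALID)
         nZrefsToF++;
   for (i = 0; i < sm->linesF_size; i++)
      if (sm->linesF[i].inUse)
         nFinUse++;
   return nZrefsToF == nFinUse;
}
#endif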
523
524/* ------ Cache ------ */
525
526#define N_WAY_BITS 16
527#define N_WAY_NENT (1 << N_WAY_BITS)
528
529/* Each tag is the address of the associated CacheLine, rounded down
530   to a CacheLine address boundary.  A CacheLine size must be a power
531   of 2 and must be 8 or more.  Hence an easy way to initialise the
532   cache so it is empty is to set all the tag values to any value % 8
533   != 0, eg 1.  This means all queries in the cache initially miss.
   It does however require us to detect, and not write back, any line
   with a bogus tag. */
536typedef
537   struct {
538      CacheLine lyns0[N_WAY_NENT];
539      Addr      tags0[N_WAY_NENT];
540   }
541   Cache;
542
543static inline Bool is_valid_scache_tag ( Addr tag ) {
544   /* a valid tag should be naturally aligned to the start of
545      a CacheLine. */
546   return 0 == (tag & (N_LINE_ARANGE - 1));
547}
548
549
550/* --------- Primary data structures --------- */
551
552/* Shadow memory primary map */
553static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
554static Cache   cache_shmem;
555
556
557static UWord stats__secmaps_search       = 0; // # SM finds
558static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
559static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
560static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
561static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
562static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
563static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
564static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
565static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
566static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
567static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
568static UWord stats__cache_F_fetches      = 0; // # F lines fetched
569static UWord stats__cache_F_wbacks       = 0; // # F lines written back
570static UWord stats__cache_invals         = 0; // # cache invals
571static UWord stats__cache_flushes        = 0; // # cache flushes
572static UWord stats__cache_totrefs        = 0; // # total accesses
573static UWord stats__cache_totmisses      = 0; // # misses
574static ULong stats__cache_make_New_arange = 0; // total arange made New
575static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
576static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
577static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
578static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
579static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
580static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
581static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
582static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
583static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
584static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
static UWord stats__cline_sread08s       = 0; // # 8-bit shadow value reads
static UWord stats__cline_swrite08s      = 0; // # 8-bit shadow value writes
static UWord stats__cline_swrite16s      = 0; // # 16-bit shadow value writes
static UWord stats__cline_swrite32s      = 0; // # 32-bit shadow value writes
static UWord stats__cline_swrite64s      = 0; // # 64-bit shadow value writes
590static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
591static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
592static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
593static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
594static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
595static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
596static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
597static UWord stats__vts__tick            = 0; // # calls to VTS__tick
598static UWord stats__vts__join            = 0; // # calls to VTS__join
599static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
600static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural
601
602// # calls to VTS__cmp_structural w/ slow case
603static UWord stats__vts__cmp_structural_slow = 0;
604
605// # calls to VTS__indexAt_SLOW
606static UWord stats__vts__indexat_slow = 0;
607
608// # calls to vts_set__find__or__clone_and_add
609static UWord stats__vts_set__focaa    = 0;
610
611// # calls to vts_set__find__or__clone_and_add that lead to an
612// allocation
613static UWord stats__vts_set__focaa_a  = 0;
614
615
616static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
617   return a & ~(N_SECMAP_ARANGE - 1);
618}
619static inline UWord shmem__get_SecMap_offset ( Addr a ) {
620   return a & (N_SECMAP_ARANGE - 1);
621}
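/* For example, with N_SECMAP_BITS == 13 (8192-byte SecMaps), address
   0x400123 has SecMap base 0x400000 and offset 0x123. */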
622
623
624/*----------------------------------------------------------------*/
625/*--- map_shmem :: WordFM Addr SecMap                          ---*/
626/*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
627/*----------------------------------------------------------------*/
628
629/*--------------- SecMap allocation --------------- */
630
631static HChar* shmem__bigchunk_next = NULL;
632static HChar* shmem__bigchunk_end1 = NULL;
633
634static void* shmem__bigchunk_alloc ( SizeT n )
635{
636   const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
637   tl_assert(n > 0);
638   n = VG_ROUNDUP(n, 16);
639   tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
640   tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
641             <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
642   if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
643      if (0)
644      VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
645                  (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
646      shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
647      if (shmem__bigchunk_next == NULL)
648         VG_(out_of_memory_NORETURN)(
649            "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
650      shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
651   }
652   tl_assert(shmem__bigchunk_next);
653   tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
654   tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
655   shmem__bigchunk_next += n;
656   return shmem__bigchunk_next - n;
657}
658
659static SecMap* shmem__alloc_SecMap ( void )
660{
661   Word    i, j;
662   SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
663   if (0) VG_(printf)("alloc_SecMap %p\n",sm);
664   tl_assert(sm);
665   sm->magic = SecMap_MAGIC;
666   for (i = 0; i < N_SECMAP_ZLINES; i++) {
667      sm->linesZ[i].dict[0] = SVal_NOACCESS;
668      sm->linesZ[i].dict[1] = SVal_INVALID;
669      sm->linesZ[i].dict[2] = SVal_INVALID;
670      sm->linesZ[i].dict[3] = SVal_INVALID;
671      for (j = 0; j < N_LINE_ARANGE/4; j++)
672         sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
673   }
674   sm->linesF      = NULL;
675   sm->linesF_size = 0;
676   stats__secmaps_allocd++;
677   stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
678   stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
679   stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
680   return sm;
681}
682
683typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
684static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
685
686static SecMap* shmem__find_SecMap ( Addr ga )
687{
688   SecMap* sm    = NULL;
689   Addr    gaKey = shmem__round_to_SecMap_base(ga);
690   // Cache
691   stats__secmaps_search++;
692   if (LIKELY(gaKey == smCache[0].gaKey))
693      return smCache[0].sm;
694   if (LIKELY(gaKey == smCache[1].gaKey)) {
695      SMCacheEnt tmp = smCache[0];
696      smCache[0] = smCache[1];
697      smCache[1] = tmp;
698      return smCache[0].sm;
699   }
700   if (gaKey == smCache[2].gaKey) {
701      SMCacheEnt tmp = smCache[1];
702      smCache[1] = smCache[2];
703      smCache[2] = tmp;
704      return smCache[1].sm;
705   }
706   // end Cache
707   stats__secmaps_search_slow++;
708   if (VG_(lookupFM)( map_shmem,
709                      NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
710      tl_assert(sm != NULL);
711      smCache[2] = smCache[1];
712      smCache[1] = smCache[0];
713      smCache[0].gaKey = gaKey;
714      smCache[0].sm    = sm;
715   } else {
716      tl_assert(sm == NULL);
717   }
718   return sm;
719}
720
721static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
722{
723   SecMap* sm = shmem__find_SecMap ( ga );
724   if (LIKELY(sm)) {
725      return sm;
726   } else {
727      /* create a new one */
728      Addr gaKey = shmem__round_to_SecMap_base(ga);
729      sm = shmem__alloc_SecMap();
730      tl_assert(sm);
731      VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
732      return sm;
733   }
734}
735
736
737/* ------------ LineF and LineZ related ------------ */
738
739static void rcinc_LineF ( LineF* lineF ) {
740   UWord i;
741   tl_assert(lineF->inUse);
742   for (i = 0; i < N_LINE_ARANGE; i++)
743      rcinc(lineF->w64s[i]);
744}
745
746static void rcdec_LineF ( LineF* lineF ) {
747   UWord i;
748   tl_assert(lineF->inUse);
749   for (i = 0; i < N_LINE_ARANGE; i++)
750      rcdec(lineF->w64s[i]);
751}
752
753static void rcinc_LineZ ( LineZ* lineZ ) {
754   tl_assert(lineZ->dict[0] != SVal_INVALID);
755   rcinc(lineZ->dict[0]);
756   if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
757   if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
758   if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
759}
760
761static void rcdec_LineZ ( LineZ* lineZ ) {
762   tl_assert(lineZ->dict[0] != SVal_INVALID);
763   rcdec(lineZ->dict[0]);
764   if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
765   if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
766   if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
767}
768
769inline
770static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
771   Word bix, shft, mask, prep;
772   tl_assert(ix >= 0);
773   bix  = ix >> 2;
774   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
775   mask = 3 << shft;
776   prep = b2 << shft;
777   arr[bix] = (arr[bix] & ~mask) | prep;
778}
779
780inline
781static UWord read_twobit_array ( UChar* arr, UWord ix ) {
782   Word bix, shft;
783   tl_assert(ix >= 0);
784   bix  = ix >> 2;
785   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
786   return (arr[bix] >> shft) & 3;
787}
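/* Purely illustrative sketch (kept out of the build with #if 0):
   recovering the SVal for byte 'i' of a compressed (Z) line, using the
   two-bit dictionary indices just defined.  This mirrors what
   cacheline_fetch does below.  The helper name is hypothetical. */
#if 0
static SVal lineZ_example_get_sval ( LineZ* lineZ, UWord i )
{
   UWord ix = read_twobit_array( lineZ->ix2s, i ); /* 0 .. 3 */
   /* Only meaningful for a direct Z representation. */
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   return lineZ->dict[ix];
}
#endif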
788
789/* Given address 'tag', find either the Z or F line containing relevant
790   data, so it can be read into the cache.
791*/
792static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
793                                  /*OUT*/LineF** fp, Addr tag ) {
794   LineZ* lineZ;
795   LineF* lineF;
796   UWord   zix;
797   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
798   UWord   smoff = shmem__get_SecMap_offset(tag);
799   /* since smoff is derived from a valid tag, it should be
800      cacheline-aligned. */
801   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
802   zix = smoff >> N_LINE_BITS;
803   tl_assert(zix < N_SECMAP_ZLINES);
804   lineZ = &sm->linesZ[zix];
805   lineF = NULL;
806   if (lineZ->dict[0] == SVal_INVALID) {
807      UInt fix = (UInt)lineZ->dict[1];
808      tl_assert(sm->linesF);
809      tl_assert(sm->linesF_size > 0);
810      tl_assert(fix >= 0 && fix < sm->linesF_size);
811      lineF = &sm->linesF[fix];
812      tl_assert(lineF->inUse);
813      lineZ = NULL;
814   }
815   *zp = lineZ;
816   *fp = lineF;
817}
818
819/* Given address 'tag', return the relevant SecMap and the index of
820   the LineZ within it, in the expectation that the line is to be
   overwritten.  Regardless of whether 'tag' is currently associated
   with a Z or F representation, rcdec the current representation, in
   recognition of the fact that the contents are just about to be
   overwritten. */
825static __attribute__((noinline))
826void find_Z_for_writing ( /*OUT*/SecMap** smp,
827                          /*OUT*/Word* zixp,
828                          Addr tag ) {
829   LineZ* lineZ;
830   LineF* lineF;
831   UWord   zix;
832   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
833   UWord   smoff = shmem__get_SecMap_offset(tag);
834   /* since smoff is derived from a valid tag, it should be
835      cacheline-aligned. */
836   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
837   zix = smoff >> N_LINE_BITS;
838   tl_assert(zix < N_SECMAP_ZLINES);
839   lineZ = &sm->linesZ[zix];
840   lineF = NULL;
841   /* re RCs, we are freeing up this LineZ/LineF so that new data can
842      be parked in it.  Hence have to rcdec it accordingly. */
843   /* If lineZ has an associated lineF, free it up. */
844   if (lineZ->dict[0] == SVal_INVALID) {
845      UInt fix = (UInt)lineZ->dict[1];
846      tl_assert(sm->linesF);
847      tl_assert(sm->linesF_size > 0);
848      tl_assert(fix >= 0 && fix < sm->linesF_size);
849      lineF = &sm->linesF[fix];
850      tl_assert(lineF->inUse);
851      rcdec_LineF(lineF);
852      lineF->inUse = False;
853   } else {
854      rcdec_LineZ(lineZ);
855   }
856   *smp  = sm;
857   *zixp = zix;
858}
859
860static __attribute__((noinline))
861void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
862   UInt        i, new_size;
863   LineF* nyu;
864
865   if (sm->linesF) {
866      tl_assert(sm->linesF_size > 0);
867   } else {
868      tl_assert(sm->linesF_size == 0);
869   }
870
871   if (sm->linesF) {
872      for (i = 0; i < sm->linesF_size; i++) {
873         if (!sm->linesF[i].inUse) {
874            *fixp = (Word)i;
875            return;
876         }
877      }
878   }
879
880   /* No free F line found.  Expand existing array and try again. */
881   new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
882   nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
883                           new_size * sizeof(LineF) );
884   tl_assert(nyu);
885
886   stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
887   stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
888                                  * sizeof(LineF);
889
890   if (0)
891   VG_(printf)("SM %p: expand F array from %d to %d\n",
892               sm, (Int)sm->linesF_size, new_size);
893
894   for (i = 0; i < new_size; i++)
895      nyu[i].inUse = False;
896
897   if (sm->linesF) {
898      for (i = 0; i < sm->linesF_size; i++) {
899         tl_assert(sm->linesF[i].inUse);
900         nyu[i] = sm->linesF[i];
901      }
902      VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
903      HG_(free)(sm->linesF);
904   }
905
906   sm->linesF      = nyu;
907   sm->linesF_size = new_size;
908
909   for (i = 0; i < sm->linesF_size; i++) {
910      if (!sm->linesF[i].inUse) {
911         *fixp = (Word)i;
912         return;
913      }
   }

   /*NOTREACHED*/
   tl_assert(0);
}
919
920
921/* ------------ CacheLine and implicit-tree related ------------ */
922
923__attribute__((unused))
924static void pp_CacheLine ( CacheLine* cl ) {
925   Word i;
926   if (!cl) {
927      VG_(printf)("%s","pp_CacheLine(NULL)\n");
928      return;
929   }
930   for (i = 0; i < N_LINE_TREES; i++)
931      VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
932   for (i = 0; i < N_LINE_ARANGE; i++)
933      VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
934}
935
936static UChar descr_to_validbits ( UShort descr )
937{
938   /* a.k.a Party Time for gcc's constant folder */
939#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
940                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
941             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
942                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
943                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
944                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
945                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
946                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
947                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
948                          ( (b16_0) << 0) ) )
949
950#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
951             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
952                         ( (bit5) << 5) | ( (bit4) << 4) | \
953                         ( (bit3) << 3) | ( (bit2) << 2) | \
954                         ( (bit1) << 1) | ( (bit0) << 0) ) )
955
956   /* these should all get folded out at compile time */
957   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
958   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
959   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
960   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
961   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
962   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
963   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
964   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
965   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);
966
967   switch (descr) {
968   /*
969              +--------------------------------- TREE_DESCR_8_7
970              |             +------------------- TREE_DESCR_8_0
971              |             |  +---------------- TREE_DESCR_16_3
972              |             |  | +-------------- TREE_DESCR_32_1
973              |             |  | | +------------ TREE_DESCR_16_2
974              |             |  | | |  +--------- TREE_DESCR_64
975              |             |  | | |  |  +------ TREE_DESCR_16_1
976              |             |  | | |  |  | +---- TREE_DESCR_32_0
977              |             |  | | |  |  | | +-- TREE_DESCR_16_0
978              |             |  | | |  |  | | |
979              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
980   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
981                                                 return BYTE(1,1,1,1,1,1,1,1);
982   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
983                                                 return BYTE(1,1,0,1,1,1,1,1);
984   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
985                                                 return BYTE(0,1,1,1,1,1,1,1);
986   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
987                                                 return BYTE(0,1,0,1,1,1,1,1);
988
989   case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
990                                                 return BYTE(1,1,1,1,1,1,0,1);
991   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
992                                                 return BYTE(1,1,0,1,1,1,0,1);
993   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
994                                                 return BYTE(0,1,1,1,1,1,0,1);
995   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
996                                                 return BYTE(0,1,0,1,1,1,0,1);
997
998   case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
999                                                 return BYTE(1,1,1,1,0,1,1,1);
1000   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
1001                                                 return BYTE(1,1,0,1,0,1,1,1);
1002   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
1003                                                 return BYTE(0,1,1,1,0,1,1,1);
1004   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
1005                                                 return BYTE(0,1,0,1,0,1,1,1);
1006
1007   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
1008                                                 return BYTE(1,1,1,1,0,1,0,1);
1009   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
1010                                                 return BYTE(1,1,0,1,0,1,0,1);
1011   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
1012                                                 return BYTE(0,1,1,1,0,1,0,1);
1013   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
1014                                                 return BYTE(0,1,0,1,0,1,0,1);
1015
1016   case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
1017                                                 return BYTE(0,0,0,1,1,1,1,1);
1018   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
1019                                                 return BYTE(0,0,0,1,1,1,0,1);
1020   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
1021                                                 return BYTE(0,0,0,1,0,1,1,1);
1022   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
1023                                                 return BYTE(0,0,0,1,0,1,0,1);
1024
1025   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
1026                                                 return BYTE(1,1,1,1,0,0,0,1);
1027   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
1028                                                 return BYTE(1,1,0,1,0,0,0,1);
1029   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
1030                                                 return BYTE(0,1,1,1,0,0,0,1);
1031   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
1032                                                 return BYTE(0,1,0,1,0,0,0,1);
1033
1034   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
1035                                                 return BYTE(0,0,0,1,0,0,0,1);
1036
1037   case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
1038                                                 return BYTE(0,0,0,0,0,0,0,1);
1039
1040   default: return BYTE(0,0,0,0,0,0,0,0);
1041                   /* INVALID - any valid descr produces at least one
1042                      valid bit in tree[0..7]*/
1043   }
1044   /* NOTREACHED*/
1045   tl_assert(0);
1046
1047#  undef DESCR
1048#  undef BYTE
1049}
1050
1051__attribute__((unused))
1052static Bool is_sane_Descr ( UShort descr ) {
1053   return descr_to_validbits(descr) != 0;
1054}
1055
1056static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
1057   VG_(sprintf)(dst,
1058                "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
1059                (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
1060                (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
1061                (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
1062                (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
1063                (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
1064                (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
1065                (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
1066                (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
1067                (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
1068                (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
1069                (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
1070                (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
1071                (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
1072                (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
1073                (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
1074   );
1075}
1076static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
1077   VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
1078                     (Int)((byte & 128) ? 1 : 0),
1079                     (Int)((byte &  64) ? 1 : 0),
1080                     (Int)((byte &  32) ? 1 : 0),
1081                     (Int)((byte &  16) ? 1 : 0),
1082                     (Int)((byte &   8) ? 1 : 0),
1083                     (Int)((byte &   4) ? 1 : 0),
1084                     (Int)((byte &   2) ? 1 : 0),
1085                     (Int)((byte &   1) ? 1 : 0)
1086   );
1087}
1088
1089static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
1090   Word  i;
1091   UChar validbits = descr_to_validbits(descr);
1092   HChar buf[128], buf2[128];
1093   if (validbits == 0)
1094      goto bad;
1095   for (i = 0; i < 8; i++) {
1096      if (validbits & (1<<i)) {
1097         if (tree[i] == SVal_INVALID)
1098            goto bad;
1099      } else {
1100         if (tree[i] != SVal_INVALID)
1101            goto bad;
1102      }
1103   }
1104   return True;
1105  bad:
1106   sprintf_Descr( buf, descr );
1107   sprintf_Byte( buf2, validbits );
1108   VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
1109   VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
1110   VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
1111   for (i = 0; i < 8; i++)
1112      VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
1113   VG_(printf)("%s","}\n");
   return False;
1115}
1116
1117static Bool is_sane_CacheLine ( CacheLine* cl )
1118{
1119   Word tno, cloff;
1120
1121   if (!cl) goto bad;
1122
1123   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1124      UShort descr = cl->descrs[tno];
1125      SVal*  tree  = &cl->svals[cloff];
1126      if (!is_sane_Descr_and_Tree(descr, tree))
1127         goto bad;
1128   }
1129   tl_assert(cloff == N_LINE_ARANGE);
1130   return True;
1131  bad:
1132   pp_CacheLine(cl);
1133   return False;
1134}
1135
1136static UShort normalise_tree ( /*MOD*/SVal* tree )
1137{
1138   UShort descr;
1139   /* pre: incoming tree[0..7] does not have any invalid shvals, in
1140      particular no zeroes. */
1141   if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
1142                || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
1143                || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
1144                || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
1145      tl_assert(0);
1146
1147   descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
1148           | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
1149           | TREE_DESCR_8_1 | TREE_DESCR_8_0;
1150   /* build 16-bit layer */
1151   if (tree[1] == tree[0]) {
1152      tree[1] = SVal_INVALID;
1153      descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
1154      descr |= TREE_DESCR_16_0;
1155   }
1156   if (tree[3] == tree[2]) {
1157      tree[3] = SVal_INVALID;
1158      descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
1159      descr |= TREE_DESCR_16_1;
1160   }
1161   if (tree[5] == tree[4]) {
1162      tree[5] = SVal_INVALID;
1163      descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
1164      descr |= TREE_DESCR_16_2;
1165   }
1166   if (tree[7] == tree[6]) {
1167      tree[7] = SVal_INVALID;
1168      descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
1169      descr |= TREE_DESCR_16_3;
1170   }
1171   /* build 32-bit layer */
1172   if (tree[2] == tree[0]
1173       && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
1174      tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
1175      descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
1176      descr |= TREE_DESCR_32_0;
1177   }
1178   if (tree[6] == tree[4]
1179       && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
1180      tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
1181      descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
1182      descr |= TREE_DESCR_32_1;
1183   }
1184   /* build 64-bit layer */
1185   if (tree[4] == tree[0]
1186       && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
1187      tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
1188      descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
1189      descr |= TREE_DESCR_64;
1190   }
1191   return descr;
1192}
1193
/* This takes a cacheline where all the data is at the leaves
   (.svals[..]) and builds a correctly normalised tree. */
1196static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
1197{
1198   Word tno, cloff;
1199   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1200      SVal* tree = &cl->svals[cloff];
1201      cl->descrs[tno] = normalise_tree( tree );
1202   }
1203   tl_assert(cloff == N_LINE_ARANGE);
1204   if (CHECK_ZSM)
1205      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1206   stats__cline_normalises++;
1207}
1208
1209
1210typedef struct { UChar count; SVal sval; } CountedSVal;
1211
1212static
1213void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
1214                               /*OUT*/Word* dstUsedP,
1215                               Word nDst, CacheLine* src )
1216{
1217   Word  tno, cloff, dstUsed;
1218
1219   tl_assert(nDst == N_LINE_ARANGE);
1220   dstUsed = 0;
1221
1222   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1223      UShort descr = src->descrs[tno];
1224      SVal*  tree  = &src->svals[cloff];
1225
1226      /* sequentialise the tree described by (descr,tree). */
1227#     define PUT(_n,_v)                                \
1228         do { dst[dstUsed  ].count = (_n);             \
1229              dst[dstUsed++].sval  = (_v);             \
1230         } while (0)
1231
1232      /* byte 0 */
1233      if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
1234      if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
1235      if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
1236      if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
1237      /* byte 1 */
1238      if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
1239      /* byte 2 */
1240      if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
1241      if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
1242      /* byte 3 */
1243      if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
1244      /* byte 4 */
1245      if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
1246      if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
1247      if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
1248      /* byte 5 */
1249      if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
1250      /* byte 6 */
1251      if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
1252      if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
1253      /* byte 7 */
1254      if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);
1255
1256#     undef PUT
1257      /* END sequentialise the tree described by (descr,tree). */
1258
1259   }
1260   tl_assert(cloff == N_LINE_ARANGE);
1261   tl_assert(dstUsed <= nDst);
1262
1263   *dstUsedP = dstUsed;
1264}
1265
1266/* Write the cacheline 'wix' to backing store.  Where it ends up
1267   is determined by its tag field. */
1268static __attribute__((noinline)) void cacheline_wback ( UWord wix )
1269{
1270   Word        i, j, k, m;
1271   Addr        tag;
1272   SecMap*     sm;
1273   CacheLine*  cl;
1274   LineZ* lineZ;
1275   LineF* lineF;
1276   Word        zix, fix, csvalsUsed;
1277   CountedSVal csvals[N_LINE_ARANGE];
1278   SVal        sv;
1279
1280   if (0)
1281   VG_(printf)("scache wback line %d\n", (Int)wix);
1282
1283   tl_assert(wix >= 0 && wix < N_WAY_NENT);
1284
1285   tag =  cache_shmem.tags0[wix];
1286   cl  = &cache_shmem.lyns0[wix];
1287
1288   /* The cache line may have been invalidated; if so, ignore it. */
1289   if (!is_valid_scache_tag(tag))
1290      return;
1291
1292   /* Where are we going to put it? */
1293   sm         = NULL;
1294   lineZ      = NULL;
1295   lineF      = NULL;
1296   zix = fix = -1;
1297
1298   /* find the Z line to write in and rcdec it or the associated F
1299      line. */
1300   find_Z_for_writing( &sm, &zix, tag );
1301
1302   tl_assert(sm);
1303   tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
1304   lineZ = &sm->linesZ[zix];
1305
1306   /* Generate the data to be stored */
1307   if (CHECK_ZSM)
1308      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1309
1310   csvalsUsed = -1;
1311   sequentialise_CacheLine( csvals, &csvalsUsed,
1312                            N_LINE_ARANGE, cl );
1313   tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
1314   if (0) VG_(printf)("%lu ", csvalsUsed);
1315
1316   lineZ->dict[0] = lineZ->dict[1]
1317                  = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1318
1319   /* i indexes actual shadow values, k is cursor in csvals */
1320   i = 0;
1321   for (k = 0; k < csvalsUsed; k++) {
1322
1323      sv = csvals[k].sval;
1324      if (CHECK_ZSM)
1325         tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1326      /* do we already have it? */
1327      if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
1328      if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
1329      if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
1330      if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
1331      /* no.  look for a free slot. */
1332      if (CHECK_ZSM)
1333         tl_assert(sv != SVal_INVALID);
1334      if (lineZ->dict[0]
1335          == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
1336      if (lineZ->dict[1]
1337          == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
1338      if (lineZ->dict[2]
1339          == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
1340      if (lineZ->dict[3]
1341          == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
1342      break; /* we'll have to use the f rep */
1343     dict_ok:
1344      m = csvals[k].count;
1345      if (m == 8) {
1346         write_twobit_array( lineZ->ix2s, i+0, j );
1347         write_twobit_array( lineZ->ix2s, i+1, j );
1348         write_twobit_array( lineZ->ix2s, i+2, j );
1349         write_twobit_array( lineZ->ix2s, i+3, j );
1350         write_twobit_array( lineZ->ix2s, i+4, j );
1351         write_twobit_array( lineZ->ix2s, i+5, j );
1352         write_twobit_array( lineZ->ix2s, i+6, j );
1353         write_twobit_array( lineZ->ix2s, i+7, j );
1354         i += 8;
1355      }
1356      else if (m == 4) {
1357         write_twobit_array( lineZ->ix2s, i+0, j );
1358         write_twobit_array( lineZ->ix2s, i+1, j );
1359         write_twobit_array( lineZ->ix2s, i+2, j );
1360         write_twobit_array( lineZ->ix2s, i+3, j );
1361         i += 4;
1362      }
1363      else if (m == 1) {
1364         write_twobit_array( lineZ->ix2s, i+0, j );
1365         i += 1;
1366      }
1367      else if (m == 2) {
1368         write_twobit_array( lineZ->ix2s, i+0, j );
1369         write_twobit_array( lineZ->ix2s, i+1, j );
1370         i += 2;
1371      }
1372      else {
1373         tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
1374      }
1375
1376   }
1377
1378   if (LIKELY(i == N_LINE_ARANGE)) {
1379      /* Construction of the compressed representation was
1380         successful. */
1381      rcinc_LineZ(lineZ);
1382      stats__cache_Z_wbacks++;
1383   } else {
1384      /* Cannot use the compressed(z) representation.  Use the full(f)
1385         rep instead. */
1386      tl_assert(i >= 0 && i < N_LINE_ARANGE);
1387      alloc_F_for_writing( sm, &fix );
1388      tl_assert(sm->linesF);
1389      tl_assert(sm->linesF_size > 0);
1390      tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
1391      lineF = &sm->linesF[fix];
1392      tl_assert(!lineF->inUse);
1393      lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1394      lineZ->dict[1] = (SVal)fix;
1395      lineF->inUse = True;
1396      i = 0;
1397      for (k = 0; k < csvalsUsed; k++) {
1398         if (CHECK_ZSM)
1399            tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1400         sv = csvals[k].sval;
1401         if (CHECK_ZSM)
1402            tl_assert(sv != SVal_INVALID);
1403         for (m = csvals[k].count; m > 0; m--) {
1404            lineF->w64s[i] = sv;
1405            i++;
1406         }
1407      }
1408      tl_assert(i == N_LINE_ARANGE);
1409      rcinc_LineF(lineF);
1410      stats__cache_F_wbacks++;
1411   }
1412}
1413
1414/* Fetch the cacheline 'wix' from the backing store.  The tag
1415   associated with 'wix' is assumed to have already been filled in;
1416   hence that is used to determine where in the backing store to read
1417   from. */
1418static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
1419{
1420   Word       i;
1421   Addr       tag;
1422   CacheLine* cl;
1423   LineZ*     lineZ;
1424   LineF*     lineF;
1425
1426   if (0)
1427      VG_(printf)("scache fetch line %d\n", (Int)wix);
1428
1429   tl_assert(wix >= 0 && wix < N_WAY_NENT);
1430
1431   tag =  cache_shmem.tags0[wix];
1432   cl  = &cache_shmem.lyns0[wix];
1433
1434   /* reject nonsense requests */
1435   tl_assert(is_valid_scache_tag(tag));
1436
1437   lineZ = NULL;
1438   lineF = NULL;
1439   find_ZF_for_reading( &lineZ, &lineF, tag );
1440   tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
1441
1442   /* expand the data into the bottom layer of the tree, then get
1443      cacheline_normalise to build the descriptor array. */
1444   if (lineF) {
1445      tl_assert(lineF->inUse);
1446      for (i = 0; i < N_LINE_ARANGE; i++) {
1447         cl->svals[i] = lineF->w64s[i];
1448      }
1449      stats__cache_F_fetches++;
1450   } else {
1451      for (i = 0; i < N_LINE_ARANGE; i++) {
1452         SVal sv;
1453         UWord ix = read_twobit_array( lineZ->ix2s, i );
1454         /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
1455         sv = lineZ->dict[ix];
1456         tl_assert(sv != SVal_INVALID);
1457         cl->svals[i] = sv;
1458      }
1459      stats__cache_Z_fetches++;
1460   }
1461   normalise_CacheLine( cl );
1462}
1463
1464static void shmem__invalidate_scache ( void ) {
1465   Word wix;
1466   if (0) VG_(printf)("%s","scache inval\n");
1467   tl_assert(!is_valid_scache_tag(1));
1468   for (wix = 0; wix < N_WAY_NENT; wix++) {
1469      cache_shmem.tags0[wix] = 1/*INVALID*/;
1470   }
1471   stats__cache_invals++;
1472}
1473
1474static void shmem__flush_and_invalidate_scache ( void ) {
1475   Word wix;
1476   Addr tag;
1477   if (0) VG_(printf)("%s","scache flush and invalidate\n");
1478   tl_assert(!is_valid_scache_tag(1));
1479   for (wix = 0; wix < N_WAY_NENT; wix++) {
1480      tag = cache_shmem.tags0[wix];
1481      if (tag == 1/*INVALID*/) {
1482         /* already invalid; nothing to do */
1483      } else {
1484         tl_assert(is_valid_scache_tag(tag));
1485         cacheline_wback( wix );
1486      }
1487      cache_shmem.tags0[wix] = 1/*INVALID*/;
1488   }
1489   stats__cache_flushes++;
1490   stats__cache_invals++;
1491}
1492
1493
1494static inline Bool aligned16 ( Addr a ) {
1495   return 0 == (a & 1);
1496}
1497static inline Bool aligned32 ( Addr a ) {
1498   return 0 == (a & 3);
1499}
1500static inline Bool aligned64 ( Addr a ) {
1501   return 0 == (a & 7);
1502}
1503static inline UWord get_cacheline_offset ( Addr a ) {
1504   return (UWord)(a & (N_LINE_ARANGE - 1));
1505}
1506static inline Addr cacheline_ROUNDUP ( Addr a ) {
1507   return ROUNDUP(a, N_LINE_ARANGE);
1508}
1509static inline Addr cacheline_ROUNDDN ( Addr a ) {
1510   return ROUNDDN(a, N_LINE_ARANGE);
1511}
1512static inline UWord get_treeno ( Addr a ) {
1513   return get_cacheline_offset(a) >> 3;
1514}
1515static inline UWord get_tree_offset ( Addr a ) {
1516   return a & 7;
1517}
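/* Worked example (illustrative; assumes N_LINE_BITS is 6, so
   N_LINE_ARANGE is 64): for a == 0x1234, get_cacheline_offset(a) is
   0x34, get_treeno(a) is 0x34 >> 3 == 6, get_tree_offset(a) is 4, and
   the cache tag computed in get_cacheline below is a & ~63 ==
   0x1200. */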
1518
1519static __attribute__((noinline))
1520       CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
1521static inline CacheLine* get_cacheline ( Addr a )
1522{
1523   /* tag is 'a' with the in-line offset masked out,
1524      eg a[31]..a[4] 0000 */
1525   Addr       tag = a & ~(N_LINE_ARANGE - 1);
1526   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1527   stats__cache_totrefs++;
1528   if (LIKELY(tag == cache_shmem.tags0[wix])) {
1529      return &cache_shmem.lyns0[wix];
1530   } else {
1531      return get_cacheline_MISS( a );
1532   }
1533}
1534
1535static __attribute__((noinline))
1536       CacheLine* get_cacheline_MISS ( Addr a )
1537{
1538   /* tag is 'a' with the in-line offset masked out,
1539      eg a[31]..a[4] 0000 */
1540
1541   CacheLine* cl;
1542   Addr*      tag_old_p;
1543   Addr       tag = a & ~(N_LINE_ARANGE - 1);
1544   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1545
1546   tl_assert(tag != cache_shmem.tags0[wix]);
1547
1548   /* Dump the old line into the backing store. */
1549   stats__cache_totmisses++;
1550
1551   cl        = &cache_shmem.lyns0[wix];
1552   tag_old_p = &cache_shmem.tags0[wix];
1553
1554   if (is_valid_scache_tag( *tag_old_p )) {
1555      /* EXPENSIVE and REDUNDANT: callee does it */
1556      if (CHECK_ZSM)
1557         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1558      cacheline_wback( wix );
1559   }
1560   /* and reload the new one */
1561   *tag_old_p = tag;
1562   cacheline_fetch( wix );
1563   if (CHECK_ZSM)
1564      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1565   return cl;
1566}
1567
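/* The pulldown/pullup helpers below maintain the per-tree descriptor
   bits.  Each 8-byte tree of a CacheLine may hold one valid SVal at
   64-bit granularity (TREE_DESCR_64), two at 32-bit (TREE_DESCR_32_*),
   four at 16-bit (TREE_DESCR_16_*) or eight at 8-bit (TREE_DESCR_8_*)
   granularity.  pulldown_to_N copies a coarser value into the finer
   slots so that an N-bit access finds a valid value at its offset;
   pullup_descr_to_N performs the corresponding adjustment on the
   descriptor only, when finer slots are recombined into a coarser
   one. */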
1568static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1569   stats__cline_64to32pulldown++;
1570   switch (toff) {
1571      case 0: case 4:
1572         tl_assert(descr & TREE_DESCR_64);
1573         tree[4] = tree[0];
1574         descr &= ~TREE_DESCR_64;
1575         descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
1576         break;
1577      default:
1578         tl_assert(0);
1579   }
1580   return descr;
1581}
1582
1583static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1584   stats__cline_32to16pulldown++;
1585   switch (toff) {
1586      case 0: case 2:
1587         if (!(descr & TREE_DESCR_32_0)) {
1588            descr = pulldown_to_32(tree, 0, descr);
1589         }
1590         tl_assert(descr & TREE_DESCR_32_0);
1591         tree[2] = tree[0];
1592         descr &= ~TREE_DESCR_32_0;
1593         descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
1594         break;
1595      case 4: case 6:
1596         if (!(descr & TREE_DESCR_32_1)) {
1597            descr = pulldown_to_32(tree, 4, descr);
1598         }
1599         tl_assert(descr & TREE_DESCR_32_1);
1600         tree[6] = tree[4];
1601         descr &= ~TREE_DESCR_32_1;
1602         descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
1603         break;
1604      default:
1605         tl_assert(0);
1606   }
1607   return descr;
1608}
1609
1610static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1611   stats__cline_16to8pulldown++;
1612   switch (toff) {
1613      case 0: case 1:
1614         if (!(descr & TREE_DESCR_16_0)) {
1615            descr = pulldown_to_16(tree, 0, descr);
1616         }
1617         tl_assert(descr & TREE_DESCR_16_0);
1618         tree[1] = tree[0];
1619         descr &= ~TREE_DESCR_16_0;
1620         descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
1621         break;
1622      case 2: case 3:
1623         if (!(descr & TREE_DESCR_16_1)) {
1624            descr = pulldown_to_16(tree, 2, descr);
1625         }
1626         tl_assert(descr & TREE_DESCR_16_1);
1627         tree[3] = tree[2];
1628         descr &= ~TREE_DESCR_16_1;
1629         descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
1630         break;
1631      case 4: case 5:
1632         if (!(descr & TREE_DESCR_16_2)) {
1633            descr = pulldown_to_16(tree, 4, descr);
1634         }
1635         tl_assert(descr & TREE_DESCR_16_2);
1636         tree[5] = tree[4];
1637         descr &= ~TREE_DESCR_16_2;
1638         descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
1639         break;
1640      case 6: case 7:
1641         if (!(descr & TREE_DESCR_16_3)) {
1642            descr = pulldown_to_16(tree, 6, descr);
1643         }
1644         tl_assert(descr & TREE_DESCR_16_3);
1645         tree[7] = tree[6];
1646         descr &= ~TREE_DESCR_16_3;
1647         descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
1648         break;
1649      default:
1650         tl_assert(0);
1651   }
1652   return descr;
1653}
1654
1655
1656static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
1657   UShort mask;
1658   switch (toff) {
1659      case 0:
1660         mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
1661         tl_assert( (descr & mask) == mask );
1662         descr &= ~mask;
1663         descr |= TREE_DESCR_16_0;
1664         break;
1665      case 2:
1666         mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
1667         tl_assert( (descr & mask) == mask );
1668         descr &= ~mask;
1669         descr |= TREE_DESCR_16_1;
1670         break;
1671      case 4:
1672         mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
1673         tl_assert( (descr & mask) == mask );
1674         descr &= ~mask;
1675         descr |= TREE_DESCR_16_2;
1676         break;
1677      case 6:
1678         mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
1679         tl_assert( (descr & mask) == mask );
1680         descr &= ~mask;
1681         descr |= TREE_DESCR_16_3;
1682         break;
1683      default:
1684         tl_assert(0);
1685   }
1686   return descr;
1687}
1688
1689static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
1690   UShort mask;
1691   switch (toff) {
1692      case 0:
1693         if (!(descr & TREE_DESCR_16_0))
1694            descr = pullup_descr_to_16(descr, 0);
1695         if (!(descr & TREE_DESCR_16_1))
1696            descr = pullup_descr_to_16(descr, 2);
1697         mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
1698         tl_assert( (descr & mask) == mask );
1699         descr &= ~mask;
1700         descr |= TREE_DESCR_32_0;
1701         break;
1702      case 4:
1703         if (!(descr & TREE_DESCR_16_2))
1704            descr = pullup_descr_to_16(descr, 4);
1705         if (!(descr & TREE_DESCR_16_3))
1706            descr = pullup_descr_to_16(descr, 6);
1707         mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
1708         tl_assert( (descr & mask) == mask );
1709         descr &= ~mask;
1710         descr |= TREE_DESCR_32_1;
1711         break;
1712      default:
1713         tl_assert(0);
1714   }
1715   return descr;
1716}
1717
1718static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
1719   switch (toff) {
1720      case 0: case 4:
1721         return 0 != (descr & TREE_DESCR_64);
1722      default:
1723         tl_assert(0);
1724   }
1725}
1726
1727static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
1728   switch (toff) {
1729      case 0:
1730         return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
1731      case 2:
1732         return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
1733      case 4:
1734         return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
1735      case 6:
1736         return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
1737      default:
1738         tl_assert(0);
1739   }
1740}
1741
1742/* ------------ Cache management ------------ */
1743
1744static void zsm_flush_cache ( void )
1745{
1746   shmem__flush_and_invalidate_scache();
1747}
1748
1749
1750static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
1751{
1752   tl_assert( sizeof(UWord) == sizeof(Addr) );
1753
1754   rcinc = p_rcinc;
1755   rcdec = p_rcdec;
1756
1757   tl_assert(map_shmem == NULL);
1758   map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
1759                           HG_(free),
1760                           NULL/*unboxed UWord cmp*/);
1761   tl_assert(map_shmem != NULL);
1762   shmem__invalidate_scache();
1763
1764   /* a SecMap must contain an integral number of CacheLines */
1765   tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
1766   /* also ... a CacheLine holds an integral number of trees */
1767   tl_assert(0 == (N_LINE_ARANGE % 8));
1768}
1769
1770/////////////////////////////////////////////////////////////////
1771/////////////////////////////////////////////////////////////////
1772//                                                             //
1773// SECTION END compressed shadow memory                        //
1774//                                                             //
1775/////////////////////////////////////////////////////////////////
1776/////////////////////////////////////////////////////////////////
1777
1778
1779
1780/////////////////////////////////////////////////////////////////
1781/////////////////////////////////////////////////////////////////
1782//                                                             //
1783// SECTION BEGIN vts primitives                                //
1784//                                                             //
1785/////////////////////////////////////////////////////////////////
1786/////////////////////////////////////////////////////////////////
1787
1788
1789/* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
1790   being compact stand-ins for Thr*'s.  Use these functions to map
1791   between them. */
1792static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
1793static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */
1794
1795__attribute__((noreturn))
1796static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
1797{
1798   if (due_to_nThrs) {
1799      const HChar* s =
1800         "\n"
1801         "Helgrind: cannot continue, run aborted: too many threads.\n"
1802         "Sorry.  Helgrind can only handle programs that create\n"
1803         "%'llu or fewer threads over their entire lifetime.\n"
1804         "\n";
1805      VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
1806   } else {
1807      const HChar* s =
1808         "\n"
1809         "Helgrind: cannot continue, run aborted: too many\n"
1810         "synchronisation events.  Sorry. Helgrind can only handle\n"
1811         "programs which perform %'llu or fewer\n"
1812         "inter-thread synchronisation events (locks, unlocks, etc).\n"
1813         "\n";
1814      VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
1815   }
1816   VG_(exit)(1);
1817   /*NOTREACHED*/
1818   tl_assert(0); /*wtf?!*/
1819}
1820
1821
1822/* The dead thread (ThrID, actually) table.  A thread may only be
1823   listed here if we have been notified thereof by libhb_async_exit.
1824   New entries are added at the end.  The order isn't important, but
1825   the ThrID values must be unique.  This table lists the identity of
1826   all threads that have ever died -- none are ever removed.  We keep
1827   this table so as to be able to prune entries from VTSs.  We don't
1828   actually need to keep the set of threads that have ever died --
1829   only the threads that have died since the previous round of
1830   pruning.  But it's useful for sanity check purposes to keep the
1831   entire set, so we do. */
1832static XArray* /* of ThrID */ verydead_thread_table = NULL;
1833
1834/* Arbitrary total ordering on ThrIDs. */
1835static Int cmp__ThrID ( const void* v1, const void* v2 ) {
1836   ThrID id1 = *(const ThrID*)v1;
1837   ThrID id2 = *(const ThrID*)v2;
1838   if (id1 < id2) return -1;
1839   if (id1 > id2) return 1;
1840   return 0;
1841}
1842
1843static void verydead_thread_table_init ( void )
1844{
1845   tl_assert(!verydead_thread_table);
1846   verydead_thread_table
1847     = VG_(newXA)( HG_(zalloc),
1848                   "libhb.verydead_thread_table_init.1",
1849                   HG_(free), sizeof(ThrID) );
1850   tl_assert(verydead_thread_table);
1851   VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
1852}
1853
1854
1855/* A VTS contains .ts, its vector clock, and also .id, a field to hold
1856   a backlink for the caller's convenience.  Since we have no idea
1857   what to set that to in the library, it always gets set to
1858   VtsID_INVALID. */
1859typedef
1860   struct {
1861      VtsID    id;
1862      UInt     usedTS;
1863      UInt     sizeTS;
1864      ScalarTS ts[0];
1865   }
1866   VTS;
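/* Illustrative example: a vector clock in which the thread with ThrID
   1025 is at time 3 and the thread with ThrID 1040 is at time 1 is
   stored with usedTS == 2, ts[0].thrid == 1025, ts[0].tym == 3,
   ts[1].thrid == 1040, ts[1].tym == 1.  Entries are kept sorted by
   ThrID, and threads not mentioned are implicitly at time 0. */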
1867
1868/* Allocate a VTS capable of storing 'sizeTS' entries. */
1869static VTS* VTS__new ( const HChar* who, UInt sizeTS );
1870
1871/* Make a clone of 'vts', sizing the new array to exactly match the
1872   number of ScalarTSs present. */
1873static VTS* VTS__clone ( const HChar* who, VTS* vts );
1874
1875/* Make a clone of 'vts' with the thrids in 'thrids' removed.  The new
1876   array is sized exactly to hold the number of required elements.
1877   'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
1878   must be in strictly increasing order. */
1879static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel );
1880
1881/* Delete this VTS in its entirety. */
1882static void VTS__delete ( VTS* vts );
1883
1884/* Create a new singleton VTS in 'out'.  Caller must have
1885   pre-allocated 'out' sufficiently big to hold the result in all
1886   possible cases. */
1887static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
1888
1889/* Create in 'out' a VTS which is the same as 'vts' except with
1890   vts[me]++, so to speak.  Caller must have pre-allocated 'out'
1891   sufficiently big to hold the result in all possible cases. */
1892static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
1893
1894/* Create in 'out' a VTS which is the join (max) of 'a' and
1895   'b'. Caller must have pre-allocated 'out' sufficiently big to hold
1896   the result in all possible cases. */
1897static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
1898
1899/* Compute the partial ordering relation of the two args.  Although we
1900   could be completely general and return an enumeration value (EQ,
1901   LT, GT, UN), in fact we only need LEQ, and so we may as well
1902   hardwire that fact.
1903
1904   Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
1905   invalid ThrID).  In the latter case, the returned ThrID indicates
1906   a point at which LEQ does not hold.  There may be more
1907   than one such point, but we only care about seeing one of them, not
1908   all of them.  This rather strange convention is used because
1909   sometimes we want to know the actual index at which they first
1910   differ. */
1911static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
1912
1913/* Compute an arbitrary structural (total) ordering on the two args,
1914   based on their VCs, so they can be looked up in a table, tree, etc.
1915   Returns -1, 0 or 1. */
1916static Word VTS__cmp_structural ( VTS* a, VTS* b );
1917
1918/* Debugging only.  Display the given VTS in the buffer. */
1919static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
1920
1921/* Debugging only.  Return vts[index], so to speak. */
1922static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
1923
1924/* Notify the VTS machinery that a thread has been declared
1925   comprehensively dead: that is, it has done an async exit AND it has
1926   been joined with.  This should ensure that its local clocks (.viR
1927   and .viW) will never again change, and so all mentions of this
1928   thread from all VTSs in the system may be removed. */
1929static void VTS__declare_thread_very_dead ( Thr* thr );
1930
1931/*--------------- to do with Vector Timestamps ---------------*/
1932
1933static Bool is_sane_VTS ( VTS* vts )
1934{
1935   UWord     i, n;
1936   ScalarTS  *st1, *st2;
1937   if (!vts) return False;
1938   if (vts->usedTS > vts->sizeTS) return False;
1939   n = vts->usedTS;
1940   if (n == 1) {
1941      st1 = &vts->ts[0];
1942      if (st1->tym == 0)
1943         return False;
1944   }
1945   else
1946   if (n >= 2) {
1947      for (i = 0; i < n-1; i++) {
1948         st1 = &vts->ts[i];
1949         st2 = &vts->ts[i+1];
1950         if (st1->thrid >= st2->thrid)
1951            return False;
1952         if (st1->tym == 0 || st2->tym == 0)
1953            return False;
1954      }
1955   }
1956   return True;
1957}
1958
1959
1960/* Create a new, empty VTS.
1961*/
1962static VTS* VTS__new ( const HChar* who, UInt sizeTS )
1963{
1964   VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
1965   tl_assert(vts->usedTS == 0);
1966   vts->sizeTS = sizeTS;
1967   *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
1968   return vts;
1969}
1970
1971/* Clone this VTS.
1972*/
1973static VTS* VTS__clone ( const HChar* who, VTS* vts )
1974{
1975   tl_assert(vts);
1976   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
1977   UInt nTS = vts->usedTS;
1978   VTS* clone = VTS__new(who, nTS);
1979   clone->id = vts->id;
1980   clone->sizeTS = nTS;
1981   clone->usedTS = nTS;
1982   UInt i;
1983   for (i = 0; i < nTS; i++) {
1984      clone->ts[i] = vts->ts[i];
1985   }
1986   tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
1987   return clone;
1988}
1989
1990
1991/* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
1992   must be in strictly increasing order.  We could obviously do this
1993   much more efficiently (in linear time) if necessary.
1994*/
1995static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel )
1996{
1997   UInt i, j;
1998   tl_assert(vts);
1999   tl_assert(thridsToDel);
2000   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2001   UInt nTS = vts->usedTS;
2002   /* Figure out how many ScalarTSs will remain in the output. */
2003   UInt nReq = nTS;
2004   for (i = 0; i < nTS; i++) {
2005      ThrID thrid = vts->ts[i].thrid;
2006      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2007         nReq--;
2008   }
2009   tl_assert(nReq <= nTS);
2010   /* Copy the ones that will remain. */
2011   VTS* res = VTS__new(who, nReq);
2012   j = 0;
2013   for (i = 0; i < nTS; i++) {
2014      ThrID thrid = vts->ts[i].thrid;
2015      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2016         continue;
2017      res->ts[j++] = vts->ts[i];
2018   }
2019   tl_assert(j == nReq);
2020   tl_assert(j == res->sizeTS);
2021   res->usedTS = j;
2022   tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
2023   return res;
2024}
2025
2026
2027/* Delete this VTS in its entirety.
2028*/
2029static void VTS__delete ( VTS* vts )
2030{
2031   tl_assert(vts);
2032   tl_assert(vts->usedTS <= vts->sizeTS);
2033   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2034   HG_(free)(vts);
2035}
2036
2037
2038/* Create a new singleton VTS.
2039*/
2040static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
2041{
2042   tl_assert(thr);
2043   tl_assert(tym >= 1);
2044   tl_assert(out);
2045   tl_assert(out->usedTS == 0);
2046   tl_assert(out->sizeTS >= 1);
2047   UInt hi = out->usedTS++;
2048   out->ts[hi].thrid = Thr__to_ThrID(thr);
2049   out->ts[hi].tym   = tym;
2050}
2051
2052
2053/* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
2054   not modified.
2055*/
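/* Illustrative example: ticking thread 1025 on {1025:7, 1040:2}
   yields {1025:8, 1040:2} (case (c) below), whereas ticking it on
   {1040:2} alone yields {1025:1, 1040:2} (case (b)). */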
2056static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
2057{
2058   UInt      i, n;
2059   ThrID     me_thrid;
2060   Bool      found = False;
2061
2062   stats__vts__tick++;
2063
2064   tl_assert(out);
2065   tl_assert(out->usedTS == 0);
2066   if (vts->usedTS >= ThrID_MAX_VALID)
2067      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2068   tl_assert(out->sizeTS >= 1 + vts->usedTS);
2069
2070   tl_assert(me);
2071   me_thrid = Thr__to_ThrID(me);
2072   tl_assert(is_sane_VTS(vts));
2073   n = vts->usedTS;
2074
2075   /* Copy all entries which precede 'me'. */
2076   for (i = 0; i < n; i++) {
2077      ScalarTS* here = &vts->ts[i];
2078      if (UNLIKELY(here->thrid >= me_thrid))
2079         break;
2080      UInt hi = out->usedTS++;
2081      out->ts[hi] = *here;
2082   }
2083
2084   /* 'i' now indicates the next entry to copy, if any.
2085       There are 3 possibilities:
2086       (a) there is no next entry (we used them all up already):
2087           add (me_thrid,1) to the output, and quit
2088       (b) there is a next entry, and its thrid > me_thrid:
2089           add (me_thrid,1) to the output, then copy the remaining entries
2090       (c) there is a next entry, and its thrid == me_thrid:
2091           copy it to the output but increment its timestamp value.
2092           Then copy the remaining entries.  (c) is the common case.
2093   */
2094   tl_assert(i >= 0 && i <= n);
2095   if (i == n) { /* case (a) */
2096      UInt hi = out->usedTS++;
2097      out->ts[hi].thrid = me_thrid;
2098      out->ts[hi].tym   = 1;
2099   } else {
2100      /* cases (b) and (c) */
2101      ScalarTS* here = &vts->ts[i];
2102      if (me_thrid == here->thrid) { /* case (c) */
2103         if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
2104            /* We're hosed.  We have to stop. */
2105            scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
2106         }
2107         UInt hi = out->usedTS++;
2108         out->ts[hi].thrid = here->thrid;
2109         out->ts[hi].tym   = here->tym + 1;
2110         i++;
2111         found = True;
2112      } else { /* case (b) */
2113         UInt hi = out->usedTS++;
2114         out->ts[hi].thrid = me_thrid;
2115         out->ts[hi].tym   = 1;
2116      }
2117      /* And copy any remaining entries. */
2118      for (/*keepgoing*/; i < n; i++) {
2119         ScalarTS* here2 = &vts->ts[i];
2120         UInt hi = out->usedTS++;
2121         out->ts[hi] = *here2;
2122      }
2123   }
2124
2125   tl_assert(is_sane_VTS(out));
2126   tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
2127   tl_assert(out->usedTS <= out->sizeTS);
2128}
2129
2130
2131/* Return a new VTS constructed as the join (max) of the 2 args.
2132   Neither arg is modified.
2133*/
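/* Illustrative example: joining {1025:3, 1040:5} with {1040:4, 1055:1}
   yields {1025:3, 1040:5, 1055:1}: an entry-wise maximum, with a
   thread absent from one argument treated as being at time 0 there. */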
2134static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
2135{
2136   UInt     ia, ib, useda, usedb;
2137   ULong    tyma, tymb, tymMax;
2138   ThrID    thrid;
2139   UInt     ncommon = 0;
2140
2141   stats__vts__join++;
2142
2143   tl_assert(a);
2144   tl_assert(b);
2145   useda = a->usedTS;
2146   usedb = b->usedTS;
2147
2148   tl_assert(out);
2149   tl_assert(out->usedTS == 0);
2150   /* overly conservative test, but doing better involves comparing
2151      the two VTSs, which we don't want to do at this point. */
2152   if (useda + usedb >= ThrID_MAX_VALID)
2153      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2154   tl_assert(out->sizeTS >= useda + usedb);
2155
2156   ia = ib = 0;
2157
2158   while (1) {
2159
2160      /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
2161         from a and b in order, where thrid is the next ThrID
2162         occurring in either a or b, and tyma/b are the relevant
2163         scalar timestamps, taking into account implicit zeroes. */
2164      tl_assert(ia >= 0 && ia <= useda);
2165      tl_assert(ib >= 0 && ib <= usedb);
2166
2167      if        (ia == useda && ib == usedb) {
2168         /* both empty - done */
2169         break;
2170
2171      } else if (ia == useda && ib != usedb) {
2172         /* a empty, use up b */
2173         ScalarTS* tmpb = &b->ts[ib];
2174         thrid = tmpb->thrid;
2175         tyma  = 0;
2176         tymb  = tmpb->tym;
2177         ib++;
2178
2179      } else if (ia != useda && ib == usedb) {
2180         /* b empty, use up a */
2181         ScalarTS* tmpa = &a->ts[ia];
2182         thrid = tmpa->thrid;
2183         tyma  = tmpa->tym;
2184         tymb  = 0;
2185         ia++;
2186
2187      } else {
2188         /* both not empty; extract lowest-ThrID'd triple */
2189         ScalarTS* tmpa = &a->ts[ia];
2190         ScalarTS* tmpb = &b->ts[ib];
2191         if (tmpa->thrid < tmpb->thrid) {
2192            /* a has the lowest unconsidered ThrID */
2193            thrid = tmpa->thrid;
2194            tyma  = tmpa->tym;
2195            tymb  = 0;
2196            ia++;
2197         } else if (tmpa->thrid > tmpb->thrid) {
2198            /* b has the lowest unconsidered ThrID */
2199            thrid = tmpb->thrid;
2200            tyma  = 0;
2201            tymb  = tmpb->tym;
2202            ib++;
2203         } else {
2204            /* they both next mention the same ThrID */
2205            tl_assert(tmpa->thrid == tmpb->thrid);
2206            thrid = tmpa->thrid; /* == tmpb->thrid */
2207            tyma  = tmpa->tym;
2208            tymb  = tmpb->tym;
2209            ia++;
2210            ib++;
2211            ncommon++;
2212         }
2213      }
2214
2215      /* having laboriously determined (thrid, tyma, tymb), do something
2216         useful with it. */
2217      tymMax = tyma > tymb ? tyma : tymb;
2218      if (tymMax > 0) {
2219         UInt hi = out->usedTS++;
2220         out->ts[hi].thrid = thrid;
2221         out->ts[hi].tym   = tymMax;
2222      }
2223
2224   }
2225
2226   tl_assert(is_sane_VTS(out));
2227   tl_assert(out->usedTS <= out->sizeTS);
2228   tl_assert(out->usedTS == useda + usedb - ncommon);
2229}
2230
2231
2232/* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
2233   they are, or the first ThrID for which they are not (no valid ThrID
2234   has the value zero).  This rather strange convention is used
2235   because sometimes we want to know the actual index at which they
2236   first differ. */
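/* Illustrative example: {1025:2, 1040:3} is LEQ {1025:2, 1040:5,
   1055:1}, so the comparison returns 0; comparing the other way round
   fails at thread 1040 (5 > 3) and returns 1040. */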
2237static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
2238{
2239   Word  ia, ib, useda, usedb;
2240   ULong tyma, tymb;
2241
2242   stats__vts__cmpLEQ++;
2243
2244   tl_assert(a);
2245   tl_assert(b);
2246   useda = a->usedTS;
2247   usedb = b->usedTS;
2248
2249   ia = ib = 0;
2250
2251   while (1) {
2252
2253      /* This logic is to enumerate pairs (tyma, tymb) drawn
2254         from a and b in order, where tyma/b are the relevant
2255         scalar timestamps, taking into account implicit zeroes. */
2256      ThrID thrid;
2257
2258      tl_assert(ia >= 0 && ia <= useda);
2259      tl_assert(ib >= 0 && ib <= usedb);
2260
2261      if        (ia == useda && ib == usedb) {
2262         /* both empty - done */
2263         break;
2264
2265      } else if (ia == useda && ib != usedb) {
2266         /* a empty, use up b */
2267         ScalarTS* tmpb = &b->ts[ib];
2268         tyma  = 0;
2269         tymb  = tmpb->tym;
2270         thrid = tmpb->thrid;
2271         ib++;
2272
2273      } else if (ia != useda && ib == usedb) {
2274         /* b empty, use up a */
2275         ScalarTS* tmpa = &a->ts[ia];
2276         tyma  = tmpa->tym;
2277         thrid = tmpa->thrid;
2278         tymb  = 0;
2279         ia++;
2280
2281      } else {
2282         /* both not empty; extract lowest-ThrID'd triple */
2283         ScalarTS* tmpa = &a->ts[ia];
2284         ScalarTS* tmpb = &b->ts[ib];
2285         if (tmpa->thrid < tmpb->thrid) {
2286            /* a has the lowest unconsidered ThrID */
2287            tyma  = tmpa->tym;
2288            thrid = tmpa->thrid;
2289            tymb  = 0;
2290            ia++;
2291         }
2292         else
2293         if (tmpa->thrid > tmpb->thrid) {
2294            /* b has the lowest unconsidered ThrID */
2295            tyma  = 0;
2296            tymb  = tmpb->tym;
2297            thrid = tmpb->thrid;
2298            ib++;
2299         } else {
2300            /* they both next mention the same ThrID */
2301            tl_assert(tmpa->thrid == tmpb->thrid);
2302            tyma  = tmpa->tym;
2303            thrid = tmpa->thrid;
2304            tymb  = tmpb->tym;
2305            ia++;
2306            ib++;
2307         }
2308      }
2309
2310      /* having laboriously determined (tyma, tymb), do something
2311         useful with it. */
2312      if (tyma > tymb) {
2313         /* not LEQ at this index.  Quit, since the answer is
2314            determined already. */
2315         tl_assert(thrid >= 1024);
2316         return thrid;
2317      }
2318   }
2319
2320   return 0; /* all points are LEQ => return an invalid ThrID */
2321}
2322
2323
2324/* Compute an arbitrary structural (total) ordering on the two args,
2325   based on their VCs, so they can be looked up in a table, tree, etc.
2326   Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
2327   performance critical so there is some effort expended to make it as
2328   fast as possible.
2329*/
2330Word VTS__cmp_structural ( VTS* a, VTS* b )
2331{
2332   /* We just need to generate an arbitrary total ordering based on
2333      a->ts and b->ts.  Preferably do it in a way which detects likely
2334      differences quickly. */
2335   Word     i;
2336   Word     useda = 0,    usedb = 0;
2337   ScalarTS *ctsa = NULL, *ctsb = NULL;
2338
2339   stats__vts__cmp_structural++;
2340
2341   tl_assert(a);
2342   tl_assert(b);
2343
2344   ctsa = &a->ts[0]; useda = a->usedTS;
2345   ctsb = &b->ts[0]; usedb = b->usedTS;
2346
2347   if (LIKELY(useda == usedb)) {
2348      ScalarTS *tmpa = NULL, *tmpb = NULL;
2349      stats__vts__cmp_structural_slow++;
2350      /* Same length vectors.  Find the first difference, if any, as
2351         fast as possible. */
2352      for (i = 0; i < useda; i++) {
2353         tmpa = &ctsa[i];
2354         tmpb = &ctsb[i];
2355         if (LIKELY(tmpa->tym == tmpb->tym
2356                    && tmpa->thrid == tmpb->thrid))
2357            continue;
2358         else
2359            break;
2360      }
2361      if (UNLIKELY(i == useda)) {
2362         /* They're identical. */
2363         return 0;
2364      } else {
2365         tl_assert(i >= 0 && i < useda);
2366         if (tmpa->tym < tmpb->tym) return -1;
2367         if (tmpa->tym > tmpb->tym) return 1;
2368         if (tmpa->thrid < tmpb->thrid) return -1;
2369         if (tmpa->thrid > tmpb->thrid) return 1;
2370         /* we just established them as non-identical, hence: */
2371      }
2372      /*NOTREACHED*/
2373      tl_assert(0);
2374   }
2375
2376   if (useda < usedb) return -1;
2377   if (useda > usedb) return 1;
2378   /*NOTREACHED*/
2379   tl_assert(0);
2380}
2381
2382
2383/* Debugging only.  Display the given VTS in the buffer.
2384*/
2385void VTS__show ( HChar* buf, Int nBuf, VTS* vts )
2386{
2387   ScalarTS* st;
2388   HChar     unit[64];
2389   Word      i, n;
2390   Int       avail = nBuf;
2391   tl_assert(vts && vts->ts);
2392   tl_assert(nBuf > 16);
2393   buf[0] = '[';
2394   buf[1] = 0;
2395   n =  vts->usedTS;
2396   for (i = 0; i < n; i++) {
2397      tl_assert(avail >= 40);
2398      st = &vts->ts[i];
2399      VG_(memset)(unit, 0, sizeof(unit));
2400      VG_(sprintf)(unit, i < n-1 ? "%u:%llu " : "%u:%llu",
2401                         st->thrid, (ULong)st->tym);
2402      if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
2403         VG_(strcat)(buf, " ...]");
2404         buf[nBuf-1] = 0;
2405         return;
2406      }
2407      VG_(strcat)(buf, unit);
2408      avail -= VG_(strlen)(unit);
2409   }
2410   VG_(strcat)(buf, "]");
2411   buf[nBuf-1] = 0;
2412}
2413
2414
2415/* Debugging only.  Return vts[index], so to speak.
2416*/
2417ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
2418{
2419   UWord i, n;
2420   ThrID idx_thrid = Thr__to_ThrID(idx);
2421   stats__vts__indexat_slow++;
2422   tl_assert(vts && vts->ts);
2423   n = vts->usedTS;
2424   for (i = 0; i < n; i++) {
2425      ScalarTS* st = &vts->ts[i];
2426      if (st->thrid == idx_thrid)
2427         return st->tym;
2428   }
2429   return 0;
2430}
2431
2432
2433/* See comment on prototype above.
2434*/
2435static void VTS__declare_thread_very_dead ( Thr* thr )
2436{
2437   if (0) VG_(printf)("VTQ:  tae %p\n", thr);
2438
2439   tl_assert(thr->llexit_done);
2440   tl_assert(thr->joinedwith_done);
2441
2442   ThrID nyu;
2443   nyu = Thr__to_ThrID(thr);
2444   VG_(addToXA)( verydead_thread_table, &nyu );
2445
2446   /* We can only get here if we're assured that we'll never again
2447      need to look at this thread's ::viR or ::viW.  Set them to
2448      VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
2449      mostly so that we don't wind up pruning them (as that would be
2450      nonsensical: the only interesting ScalarTS entry for a dead
2451      thread is its own index, and the pruning will remove that). */
2452   VtsID__rcdec(thr->viR);
2453   VtsID__rcdec(thr->viW);
2454   thr->viR = VtsID_INVALID;
2455   thr->viW = VtsID_INVALID;
2456}
2457
2458
2459/////////////////////////////////////////////////////////////////
2460/////////////////////////////////////////////////////////////////
2461//                                                             //
2462// SECTION END vts primitives                                  //
2463//                                                             //
2464/////////////////////////////////////////////////////////////////
2465/////////////////////////////////////////////////////////////////
2466
2467
2468
2469/////////////////////////////////////////////////////////////////
2470/////////////////////////////////////////////////////////////////
2471//                                                             //
2472// SECTION BEGIN main library                                  //
2473//                                                             //
2474/////////////////////////////////////////////////////////////////
2475/////////////////////////////////////////////////////////////////
2476
2477
2478/////////////////////////////////////////////////////////
2479//                                                     //
2480// VTS set                                             //
2481//                                                     //
2482/////////////////////////////////////////////////////////
2483
2484static WordFM* /* WordFM VTS* void */ vts_set = NULL;
2485
2486static void vts_set_init ( void )
2487{
2488   tl_assert(!vts_set);
2489   vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
2490                         HG_(free),
2491                         (Word(*)(UWord,UWord))VTS__cmp_structural );
2492   tl_assert(vts_set);
2493}
2494
2495/* Given a VTS, look in vts_set to see if we already have a
2496   structurally identical one.  If yes, return the pair (True, pointer
2497   to the existing one).  If no, clone this one, add the clone to the
2498   set, and return (False, pointer to the clone). */
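/* In effect this interns VTSs: every VTS held in vts_set is
   structurally unique, so two VTSs that reach this point with equal
   contents end up sharing the same stored copy (and hence, via
   vts_tab, the same VtsID). */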
2499static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
2500{
2501   UWord keyW, valW;
2502   stats__vts_set__focaa++;
2503   tl_assert(cand->id == VtsID_INVALID);
2504   /* lookup cand (by value) */
2505   if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
2506      /* found it */
2507      tl_assert(valW == 0);
2508      /* if this fails, cand (by ref) was already present (!) */
2509      tl_assert(keyW != (UWord)cand);
2510      *res = (VTS*)keyW;
2511      return True;
2512   } else {
2513      /* not present.  Clone, add and return address of clone. */
2514      stats__vts_set__focaa_a++;
2515      VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
2516      tl_assert(clone != cand);
2517      VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
2518      *res = clone;
2519      return False;
2520   }
2521}
2522
2523
2524/////////////////////////////////////////////////////////
2525//                                                     //
2526// VTS table                                           //
2527//                                                     //
2528/////////////////////////////////////////////////////////
2529
2530static void VtsID__invalidate_caches ( void ); /* fwds */
2531
2532/* A type to hold VTS table entries.  Invariants:
2533   If .vts == NULL, then this entry is not in use, so:
2534   - .rc == 0
2535   - this entry is on the freelist (unfortunately, does not imply
2536     any constraints on value for .freelink)
2537   If .vts != NULL, then this entry is in use:
2538   - .vts is findable in vts_set
2539   - .vts->id == this entry number
2540   - no specific value for .rc (even 0 is OK)
2541   - this entry is not on freelist, so .freelink == VtsID_INVALID
2542*/
2543typedef
2544   struct {
2545      VTS*  vts;      /* vts, in vts_set */
2546      UWord rc;       /* reference count - enough for entire aspace */
2547      VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
2548      VtsID remap;    /* used only during pruning */
2549   }
2550   VtsTE;
2551
2552/* The VTS table. */
2553static XArray* /* of VtsTE */ vts_tab = NULL;
2554
2555/* An index into the VTS table, indicating the start of the list of
2556   free (available for use) entries.  If the list is empty, this is
2557   VtsID_INVALID. */
2558static VtsID vts_tab_freelist = VtsID_INVALID;
2559
2560/* Do a GC of vts_tab when the freelist becomes empty AND the size of
2561   vts_tab equals or exceeds this size.  After GC, the value here is
2562   set appropriately so as to check for the next GC point. */
2563static Word vts_next_GC_at = 1000;
2564
2565static void vts_tab_init ( void )
2566{
2567   vts_tab
2568      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
2569                    HG_(free), sizeof(VtsTE) );
2570   vts_tab_freelist
2571      = VtsID_INVALID;
2572   tl_assert(vts_tab);
2573}
2574
2575/* Add ii to the free list, checking that it looks out-of-use. */
2576static void add_to_free_list ( VtsID ii )
2577{
2578   VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2579   tl_assert(ie->vts == NULL);
2580   tl_assert(ie->rc == 0);
2581   tl_assert(ie->freelink == VtsID_INVALID);
2582   ie->freelink = vts_tab_freelist;
2583   vts_tab_freelist = ii;
2584}
2585
2586/* Get an entry from the free list.  This will return VtsID_INVALID if
2587   the free list is empty. */
2588static VtsID get_from_free_list ( void )
2589{
2590   VtsID  ii;
2591   VtsTE* ie;
2592   if (vts_tab_freelist == VtsID_INVALID)
2593      return VtsID_INVALID;
2594   ii = vts_tab_freelist;
2595   ie = VG_(indexXA)( vts_tab, ii );
2596   tl_assert(ie->vts == NULL);
2597   tl_assert(ie->rc == 0);
2598   vts_tab_freelist = ie->freelink;
2599   return ii;
2600}
2601
2602/* Produce a new VtsID that can be used, either by getting it from
2603   the freelist, or, if that is empty, by expanding vts_tab. */
2604static VtsID get_new_VtsID ( void )
2605{
2606   VtsID ii;
2607   VtsTE te;
2608   ii = get_from_free_list();
2609   if (ii != VtsID_INVALID)
2610      return ii;
2611   te.vts = NULL;
2612   te.rc = 0;
2613   te.freelink = VtsID_INVALID;
2614   te.remap    = VtsID_INVALID;
2615   ii = (VtsID)VG_(addToXA)( vts_tab, &te );
2616   return ii;
2617}
2618
2619
2620/* Indirect callback from lib_zsm. */
2621static void VtsID__rcinc ( VtsID ii )
2622{
2623   VtsTE* ie;
2624   /* VG_(indexXA) does a range check for us */
2625   ie = VG_(indexXA)( vts_tab, ii );
2626   tl_assert(ie->vts); /* else it's not in use */
2627   tl_assert(ie->rc < ~0UL); /* else we can't continue */
2628   tl_assert(ie->vts->id == ii);
2629   ie->rc++;
2630}
2631
2632/* Indirect callback from lib_zsm. */
2633static void VtsID__rcdec ( VtsID ii )
2634{
2635   VtsTE* ie;
2636   /* VG_(indexXA) does a range check for us */
2637   ie = VG_(indexXA)( vts_tab, ii );
2638   tl_assert(ie->vts); /* else it's not in use */
2639   tl_assert(ie->rc > 0); /* else RC snafu */
2640   tl_assert(ie->vts->id == ii);
2641   ie->rc--;
2642}
2643
2644
2645/* Look up 'cand' in our collection of VTSs.  If present, return the
2646   VtsID for the pre-existing version.  If not present, clone it, add
2647   the clone to both vts_tab and vts_set, allocate a fresh VtsID for
2648   it, and return that. */
2649static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
2650{
2651   VTS* in_tab = NULL;
2652   tl_assert(cand->id == VtsID_INVALID);
2653   Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
2654   tl_assert(in_tab);
2655   if (already_have) {
2656      /* We already have a copy of 'cand'.  Use that. */
2657      VtsTE* ie;
2658      tl_assert(in_tab->id != VtsID_INVALID);
2659      ie = VG_(indexXA)( vts_tab, in_tab->id );
2660      tl_assert(ie->vts == in_tab);
2661      return in_tab->id;
2662   } else {
2663      VtsID  ii = get_new_VtsID();
2664      VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2665      ie->vts = in_tab;
2666      ie->rc = 0;
2667      ie->freelink = VtsID_INVALID;
2668      in_tab->id = ii;
2669      return ii;
2670   }
2671}
2672
2673
2674static void show_vts_stats ( const HChar* caller )
2675{
2676   UWord nSet, nTab, nLive;
2677   ULong totrc;
2678   UWord n, i;
2679   nSet = VG_(sizeFM)( vts_set );
2680   nTab = VG_(sizeXA)( vts_tab );
2681   totrc = 0;
2682   nLive = 0;
2683   n = VG_(sizeXA)( vts_tab );
2684   for (i = 0; i < n; i++) {
2685      VtsTE* ie = VG_(indexXA)( vts_tab, i );
2686      if (ie->vts) {
2687         nLive++;
2688         totrc += (ULong)ie->rc;
2689      } else {
2690         tl_assert(ie->rc == 0);
2691      }
2692   }
2693   VG_(printf)("  show_vts_stats %s\n", caller);
2694   VG_(printf)("    vts_tab size %4lu\n", nTab);
2695   VG_(printf)("    vts_tab live %4lu\n", nLive);
2696   VG_(printf)("    vts_set size %4lu\n", nSet);
2697   VG_(printf)("        total rc %4llu\n", totrc);
2698}
2699
2700
2701/* --- Helpers for VtsID pruning --- */
2702
2703static
2704void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2705                   /*MOD*/XArray* /* of VtsTE */ new_tab,
2706                   VtsID* ii )
2707{
2708   VtsTE *old_te, *new_te;
2709   VtsID old_id, new_id;
2710   /* We're relying here on VG_(indexXA)'s range checking to assert on
2711      any stupid values, in particular *ii == VtsID_INVALID. */
2712   old_id = *ii;
2713   old_te = VG_(indexXA)( old_tab, old_id );
2714   old_te->rc--;
2715   new_id = old_te->remap;
2716   new_te = VG_(indexXA)( new_tab, new_id );
2717   new_te->rc++;
2718   *ii = new_id;
2719}
2720
2721static
2722void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2723                            /*MOD*/XArray* /* of VtsTE */ new_tab,
2724                            SVal* s )
2725{
2726   SVal old_sv, new_sv;
2727   old_sv = *s;
2728   if (SVal__isC(old_sv)) {
2729      VtsID rMin, wMin;
2730      rMin = SVal__unC_Rmin(old_sv);
2731      wMin = SVal__unC_Wmin(old_sv);
2732      remap_VtsID( old_tab, new_tab, &rMin );
2733      remap_VtsID( old_tab, new_tab, &wMin );
2734      new_sv = SVal__mkC( rMin, wMin );
2735      *s = new_sv;
2736  }
2737}
2738
2739
2740/* NOT TO BE CALLED FROM WITHIN libzsm. */
2741__attribute__((noinline))
2742static void vts_tab__do_GC ( Bool show_stats )
2743{
2744   UWord i, nTab, nLive, nFreed;
2745
2746   /* ---------- BEGIN VTS GC ---------- */
2747   /* check this is actually necessary. */
2748   tl_assert(vts_tab_freelist == VtsID_INVALID);
2749
2750   /* empty the caches for partial order checks and binary joins.  We
2751      could do better and prune out the entries to be deleted, but it
2752      ain't worth the hassle. */
2753   VtsID__invalidate_caches();
2754
2755   /* First, make the reference counts up to date. */
2756   zsm_flush_cache();
2757
2758   nTab = VG_(sizeXA)( vts_tab );
2759
2760   if (show_stats) {
2761      VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
2762      show_vts_stats("before GC");
2763   }
2764
2765   /* Now we can inspect the entire vts_tab.  Any entries with zero
2766      .rc fields are now no longer in use and can be put back on the
2767      free list, removed from vts_set, and deleted. */
2768   nFreed = 0;
2769   for (i = 0; i < nTab; i++) {
2770      Bool present;
2771      UWord oldK = 0, oldV = 12345;
2772      VtsTE* te = VG_(indexXA)( vts_tab, i );
2773      if (te->vts == NULL) {
2774         tl_assert(te->rc == 0);
2775         continue; /* already on the free list (presumably) */
2776      }
2777      if (te->rc > 0)
2778         continue; /* in use */
2779      /* Ok, we got one we can free. */
2780      tl_assert(te->vts->id == i);
2781      /* first, remove it from vts_set. */
2782      present = VG_(delFromFM)( vts_set,
2783                                &oldK, &oldV, (UWord)te->vts );
2784      tl_assert(present); /* else it isn't in vts_set ?! */
2785      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
2786      tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
2787      /* now free the VTS itself */
2788      VTS__delete(te->vts);
2789      te->vts = NULL;
2790      /* and finally put this entry on the free list */
2791      tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
2792      add_to_free_list( i );
2793      nFreed++;
2794   }
2795
2796   /* Now figure out when the next GC should be.  We'll allow the
2797      number of VTSs to double before GCing again.  Except of course
2798      that since we can't (or, at least, don't) shrink vts_tab, we
2799      can't set the threshold value smaller than it. */
2800   tl_assert(nFreed <= nTab);
2801   nLive = nTab - nFreed;
2802   tl_assert(nLive >= 0 && nLive <= nTab);
2803   vts_next_GC_at = 2 * nLive;
2804   if (vts_next_GC_at < nTab)
2805      vts_next_GC_at = nTab;
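   /* E.g. with nTab == 1000 and nFreed == 300, nLive is 700 and the
      next GC is scheduled once vts_tab reaches 1400 entries; had more
      than half the entries been freed, the threshold would be clamped
      up to the current table size of 1000. */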
2806
2807   if (show_stats) {
2808      show_vts_stats("after GC");
2809      VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
2810   }
2811
2812   if (VG_(clo_stats)) {
2813      static UInt ctr = 1;
2814      tl_assert(nTab > 0);
2815      VG_(message)(Vg_DebugMsg,
2816                  "libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
2817                  ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
2818   }
2819   /* ---------- END VTS GC ---------- */
2820
2821   /* Decide whether to do VTS pruning.  We have one of three
2822      settings. */
2823   static UInt pruning_auto_ctr = 0; /* do not make non-static */
2824
2825   Bool do_pruning = False;
2826   switch (HG_(clo_vts_pruning)) {
2827      case 0: /* never */
2828         break;
2829      case 1: /* auto */
2830         do_pruning = (++pruning_auto_ctr % 5) == 0;
2831         break;
2832      case 2: /* always */
2833         do_pruning = True;
2834         break;
2835      default:
2836         tl_assert(0);
2837   }
2838
2839   /* The rest of this routine only handles pruning, so we can
2840      quit at this point if it is not to be done. */
2841   if (!do_pruning)
2842      return;
2843
2844   /* ---------- BEGIN VTS PRUNING ---------- */
2845   /* We begin by sorting the table of very-dead ThrIDs, so as
2846      to (1) check they are unique [else something has gone wrong,
2847      since it means we must have seen some Thr* exiting more than
2848      once, which can't happen], and (2) so that we can quickly look
2849      up the dead-thread entries as we work through the VTSs. */
2850   VG_(sortXA)( verydead_thread_table );
2851   /* Sanity check: check that the ThrID values are unique. */
2852   UWord nBT = VG_(sizeXA)( verydead_thread_table );
2853   if (nBT > 0) {
2854      ThrID thrid1, thrid2;
2855      thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
2856      for (i = 1; i < nBT; i++) {
2857         thrid1 = thrid2;
2858         thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
2859         tl_assert(thrid1 < thrid2);
2860      }
2861   }
2862   /* Ok, so the dead thread table has unique and in-order keys. */
2863
2864   /* We will run through the old table, and create a new table and
2865      set, at the same time setting the .remap entries in the old
2866      table to point to the new entries.  Then, visit every VtsID in
2867      the system, and replace all of them with new ones, using the
2868      .remap entries in the old table.  Finally, we can delete the old
2869      table and set. */
2870
2871   XArray* /* of VtsTE */ new_tab
2872      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
2873                    HG_(free), sizeof(VtsTE) );
2874
2875   /* WordFM VTS* void */
2876   WordFM* new_set
2877      = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
2878                    HG_(free),
2879                    (Word(*)(UWord,UWord))VTS__cmp_structural );
2880
2881   /* Visit each old VTS.  For each one:
2882
2883      * make a pruned version
2884
2885      * search new_set for the pruned version, yielding either
2886        Nothing (not present) or the new VtsID for it.
2887
2888      * if not present, allocate a new VtsID for it, insert (pruned
2889        VTS, new VtsID) in the tree, and set
2890        remap_table[old VtsID] = new VtsID.
2891
2892      * if present, set remap_table[old VtsID] = new VtsID, where
2893        new VtsID was determined by the tree lookup.  Then free up
2894        the clone.
2895   */
2896
2897   UWord nBeforePruning = 0, nAfterPruning = 0;
2898   UWord nSTSsBefore = 0, nSTSsAfter = 0;
2899   VtsID new_VtsID_ctr = 0;
2900
2901   for (i = 0; i < nTab; i++) {
2902
2903      /* For each old VTS .. */
2904      VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
2905      VTS*   old_vts = old_te->vts;
2906      tl_assert(old_te->remap == VtsID_INVALID);
2907
2908      /* Skip it if not in use */
2909      if (old_te->rc == 0) {
2910         tl_assert(old_vts == NULL);
2911         continue;
2912      }
2913      tl_assert(old_vts != NULL);
2914      tl_assert(old_vts->id == i);
2915      tl_assert(old_vts->ts != NULL);
2916
2917      /* It is in use. Make a pruned version. */
2918      nBeforePruning++;
2919      nSTSsBefore += old_vts->usedTS;
2920      VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
2921                                   old_vts, verydead_thread_table);
2922      tl_assert(new_vts->sizeTS == new_vts->usedTS);
2923      tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
2924                == 0x0ddC0ffeeBadF00dULL);
2925
2926      /* Get rid of the old VTS and the tree entry.  It's a bit more
2927         complex to incrementally delete the VTSs now than to nuke
2928         them all after we're done, but the upside is that we don't
2929         wind up temporarily storing potentially two complete copies
2930         of each VTS and hence spiking memory use. */
2931      UWord oldK = 0, oldV = 12345;
2932      Bool  present = VG_(delFromFM)( vts_set,
2933                                      &oldK, &oldV, (UWord)old_vts );
2934      tl_assert(present); /* else it isn't in vts_set ?! */
2935      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
2936      tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
2937      /* now free the VTS itself */
2938      VTS__delete(old_vts);
2939      old_te->vts = NULL;
2940      old_vts = NULL;
2941
2942      /* NO MENTIONS of old_vts allowed beyond this point. */
2943
2944      /* Ok, we have the pruned copy in new_vts.  See if a
2945         structurally identical version is already present in new_set.
2946         If so, delete the one we just made and move on; if not, add
2947         it. */
2948      VTS*  identical_version = NULL;
2949      UWord valW = 12345;
2950      if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
2951                        (UWord)new_vts)) {
2952         // already have it
2953         tl_assert(valW == 0);
2954         tl_assert(identical_version != NULL);
2955         tl_assert(identical_version != new_vts);
2956         VTS__delete(new_vts);
2957         new_vts = identical_version;
2958         tl_assert(new_vts->id != VtsID_INVALID);
2959      } else {
2960         tl_assert(valW == 12345);
2961         tl_assert(identical_version == NULL);
2962         new_vts->id = new_VtsID_ctr++;
2963         Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
2964         tl_assert(!b);
2965         VtsTE new_te;
2966         new_te.vts      = new_vts;
2967         new_te.rc       = 0;
2968         new_te.freelink = VtsID_INVALID;
2969         new_te.remap    = VtsID_INVALID;
2970         Word j = VG_(addToXA)( new_tab, &new_te );
2971         tl_assert(j <= i);
2972         tl_assert(j == new_VtsID_ctr - 1);
2973         // stats
2974         nAfterPruning++;
2975         nSTSsAfter += new_vts->usedTS;
2976      }
2977      old_te->remap = new_vts->id;
2978
2979   } /* for (i = 0; i < nTab; i++) */
2980
2981   /* At this point, we have:
2982      * the old VTS table, with its .remap entries set,
2983        and with all .vts == NULL.
2984      * the old VTS tree should be empty, since it and the old VTSs
2985        it contained have been incrementally deleted as we worked
2986        through the old table.
2987      * the new VTS table, with all .rc == 0, all .freelink and .remap
2988        == VtsID_INVALID.
2989      * the new VTS tree.
2990   */
2991   tl_assert( VG_(sizeFM)(vts_set) == 0 );
2992
2993   /* Now actually apply the mapping. */
2994   /* Visit all the VtsIDs in the entire system.  Where do we expect
2995      to find them?
2996      (a) in shadow memory -- the LineZs and LineFs
2997      (b) in our collection of struct _Thrs.
2998      (c) in our collection of struct _SOs.
2999      Nowhere else, AFAICS.  Not in the zsm cache, because that just
3000      got invalidated.
3001
3002      Using the .remap fields in vts_tab, map each old VtsID to a new
3003      VtsID.  For each old VtsID, dec its rc; and for each new one,
3004      inc it.  This sets up the new refcounts, and it also gives a
3005      cheap sanity check of the old ones: all old refcounts should be
3006      zero after this operation.
3007   */
3008
3009   /* Do the mappings for (a) above: iterate over the Primary shadow
3010      mem map (WordFM Addr SecMap*). */
3011   UWord secmapW = 0;
3012   VG_(initIterFM)( map_shmem );
3013   while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
3014      UWord   j;
3015      SecMap* sm = (SecMap*)secmapW;
3016      tl_assert(sm->magic == SecMap_MAGIC);
3017      /* Deal with the LineZs */
3018      for (i = 0; i < N_SECMAP_ZLINES; i++) {
3019         LineZ* lineZ = &sm->linesZ[i];
3020         if (lineZ->dict[0] == SVal_INVALID)
3021            continue; /* not in use -- data is in F rep instead */
3022         for (j = 0; j < 4; j++)
3023            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
3024      }
3025      /* Deal with the LineFs */
3026      for (i = 0; i < sm->linesF_size; i++) {
3027         LineF* lineF = &sm->linesF[i];
3028         if (!lineF->inUse)
3029            continue;
3030         for (j = 0; j < N_LINE_ARANGE; j++)
3031            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
3032      }
3033   }
3034   VG_(doneIterFM)( map_shmem );
3035
3036   /* Do the mappings for (b) above: visit our collection of struct
3037      _Thrs. */
3038   Thread* hgthread = get_admin_threads();
3039   tl_assert(hgthread);
3040   while (hgthread) {
3041      Thr* hbthr = hgthread->hbthr;
3042      tl_assert(hbthr);
3043      /* Threads that are listed in the prunable set have their viR
3044         and viW set to VtsID_INVALID, so we can't mess with them. */
3045      if (hbthr->llexit_done && hbthr->joinedwith_done) {
3046         tl_assert(hbthr->viR == VtsID_INVALID);
3047         tl_assert(hbthr->viW == VtsID_INVALID);
3048         hgthread = hgthread->admin;
3049         continue;
3050      }
3051      remap_VtsID( vts_tab, new_tab, &hbthr->viR );
3052      remap_VtsID( vts_tab, new_tab, &hbthr->viW );
3053      hgthread = hgthread->admin;
3054   }
3055
3056   /* Do the mappings for (c) above: visit the struct _SOs. */
3057   SO* so = admin_SO;
3058   while (so) {
3059      if (so->viR != VtsID_INVALID)
3060         remap_VtsID( vts_tab, new_tab, &so->viR );
3061      if (so->viW != VtsID_INVALID)
3062         remap_VtsID( vts_tab, new_tab, &so->viW );
3063      so = so->admin_next;
3064   }
3065
3066   /* So, we're nearly done (with this incredibly complex operation).
3067      Check the refcounts for the old VtsIDs all fell to zero, as
3068      expected.  Any failure is serious. */
3069   for (i = 0; i < nTab; i++) {
3070      VtsTE* te = VG_(indexXA)( vts_tab, i );
3071      tl_assert(te->vts == NULL);
3072      /* This is the assert proper.  Note we're also asserting
3073         zeroness for old entries which are unmapped (hence have
3074         .remap == VtsID_INVALID).  That's OK. */
3075      tl_assert(te->rc == 0);
3076   }
3077
3078   /* Install the new table and set. */
3079   VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
3080   vts_set = new_set;
3081   VG_(deleteXA)( vts_tab );
3082   vts_tab = new_tab;
3083
3084   /* The freelist of vts_tab entries is empty now, because we've
3085      compacted all of the live entries at the low end of the
3086      table. */
3087   vts_tab_freelist = VtsID_INVALID;
3088
3089   /* Sanity check vts_set and vts_tab. */
3090
3091   /* Because all the live entries got slid down to the bottom of vts_tab: */
3092   tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
3093
3094   /* Assert that the vts_tab and vts_set entries point at each other
3095      in the required way */
3096   UWord wordK = 0, wordV = 0;
3097   VG_(initIterFM)( vts_set );
3098   while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
3099      tl_assert(wordK != 0);
3100      tl_assert(wordV == 0);
3101      VTS* vts = (VTS*)wordK;
3102      tl_assert(vts->id != VtsID_INVALID);
3103      VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
3104      tl_assert(te->vts == vts);
3105   }
3106   VG_(doneIterFM)( vts_set );
3107
3108   /* Also iterate over the table, and check each entry is
3109      plausible. */
3110   nTab = VG_(sizeXA)( vts_tab );
3111   for (i = 0; i < nTab; i++) {
3112      VtsTE* te = VG_(indexXA)( vts_tab, i );
3113      tl_assert(te->vts);
3114      tl_assert(te->vts->id == i);
3115      tl_assert(te->rc > 0); /* 'cos we just GC'd */
3116      tl_assert(te->freelink == VtsID_INVALID); /* in use */
3117      tl_assert(te->remap == VtsID_INVALID); /* not relevant */
3118   }
3119
3120   /* And we're done.  Bwahahaha. Ha. Ha. Ha. */
3121   if (VG_(clo_stats)) {
3122      static UInt ctr = 1;
3123      tl_assert(nTab > 0);
3124      VG_(message)(
3125         Vg_DebugMsg,
3126         "libhb: VTS PR: #%u  before %lu (avg sz %lu)  "
3127            "after %lu (avg sz %lu)\n",
3128         ctr++,
3129         nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
3130         nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
3131      );
3132   }
3133   if (0)
3134   VG_(printf)("VTQ: before pruning %lu (avg sz %lu), "
3135               "after pruning %lu (avg sz %lu)\n",
3136               nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
3137               nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1));
3138   /* ---------- END VTS PRUNING ---------- */
3139}
3140
3141
3142/////////////////////////////////////////////////////////
3143//                                                     //
3144// Vts IDs                                             //
3145//                                                     //
3146/////////////////////////////////////////////////////////
3147
3148//////////////////////////
3149/* A temporary, max-sized VTS, used as scratch space (the first
3150   argument) in VTS__singleton, VTS__tick and VTS__join operations. */
3151static VTS* temp_max_sized_VTS = NULL;
3152
3153//////////////////////////
3154static ULong stats__cmpLEQ_queries = 0;
3155static ULong stats__cmpLEQ_misses  = 0;
3156static ULong stats__join2_queries  = 0;
3157static ULong stats__join2_misses   = 0;
3158
3159static inline UInt ROL32 ( UInt w, Int n ) {
3160   w = (w << n) | (w >> (32-n));
3161   return w;
3162}
3163static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
3164   UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
3165   return hash % nTab;
3166}
3167
3168#define N_CMPLEQ_CACHE 1023
3169static
3170   struct { VtsID vi1; VtsID vi2; Bool leq; }
3171   cmpLEQ_cache[N_CMPLEQ_CACHE];
3172
3173#define N_JOIN2_CACHE 1023
3174static
3175   struct { VtsID vi1; VtsID vi2; VtsID res; }
3176   join2_cache[N_JOIN2_CACHE];
3177
3178static void VtsID__invalidate_caches ( void ) {
3179   Int i;
3180   for (i = 0; i < N_CMPLEQ_CACHE; i++) {
3181      cmpLEQ_cache[i].vi1 = VtsID_INVALID;
3182      cmpLEQ_cache[i].vi2 = VtsID_INVALID;
3183      cmpLEQ_cache[i].leq = False;
3184   }
3185   for (i = 0; i < N_JOIN2_CACHE; i++) {
3186     join2_cache[i].vi1 = VtsID_INVALID;
3187     join2_cache[i].vi2 = VtsID_INVALID;
3188     join2_cache[i].res = VtsID_INVALID;
3189   }
3190}
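
/* Note (descriptive only): both caches are keyed by VtsID, so their
   contents become meaningless whenever VtsIDs are renumbered or
   recycled, as happens for instance in the VTS pruning machinery
   above; hence the need for this flush routine. */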
3191//////////////////////////
3192
3193//static Bool VtsID__is_valid ( VtsID vi ) {
3194//   VtsTE* ve;
3195//   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
3196//      return False;
3197//   ve = VG_(indexXA)( vts_tab, vi );
3198//   if (!ve->vts)
3199//      return False;
3200//   tl_assert(ve->vts->id == vi);
3201//   return True;
3202//}
3203
3204static VTS* VtsID__to_VTS ( VtsID vi ) {
3205   VtsTE* te = VG_(indexXA)( vts_tab, vi );
3206   tl_assert(te->vts);
3207   return te->vts;
3208}
3209
3210static void VtsID__pp ( VtsID vi ) {
3211   HChar buf[100];
3212   VTS* vts = VtsID__to_VTS(vi);
3213   VTS__show( buf, sizeof(buf)-1, vts );
3214   buf[sizeof(buf)-1] = 0;
3215   VG_(printf)("%s", buf);
3216}
3217
3218/* compute partial ordering relation of vi1 and vi2. */
3219__attribute__((noinline))
3220static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
3221   UInt hash;
3222   Bool leq;
3223   VTS  *v1, *v2;
3224   //if (vi1 == vi2) return True;
3225   tl_assert(vi1 != vi2);
3226   ////++
3227   stats__cmpLEQ_queries++;
3228   hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
3229   if (cmpLEQ_cache[hash].vi1 == vi1
3230       && cmpLEQ_cache[hash].vi2 == vi2)
3231      return cmpLEQ_cache[hash].leq;
3232   stats__cmpLEQ_misses++;
3233   ////--
3234   v1  = VtsID__to_VTS(vi1);
3235   v2  = VtsID__to_VTS(vi2);
3236   leq = VTS__cmpLEQ( v1, v2 ) == 0;
3237   ////++
3238   cmpLEQ_cache[hash].vi1 = vi1;
3239   cmpLEQ_cache[hash].vi2 = vi2;
3240   cmpLEQ_cache[hash].leq = leq;
3241   ////--
3242   return leq;
3243}
3244static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
3245   return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
3246}
3247
3248/* compute binary join */
3249__attribute__((noinline))
3250static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
3251   UInt  hash;
3252   VtsID res;
3253   VTS   *vts1, *vts2;
3254   //if (vi1 == vi2) return vi1;
3255   tl_assert(vi1 != vi2);
3256   ////++
3257   stats__join2_queries++;
3258   hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
3259   if (join2_cache[hash].vi1 == vi1
3260       && join2_cache[hash].vi2 == vi2)
3261      return join2_cache[hash].res;
3262   stats__join2_misses++;
3263   ////--
3264   vts1 = VtsID__to_VTS(vi1);
3265   vts2 = VtsID__to_VTS(vi2);
3266   temp_max_sized_VTS->usedTS = 0;
3267   VTS__join(temp_max_sized_VTS, vts1,vts2);
3268   res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3269   ////++
3270   join2_cache[hash].vi1 = vi1;
3271   join2_cache[hash].vi2 = vi2;
3272   join2_cache[hash].res = res;
3273   ////--
3274   return res;
3275}
3276static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
3277   return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
3278}
3279
3280/* create a singleton VTS, namely [thr:1] */
3281static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
3282   temp_max_sized_VTS->usedTS = 0;
3283   VTS__singleton(temp_max_sized_VTS, thr,tym);
3284   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3285}
3286
3287/* tick operation, creates value 1 if specified index is absent */
3288static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
3289   VTS* vts = VtsID__to_VTS(vi);
3290   temp_max_sized_VTS->usedTS = 0;
3291   VTS__tick(temp_max_sized_VTS, idx,vts);
3292   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3293}
3294
3295/* index into a VTS (only for assertions) */
3296static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
3297   VTS* vts = VtsID__to_VTS(vi);
3298   return VTS__indexAt_SLOW( vts, idx );
3299}
3300
3301/* Assuming that !cmpLEQ(vi1, vi2), find the Thr which owns the first
3302   (or any, really) element in vi1 which is pointwise greater-than the
3303   corresponding element in vi2.  Given that assumption, such an
3304   element must exist.  This needs to be fairly quick since it is
3305   called every time a race is detected. */
3306static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
3307{
3308   VTS  *vts1, *vts2;
3309   Thr*  diffthr;
3310   ThrID diffthrid;
3311   tl_assert(vi1 != vi2);
3312   vts1 = VtsID__to_VTS(vi1);
3313   vts2 = VtsID__to_VTS(vi2);
3314   tl_assert(vts1 != vts2);
3315   diffthrid = VTS__cmpLEQ(vts1, vts2);
3316   diffthr = Thr__from_ThrID(diffthrid);
3317   tl_assert(diffthr); /* else they are LEQ ! */
3318   return diffthr;
3319}
3320
3321
3322/////////////////////////////////////////////////////////
3323//                                                     //
3324// Filters                                             //
3325//                                                     //
3326/////////////////////////////////////////////////////////
3327
3328/* Forget everything we know -- clear the filter and let everything
3329   through.  This needs to be as fast as possible, since it is called
3330   every time the running thread changes, and every time a thread's
3331   vector clocks change, which can be quite frequent.  The obvious
3332   fast way to do this is simply to stuff in tags which we know are
3333   not going to match anything, since they're not aligned to the start
3334   of a line. */
3335static void Filter__clear ( Filter* fi, const HChar* who )
3336{
3337   UWord i;
3338   if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
3339   for (i = 0; i < FI_NUM_LINES; i += 8) {
3340      fi->tags[i+0] = 1; /* impossible value -- cannot match */
3341      fi->tags[i+1] = 1;
3342      fi->tags[i+2] = 1;
3343      fi->tags[i+3] = 1;
3344      fi->tags[i+4] = 1;
3345      fi->tags[i+5] = 1;
3346      fi->tags[i+6] = 1;
3347      fi->tags[i+7] = 1;
3348   }
3349   tl_assert(i == FI_NUM_LINES);
3350}
3351
3352/* Clearing an arbitrary range in the filter.  Unfortunately
3353   we have to do this due to core-supplied new/die-mem events. */
3354
3355static void Filter__clear_1byte ( Filter* fi, Addr a )
3356{
3357   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3358   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3359   FiLine* line   = &fi->lines[lineno];
3360   UWord   loff   = (a - atag) / 8;
3361   UShort  mask   = 0x3 << (2 * (a & 7));
3362   /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3363   if (LIKELY( fi->tags[lineno] == atag )) {
3364      /* hit.  clear the bits. */
3365      UShort  u16  = line->u16s[loff];
3366      line->u16s[loff] = u16 & ~mask; /* clear them */
3367   } else {
3368      /* miss.  The filter doesn't hold this address, so ignore. */
3369   }
3370}
3371
3372static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
3373{
3374   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3375   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3376   FiLine* line   = &fi->lines[lineno];
3377   UWord   loff   = (a - atag) / 8;
3378   if (LIKELY( fi->tags[lineno] == atag )) {
3379      line->u16s[loff] = 0;
3380   } else {
3381      /* miss.  The filter doesn't hold this address, so ignore. */
3382   }
3383}
3384
3385static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
3386{
3387  //VG_(printf)("%lu ", len);
3388   /* slowly do part preceding 8-alignment */
3389   while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
3390      Filter__clear_1byte( fi, a );
3391      a++;
3392      len--;
3393   }
3394   /* vector loop */
3395   while (len >= 8) {
3396      Filter__clear_8bytes_aligned( fi, a );
3397      a += 8;
3398      len -= 8;
3399   }
3400   /* slowly do tail */
3401   while (UNLIKELY(len > 0)) {
3402      Filter__clear_1byte( fi, a );
3403      a++;
3404      len--;
3405   }
3406}
3407
3408
3409/* ------ Read handlers for the filter. ------ */
3410
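/* As far as can be inferred from the mask constants used in the handlers
   below: each 8-byte word of a filter line is covered by one UShort, in
   which byte k (k == a & 7) owns bits 2k+1 and 2k.  A read of byte k sets
   bit 2k+1, a write sets both bits, and an access may be skipped only when
   every bit covered by its mask is already set.  The following is a
   minimal, purely illustrative sketch (not used anywhere) showing how the
   per-access masks can be derived under that assumption. */
#if 0
static UShort example_filter_mask ( Addr a, UWord szB, Bool isW )
{
   /* per-byte pattern: 0x2 == read bit only, 0x3 == read and write bits */
   UShort perByte = isW ? 0x3 : 0x2;
   UShort mask    = 0;
   UWord  k;
   for (k = 0; k < szB; k++)
      mask |= perByte << (2 * ((a + k) & 7));
   /* e.g. szB==8 && !isW gives 0xAAAA, and szB==4 && isW && (a & 7) == 4
      gives 0xFF00, matching the constants used below. */
   return mask;
}
#endif
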
3411static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
3412{
3413   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3414      return False;
3415   {
3416     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3417     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3418     FiLine* line   = &fi->lines[lineno];
3419     UWord   loff   = (a - atag) / 8;
3420     UShort  mask   = 0xAAAA;
3421     if (LIKELY( fi->tags[lineno] == atag )) {
3422        /* hit.  check line and update. */
3423        UShort u16  = line->u16s[loff];
3424        Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
3425        line->u16s[loff] = u16 | mask; /* set them */
3426        return ok;
3427     } else {
3428        /* miss.  nuke existing line and re-use it. */
3429        UWord i;
3430        fi->tags[lineno] = atag;
3431        for (i = 0; i < FI_LINE_SZB / 8; i++)
3432           line->u16s[i] = 0;
3433        line->u16s[loff] = mask;
3434        return False;
3435     }
3436   }
3437}
3438
3439static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
3440{
3441   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3442      return False;
3443   {
3444     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3445     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3446     FiLine* line   = &fi->lines[lineno];
3447     UWord   loff   = (a - atag) / 8;
3448     UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
3449     if (LIKELY( fi->tags[lineno] == atag )) {
3450        /* hit.  check line and update. */
3451        UShort  u16  = line->u16s[loff];
3452        Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
3453        line->u16s[loff] = u16 | mask; /* set them */
3454        return ok;
3455     } else {
3456        /* miss.  nuke existing line and re-use it. */
3457        UWord   i;
3458        fi->tags[lineno] = atag;
3459        for (i = 0; i < FI_LINE_SZB / 8; i++)
3460           line->u16s[i] = 0;
3461        line->u16s[loff] = mask;
3462        return False;
3463     }
3464   }
3465}
3466
3467static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
3468{
3469   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3470      return False;
3471   {
3472     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3473     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3474     FiLine* line   = &fi->lines[lineno];
3475     UWord   loff   = (a - atag) / 8;
3476     UShort  mask   = 0xA << (2 * (a & 6));
3477     /* mask is A000, 0A00, 00A0 or 000A */
3478     if (LIKELY( fi->tags[lineno] == atag )) {
3479        /* hit.  check line and update. */
3480        UShort  u16  = line->u16s[loff];
3481        Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
3482        line->u16s[loff] = u16 | mask; /* set them */
3483        return ok;
3484     } else {
3485        /* miss.  nuke existing line and re-use it. */
3486        UWord   i;
3487        fi->tags[lineno] = atag;
3488        for (i = 0; i < FI_LINE_SZB / 8; i++)
3489           line->u16s[i] = 0;
3490        line->u16s[loff] = mask;
3491        return False;
3492     }
3493   }
3494}
3495
3496static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
3497{
3498   {
3499     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3500     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3501     FiLine* line   = &fi->lines[lineno];
3502     UWord   loff   = (a - atag) / 8;
3503     UShort  mask   = 0x2 << (2 * (a & 7));
3504     /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
3505     if (LIKELY( fi->tags[lineno] == atag )) {
3506        /* hit.  check line and update. */
3507        UShort  u16  = line->u16s[loff];
3508        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
3509        line->u16s[loff] = u16 | mask; /* set them */
3510        return ok;
3511     } else {
3512        /* miss.  nuke existing line and re-use it. */
3513        UWord   i;
3514        fi->tags[lineno] = atag;
3515        for (i = 0; i < FI_LINE_SZB / 8; i++)
3516           line->u16s[i] = 0;
3517        line->u16s[loff] = mask;
3518        return False;
3519     }
3520   }
3521}
3522
3523
3524/* ------ Write handlers for the filter. ------ */
3525
3526static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
3527{
3528   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3529      return False;
3530   {
3531     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3532     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3533     FiLine* line   = &fi->lines[lineno];
3534     UWord   loff   = (a - atag) / 8;
3535     UShort  mask   = 0xFFFF;
3536     if (LIKELY( fi->tags[lineno] == atag )) {
3537        /* hit.  check line and update. */
3538        UShort u16  = line->u16s[loff];
3539        Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
3540        line->u16s[loff] = u16 | mask; /* set them */
3541        return ok;
3542     } else {
3543        /* miss.  nuke existing line and re-use it. */
3544        UWord i;
3545        fi->tags[lineno] = atag;
3546        for (i = 0; i < FI_LINE_SZB / 8; i++)
3547           line->u16s[i] = 0;
3548        line->u16s[loff] = mask;
3549        return False;
3550     }
3551   }
3552}
3553
3554static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
3555{
3556   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3557      return False;
3558   {
3559     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3560     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3561     FiLine* line   = &fi->lines[lineno];
3562     UWord   loff   = (a - atag) / 8;
3563     UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
3564     if (LIKELY( fi->tags[lineno] == atag )) {
3565        /* hit.  check line and update. */
3566        UShort  u16  = line->u16s[loff];
3567        Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
3568        line->u16s[loff] = u16 | mask; /* set them */
3569        return ok;
3570     } else {
3571        /* miss.  nuke existing line and re-use it. */
3572        UWord   i;
3573        fi->tags[lineno] = atag;
3574        for (i = 0; i < FI_LINE_SZB / 8; i++)
3575           line->u16s[i] = 0;
3576        line->u16s[loff] = mask;
3577        return False;
3578     }
3579   }
3580}
3581
3582static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
3583{
3584   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3585      return False;
3586   {
3587     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3588     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3589     FiLine* line   = &fi->lines[lineno];
3590     UWord   loff   = (a - atag) / 8;
3591     UShort  mask   = 0xF << (2 * (a & 6));
3592     /* mask is F000, 0F00, 00F0 or 000F */
3593     if (LIKELY( fi->tags[lineno] == atag )) {
3594        /* hit.  check line and update. */
3595        UShort  u16  = line->u16s[loff];
3596        Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
3597        line->u16s[loff] = u16 | mask; /* set them */
3598        return ok;
3599     } else {
3600        /* miss.  nuke existing line and re-use it. */
3601        UWord   i;
3602        fi->tags[lineno] = atag;
3603        for (i = 0; i < FI_LINE_SZB / 8; i++)
3604           line->u16s[i] = 0;
3605        line->u16s[loff] = mask;
3606        return False;
3607     }
3608   }
3609}
3610
3611static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
3612{
3613   {
3614     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3615     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3616     FiLine* line   = &fi->lines[lineno];
3617     UWord   loff   = (a - atag) / 8;
3618     UShort  mask   = 0x3 << (2 * (a & 7));
3619     /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3620     if (LIKELY( fi->tags[lineno] == atag )) {
3621        /* hit.  check line and update. */
3622        UShort  u16  = line->u16s[loff];
3623        Bool    ok   = (u16 & mask) == mask; /* 1 x R & W bits set? */
3624        line->u16s[loff] = u16 | mask; /* set them */
3625        return ok;
3626     } else {
3627        /* miss.  nuke existing line and re-use it. */
3628        UWord   i;
3629        fi->tags[lineno] = atag;
3630        for (i = 0; i < FI_LINE_SZB / 8; i++)
3631           line->u16s[i] = 0;
3632        line->u16s[loff] = mask;
3633        return False;
3634     }
3635   }
3636}
3637
3638
3639/////////////////////////////////////////////////////////
3640//                                                     //
3641// Threads                                             //
3642//                                                     //
3643/////////////////////////////////////////////////////////
3644
3645/* Maps ThrID values to their Thr*s (which contain ThrID values that
3646   should point back to the relevant slot in the array).  Lowest
3647   numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
3648static XArray* /* of Thr* */ thrid_to_thr_map = NULL;
3649
3650/* And a counter to dole out ThrID values.  For rationale/background,
3651   see comments on definition of ScalarTS (far) above. */
3652static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
3653
3654static ThrID Thr__to_ThrID ( Thr* thr ) {
3655   return thr->thrid;
3656}
3657static Thr* Thr__from_ThrID ( UInt thrid ) {
3658   Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
3659   tl_assert(thr->thrid == thrid);
3660   return thr;
3661}
3662
3663static Thr* Thr__new ( void )
3664{
3665   Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
3666   thr->viR = VtsID_INVALID;
3667   thr->viW = VtsID_INVALID;
3668   thr->llexit_done = False;
3669   thr->joinedwith_done = False;
3670   thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
3671   if (HG_(clo_history_level) == 1)
3672      thr->local_Kws_n_stacks
3673         = VG_(newXA)( HG_(zalloc),
3674                       "libhb.Thr__new.3 (local_Kws_and_stacks)",
3675                       HG_(free), sizeof(ULong_n_EC) );
3676
3677   /* Add this Thr* <-> ThrID binding to the mapping, and
3678      cross-check */
3679   if (!thrid_to_thr_map) {
3680      thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
3681                                     HG_(free), sizeof(Thr*) );
3682      tl_assert(thrid_to_thr_map);
3683   }
3684
3685   if (thrid_counter >= ThrID_MAX_VALID) {
3686      /* We're hosed.  We have to stop. */
3687      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
3688   }
3689
3690   thr->thrid = thrid_counter++;
3691   Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
3692   tl_assert(ix + 1024 == thr->thrid);
3693
3694   return thr;
3695}
3696
3697static void note_local_Kw_n_stack_for ( Thr* thr )
3698{
3699   Word       nPresent;
3700   ULong_n_EC pair;
3701   tl_assert(thr);
3702
3703   // We only collect this info at history level 1 (approx)
3704   if (HG_(clo_history_level) != 1)
3705      return;
3706
3707   /* This is the scalar Kw for thr. */
3708   pair.ull = VtsID__indexAt( thr->viW, thr );
3709   pair.ec  = main_get_EC( thr );
3710   tl_assert(pair.ec);
3711   tl_assert(thr->local_Kws_n_stacks);
3712
3713   /* check that we're not adding duplicates */
3714   nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
3715
3716   /* Throw away old stacks, if necessary.  We can't accumulate stuff
3717      indefinitely. */
3718   if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
3719      VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
3720      nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
3721      if (0)
3722         VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
3723                     thr, pair.ull, pair.ec );
3724   }
3725
3726   if (nPresent > 0) {
3727      ULong_n_EC* prevPair
3728         = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
3729      tl_assert( prevPair->ull <= pair.ull );
3730   }
3731
3732   if (nPresent == 0)
3733      pair.ec = NULL;
3734
3735   VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
3736
3737   if (0)
3738      VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
3739                  thr, pair.ull, pair.ec );
3740   if (0)
3741      VG_(pp_ExeContext)(pair.ec);
3742}
3743
3744static Int cmp__ULong_n_EC__by_ULong ( const ULong_n_EC* pair1,
3745                                       const ULong_n_EC* pair2 )
3746{
3747   if (pair1->ull < pair2->ull) return -1;
3748   if (pair1->ull > pair2->ull) return 1;
3749   return 0;
3750}
3751
3752
3753/////////////////////////////////////////////////////////
3754//                                                     //
3755// Shadow Values                                       //
3756//                                                     //
3757/////////////////////////////////////////////////////////
3758
3759// type SVal, SVal_INVALID and SVal_NOACCESS are defined by
3760// hb_zsm.h.  We have to do everything else here.
3761
3762/* SVal is 64 bit unsigned int.
3763
3764      <---------30--------->    <---------30--------->
3765   00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
3766   10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
3767   11 0--------------------0 00 0--------------------0   A: SVal_INVALID
3768
3769*/
3770#define SVAL_TAGMASK (3ULL << 62)
3771
3772static inline Bool SVal__isC ( SVal s ) {
3773   return (0ULL << 62) == (s & SVAL_TAGMASK);
3774}
3775static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
3776   //tl_assert(VtsID__is_valid(rmini));
3777   //tl_assert(VtsID__is_valid(wmini));
3778   return (((ULong)rmini) << 32) | ((ULong)wmini);
3779}
3780static inline VtsID SVal__unC_Rmin ( SVal s ) {
3781   tl_assert(SVal__isC(s));
3782   return (VtsID)(s >> 32);
3783}
3784static inline VtsID SVal__unC_Wmin ( SVal s ) {
3785   tl_assert(SVal__isC(s));
3786   return (VtsID)(s & 0xFFFFFFFFULL);
3787}
3788
3789static inline Bool SVal__isA ( SVal s ) {
3790   return (2ULL << 62) == (s & SVAL_TAGMASK);
3791}
3792static inline SVal SVal__mkA ( void ) {
3793   return 2ULL << 62;
3794}
3795
3796/* Direct callback from lib_zsm. */
3797static void SVal__rcinc ( SVal s ) {
3798   if (SVal__isC(s)) {
3799      VtsID__rcinc( SVal__unC_Rmin(s) );
3800      VtsID__rcinc( SVal__unC_Wmin(s) );
3801   }
3802}
3803
3804/* Direct callback from lib_zsm. */
3805static void SVal__rcdec ( SVal s ) {
3806   if (SVal__isC(s)) {
3807      VtsID__rcdec( SVal__unC_Rmin(s) );
3808      VtsID__rcdec( SVal__unC_Wmin(s) );
3809   }
3810}
3811
3812
3813/////////////////////////////////////////////////////////
3814//                                                     //
3815// Change-event map2                                   //
3816//                                                     //
3817/////////////////////////////////////////////////////////
3818
3819#define EVENT_MAP_GC_DISCARD_FRACTION  0.5
3820
3821/* This is in two parts:
3822
3823   1. A hash table of RCECs.  This is a set of reference-counted stack
3824      traces.  When the reference count of a stack trace becomes zero,
3825      it is removed from the set and freed up.  The intent is to have
3826      a set of stack traces which can be referred to from (2), but to
3827      only represent each one once.  The set is indexed/searched by
3828      ordering on the stack trace vectors.
3829
3830   2. A SparseWA of OldRefs.  These store information about each old
3831      ref that we need to record.  It is indexed by address of the
3832      location for which the information is recorded.  For LRU
3833      purposes, each OldRef also contains a generation number,
3834      indicating when it was most recently accessed.
3835
3836      The important part of an OldRef is, however, its accs[] array.
3837      This is an array of N_OLDREF_ACCS which binds (thread, R/W,
3838      size) triples to RCECs.  This allows us to collect the last
3839      access-traceback by up to N_OLDREF_ACCS different triples for
3840      this location.  The accs[] array is a MTF-array.  If a binding
3841      falls off the end, that's too bad -- we will lose info about
3842      that triple's access to this location.
3843
3844      When the SparseWA becomes too big, we can throw away the OldRefs
3845      whose generation numbers are below some threshold; hence doing
3846      approximate LRU discarding.  For each discarded OldRef we must
3847      of course decrement the reference count on all the RCECs it
3848      refers to, in order that entries from (1) eventually get
3849      discarded too.
3850
3851   A major improvement in reliability of this mechanism would be to
3852   have a dynamically sized OldRef.accs[] array, so no entries ever
3853   fall off the end.  In investigations (Dec 08) it appears that a
3854   major cause of the non-availability of conflicting-access traces
3855   in race reports is the fixed size of this array.  I
3856   suspect for most OldRefs, only a few entries are used, but for a
3857   minority of cases there is an overflow, leading to info lossage.
3858   Investigations also suggest this is very workload and scheduling
3859   sensitive.  Therefore a dynamic sizing would be better.
3860
3861   However, dynamic sizing would defeat the use of a PoolAllocator
3862   for OldRef structures.  And that's important for performance.  So
3863   it's not straightforward to do.
3864*/
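
/* A small worked example of the accs[] update policy implemented by
   event_map_bind() below (descriptive only).  Suppose N_OLDREF_ACCS is 5
   and accs[] currently holds the triples [T1 T2 T3 T4 T5].  A new access
   matching T3's (thrid, R/W, size) triple swaps it one step forward,
   giving [T1 T3 T2 T4 T5], and refreshes its RCEC and lockset.  A new
   access matching none of them shifts everything down one slot and
   installs itself at accs[0], giving [Tnew T1 T2 T3 T4]; T5 falls off
   the end, and the reference count on its RCEC is decremented. */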
3865
3866
3867static UWord stats__ctxt_rcdec1 = 0;
3868static UWord stats__ctxt_rcdec2 = 0;
3869static UWord stats__ctxt_rcdec3 = 0;
3870static UWord stats__ctxt_rcdec_calls = 0;
3871static UWord stats__ctxt_rcdec_discards = 0;
3872static UWord stats__ctxt_rcdec1_eq = 0;
3873
3874static UWord stats__ctxt_tab_curr = 0;
3875static UWord stats__ctxt_tab_max  = 0;
3876
3877static UWord stats__ctxt_tab_qs   = 0;
3878static UWord stats__ctxt_tab_cmps = 0;
3879
3880
3881///////////////////////////////////////////////////////
3882//// Part (1): A hash table of RCECs
3883///
3884
3885#define N_FRAMES 8
3886
3887// (UInt) `echo "Reference Counted Execution Context" | md5sum`
3888#define RCEC_MAGIC 0xab88abb2UL
3889
3890//#define N_RCEC_TAB 98317 /* prime */
3891#define N_RCEC_TAB 196613 /* prime */
3892
3893typedef
3894   struct _RCEC {
3895      UWord magic;  /* sanity check only */
3896      struct _RCEC* next;
3897      UWord rc;
3898      UWord rcX; /* used for crosschecking */
3899      UWord frames_hash;          /* hash of all the frames */
3900      UWord frames[N_FRAMES];
3901   }
3902   RCEC;
3903
3904static RCEC** contextTab = NULL; /* hash table of RCEC*s */
3905
3906
3907/* Gives an arbitrary total order on RCEC .frames fields */
3908static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
3909   Word i;
3910   tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
3911   tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
3912   if (ec1->frames_hash < ec2->frames_hash) return -1;
3913   if (ec1->frames_hash > ec2->frames_hash) return  1;
3914   for (i = 0; i < N_FRAMES; i++) {
3915      if (ec1->frames[i] < ec2->frames[i]) return -1;
3916      if (ec1->frames[i] > ec2->frames[i]) return  1;
3917   }
3918   return 0;
3919}
3920
3921
3922/* Dec the ref of this RCEC. */
3923static void ctxt__rcdec ( RCEC* ec )
3924{
3925   stats__ctxt_rcdec_calls++;
3926   tl_assert(ec && ec->magic == RCEC_MAGIC);
3927   tl_assert(ec->rc > 0);
3928   ec->rc--;
3929}
3930
3931static void ctxt__rcinc ( RCEC* ec )
3932{
3933   tl_assert(ec && ec->magic == RCEC_MAGIC);
3934   ec->rc++;
3935}
3936
3937
3938//////////// BEGIN RCEC pool allocator
3939static PoolAlloc* rcec_pool_allocator;
3940
3941static RCEC* alloc_RCEC ( void ) {
3942   return VG_(allocEltPA) ( rcec_pool_allocator );
3943}
3944
3945static void free_RCEC ( RCEC* rcec ) {
3946   tl_assert(rcec->magic == RCEC_MAGIC);
3947   VG_(freeEltPA)( rcec_pool_allocator, rcec );
3948}
3949//////////// END RCEC pool allocator
3950
3951
3952/* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
3953   move it one step closer to the front of the list, so as to make
3954   subsequent searches for it cheaper. */
3955static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
3956{
3957   RCEC *ec0, *ec1, *ec2;
3958   if (ec == *headp)
3959      tl_assert(0); /* already at head of list */
3960   tl_assert(ec != NULL);
3961   ec0 = *headp;
3962   ec1 = NULL;
3963   ec2 = NULL;
3964   while (True) {
3965      if (ec0 == NULL || ec0 == ec) break;
3966      ec2 = ec1;
3967      ec1 = ec0;
3968      ec0 = ec0->next;
3969   }
3970   tl_assert(ec0 == ec);
3971   if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
3972      RCEC* tmp;
3973      /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
3974         predecessor.  Swap ec0 and ec1, that is, move ec0 one step
3975         closer to the start of the list. */
3976      tl_assert(ec2->next == ec1);
3977      tl_assert(ec1->next == ec0);
3978      tmp = ec0->next;
3979      ec2->next = ec0;
3980      ec0->next = ec1;
3981      ec1->next = tmp;
3982   }
3983   else
3984   if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
3985      /* it's second in the list. */
3986      tl_assert(*headp == ec1);
3987      tl_assert(ec1->next == ec0);
3988      ec1->next = ec0->next;
3989      ec0->next = ec1;
3990      *headp = ec0;
3991   }
3992}
3993
3994
3995/* Find the given RCEC in the tree, and return a pointer to it.  Or,
3996   if not present, add the given one to the tree (by making a copy of
3997   it, so the caller can immediately deallocate the original) and
3998   return a pointer to the copy.  The caller can safely have 'example'
3999   on its stack, since we will always return a pointer to a copy of
4000   it, not to the original.  Note that the inserted node will have .rc
4001   of zero and so the caller must immediately increment it. */
4002__attribute__((noinline))
4003static RCEC* ctxt__find_or_add ( RCEC* example )
4004{
4005   UWord hent;
4006   RCEC* copy;
4007   tl_assert(example && example->magic == RCEC_MAGIC);
4008   tl_assert(example->rc == 0);
4009
4010   /* Search the hash table to see if we already have it. */
4011   stats__ctxt_tab_qs++;
4012   hent = example->frames_hash % N_RCEC_TAB;
4013   copy = contextTab[hent];
4014   while (1) {
4015      if (!copy) break;
4016      tl_assert(copy->magic == RCEC_MAGIC);
4017      stats__ctxt_tab_cmps++;
4018      if (0 == RCEC__cmp_by_frames(copy, example)) break;
4019      copy = copy->next;
4020   }
4021
4022   if (copy) {
4023      tl_assert(copy != example);
4024      /* optimisation: if it's not at the head of its list, move 1
4025         step fwds, to make future searches cheaper */
4026      if (copy != contextTab[hent]) {
4027         move_RCEC_one_step_forward( &contextTab[hent], copy );
4028      }
4029   } else {
4030      copy = alloc_RCEC();
4031      tl_assert(copy != example);
4032      *copy = *example;
4033      copy->next = contextTab[hent];
4034      contextTab[hent] = copy;
4035      stats__ctxt_tab_curr++;
4036      if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
4037         stats__ctxt_tab_max = stats__ctxt_tab_curr;
4038   }
4039   return copy;
4040}
4041
4042static inline UWord ROLW ( UWord w, Int n )
4043{
4044   Int bpw = 8 * sizeof(UWord);
4045   w = (w << n) | (w >> (bpw-n));
4046   return w;
4047}
4048
4049__attribute__((noinline))
4050static RCEC* get_RCEC ( Thr* thr )
4051{
4052   UWord hash, i;
4053   RCEC  example;
4054   example.magic = RCEC_MAGIC;
4055   example.rc = 0;
4056   example.rcX = 0;
4057   example.next = NULL;
4058   main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
4059   hash = 0;
4060   for (i = 0; i < N_FRAMES; i++) {
4061      hash ^= example.frames[i];
4062      hash = ROLW(hash, 19);
4063   }
4064   example.frames_hash = hash;
4065   return ctxt__find_or_add( &example );
4066}
4067
4068///////////////////////////////////////////////////////
4069//// Part (2):
4070///  A SparseWA guest-addr -> OldRef, that refers to (1)
4071///
4072
4073// (UInt) `echo "Old Reference Information" | md5sum`
4074#define OldRef_MAGIC 0x30b1f075UL
4075
4076/* Records an access: a thread, a context (a stack trace), the size and
4077   writeness of the access, and the set of write-held locks.  The size
4078   (1,2,4,8) is encoded as 00 = 1, 01 = 2, 10 = 4, 11 = 8.
4079*/
4080typedef
4081   struct {
4082      RCEC*     rcec;
4083      WordSetID locksHeldW;
4084      UInt      thrid  : SCALARTS_N_THRBITS;
4085      UInt      szLg2B : 2;
4086      UInt      isW    : 1;
4087   }
4088   Thr_n_RCEC;
4089
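/* For instance (illustrative only): a 4-byte access is stored with
   szLg2B == 2 and recovered as szB = 1 << szLg2B; see event_map_bind
   and libhb_event_map_lookup below. */
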
4090#define N_OLDREF_ACCS 5
4091
4092typedef
4093   struct {
4094      UWord magic;  /* sanity check only */
4095      UWord gen;    /* when most recently accessed */
4096                    /* or free list when not in use */
4097      /* unused slots in this array have .thrid == 0, which is invalid */
4098      Thr_n_RCEC accs[N_OLDREF_ACCS];
4099   }
4100   OldRef;
4101
4102
4103//////////// BEGIN OldRef pool allocator
4104static PoolAlloc* oldref_pool_allocator;
4105
4106static OldRef* alloc_OldRef ( void ) {
4107   return VG_(allocEltPA) ( oldref_pool_allocator );
4108}
4109
4110static void free_OldRef ( OldRef* r ) {
4111   tl_assert(r->magic == OldRef_MAGIC);
4112   VG_(freeEltPA)( oldref_pool_allocator, r );
4113}
4114//////////// END OldRef pool allocator
4115
4116
4117static SparseWA* oldrefTree     = NULL; /* SparseWA* OldRef* */
4118static UWord     oldrefGen      = 0;    /* current LRU generation # */
4119static UWord     oldrefTreeN    = 0;    /* # elems in oldrefTree */
4120static UWord     oldrefGenIncAt = 0;    /* inc gen # when size hits this */
4121
4122inline static UInt min_UInt ( UInt a, UInt b ) {
4123   return a < b ? a : b;
4124}
4125
4126/* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
4127   first interval is lower, 1 if the first interval is higher, and 0
4128   if there is any overlap.  Redundant paranoia with casting is there
4129   following what looked distinctly like a bug in gcc-4.1.2, in which
4130   some of the comparisons were done signedly instead of
4131   unsignedly. */
4132/* Copied from exp-ptrcheck/sg_main.c */
4133static Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
4134                                     Addr a2, SizeT n2 ) {
4135   UWord a1w = (UWord)a1;
4136   UWord n1w = (UWord)n1;
4137   UWord a2w = (UWord)a2;
4138   UWord n2w = (UWord)n2;
4139   tl_assert(n1w > 0 && n2w > 0);
4140   if (a1w + n1w <= a2w) return -1L;
4141   if (a2w + n2w <= a1w) return 1L;
4142   return 0;
4143}
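
/* Quick illustration of the convention (not compiled in): */
#if 0
static void example_cmp_nonempty_intervals ( void )
{
   tl_assert( cmp_nonempty_intervals(0x1000, 4, 0x1004, 4) == -1 ); /* disjoint, first lower  */
   tl_assert( cmp_nonempty_intervals(0x1008, 2, 0x1000, 8) ==  1 ); /* disjoint, first higher */
   tl_assert( cmp_nonempty_intervals(0x1000, 4, 0x1002, 8) ==  0 ); /* overlapping            */
}
#endif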
4144
4145static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
4146{
4147   OldRef* ref;
4148   RCEC*   rcec;
4149   Word    i, j;
4150   UWord   keyW, valW;
4151   Bool    b;
4152
4153   tl_assert(thr);
4154   ThrID thrid = thr->thrid;
4155   tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
4156
4157   WordSetID locksHeldW = thr->hgthread->locksetW;
4158
4159   rcec = get_RCEC( thr );
4160   ctxt__rcinc(rcec);
4161
4162   UInt szLg2B = 0;
4163   switch (szB) {
4164      /* This doesn't look particularly branch-predictor friendly. */
4165      case 1:  szLg2B = 0; break;
4166      case 2:  szLg2B = 1; break;
4167      case 4:  szLg2B = 2; break;
4168      case 8:  szLg2B = 3; break;
4169      default: tl_assert(0);
4170   }
4171
4172   /* Look in the map to see if we already have a record for this
4173      address. */
4174   b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );
4175
4176   if (b) {
4177
4178      /* We already have a record for this address.  We now need to
4179         see if we have a stack trace pertaining to this (thrid, R/W,
4180         size) triple. */
4181      tl_assert(keyW == a);
4182      ref = (OldRef*)valW;
4183      tl_assert(ref->magic == OldRef_MAGIC);
4184
4185      for (i = 0; i < N_OLDREF_ACCS; i++) {
4186         if (ref->accs[i].thrid != thrid)
4187            continue;
4188         if (ref->accs[i].szLg2B != szLg2B)
4189            continue;
4190         if (ref->accs[i].isW != (UInt)(isW & 1))
4191            continue;
4192         /* else we have a match, so stop looking. */
4193         break;
4194      }
4195
4196      if (i < N_OLDREF_ACCS) {
4197         /* thread 'thr' has an entry at index 'i'.  Update its RCEC. */
4198         if (i > 0) {
4199            Thr_n_RCEC tmp = ref->accs[i-1];
4200            ref->accs[i-1] = ref->accs[i];
4201            ref->accs[i] = tmp;
4202            i--;
4203         }
4204         if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
4205         stats__ctxt_rcdec1++;
4206         ctxt__rcdec( ref->accs[i].rcec );
4207         tl_assert(ref->accs[i].thrid == thrid);
4208         /* Update the RCEC and the W-held lockset. */
4209         ref->accs[i].rcec       = rcec;
4210         ref->accs[i].locksHeldW = locksHeldW;
4211      } else {
4212         /* No entry for this (thread, R/W, size, nWHeld) quad.
4213            Shuffle all of them down one slot, and put the new entry
4214            at the start of the array. */
4215         if (ref->accs[N_OLDREF_ACCS-1].thrid != 0) {
4216            /* the last slot is in use.  We must dec the rc on the
4217               associated rcec. */
4218            tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
4219            stats__ctxt_rcdec2++;
4220            if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
4221               VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
4222            ctxt__rcdec( ref->accs[N_OLDREF_ACCS-1].rcec );
4223         } else {
4224            tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
4225         }
4226         for (j = N_OLDREF_ACCS-1; j >= 1; j--)
4227            ref->accs[j] = ref->accs[j-1];
4228         ref->accs[0].thrid      = thrid;
4229         ref->accs[0].szLg2B     = szLg2B;
4230         ref->accs[0].isW        = (UInt)(isW & 1);
4231         ref->accs[0].locksHeldW = locksHeldW;
4232         ref->accs[0].rcec       = rcec;
4233         /* thrid==0 is used to signify an empty slot, so we can't
4234            add zero thrid (such a ThrID is invalid anyway). */
4235         /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
4236      }
4237
4238      ref->gen = oldrefGen;
4239
4240   } else {
4241
4242      /* We don't have a record for this address.  Create a new one. */
4243      if (oldrefTreeN >= oldrefGenIncAt) {
4244         oldrefGen++;
4245         oldrefGenIncAt = oldrefTreeN + 50000;
4246         if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
4247                            oldrefGen, oldrefTreeN );
4248      }
4249
4250      ref = alloc_OldRef();
4251      ref->magic = OldRef_MAGIC;
4252      ref->gen   = oldrefGen;
4253      ref->accs[0].thrid      = thrid;
4254      ref->accs[0].szLg2B     = szLg2B;
4255      ref->accs[0].isW        = (UInt)(isW & 1);
4256      ref->accs[0].locksHeldW = locksHeldW;
4257      ref->accs[0].rcec       = rcec;
4258
4259      /* thrid==0 is used to signify an empty slot, so we can't
4260         add zero thrid (such a ThrID is invalid anyway). */
4261      /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
4262
4263      /* Clear out the rest of the entries */
4264      for (j = 1; j < N_OLDREF_ACCS; j++) {
4265         ref->accs[j].rcec       = NULL;
4266         ref->accs[j].thrid      = 0;
4267         ref->accs[j].szLg2B     = 0;
4268         ref->accs[j].isW        = 0;
4269         ref->accs[j].locksHeldW = 0;
4270      }
4271      VG_(addToSWA)( oldrefTree, a, (UWord)ref );
4272      oldrefTreeN++;
4273
4274   }
4275}
4276
4277
4278/* Extract info from the conflicting-access machinery. */
4279Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
4280                              /*OUT*/Thr**        resThr,
4281                              /*OUT*/SizeT*       resSzB,
4282                              /*OUT*/Bool*        resIsW,
4283                              /*OUT*/WordSetID*   locksHeldW,
4284                              Thr* thr, Addr a, SizeT szB, Bool isW )
4285{
4286   Word    i, j;
4287   OldRef* ref;
4288   UWord   keyW, valW;
4289   Bool    b;
4290
4291   ThrID     cand_thrid;
4292   RCEC*     cand_rcec;
4293   Bool      cand_isW;
4294   SizeT     cand_szB;
4295   WordSetID cand_locksHeldW;
4296   Addr      cand_a;
4297
4298   Addr toCheck[15];
4299   Int  nToCheck = 0;
4300
4301   tl_assert(thr);
4302   tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
4303
4304   ThrID thrid = thr->thrid;
4305
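   /* Build the candidate set of addresses at which an overlapping old
      access could have been recorded.  Recorded accesses are at most 8
      bytes, so any that overlaps [a, a+szB) must start somewhere in
      [a-7, a+szB-1]; 'a' itself is entered first, and the loop below
      adds the rest of that range. */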
4306   toCheck[nToCheck++] = a;
4307   for (i = -7; i < (Word)szB; i++) {
4308      if (i != 0)
4309         toCheck[nToCheck++] = a + i;
4310   }
4311   tl_assert(nToCheck <= 15);
4312
4313   /* Now see if we can find a suitable matching event for
4314      any of the addresses in toCheck[0 .. nToCheck-1]. */
4315   for (j = 0; j < nToCheck; j++) {
4316
4317      cand_a = toCheck[j];
4318      //      VG_(printf)("test %ld %p\n", j, cand_a);
4319
4320      b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a );
4321      if (!b)
4322         continue;
4323
4324      ref = (OldRef*)valW;
4325      tl_assert(keyW == cand_a);
4326      tl_assert(ref->magic == OldRef_MAGIC);
4327      tl_assert(ref->accs[0].thrid != 0); /* first slot must always be used */
4328
4329      cand_thrid      = 0; /* invalid; see comments in event_map_bind */
4330      cand_rcec       = NULL;
4331      cand_isW        = False;
4332      cand_szB        = 0;
4333      cand_locksHeldW = 0; /* always valid; see initialise_data_structures() */
4334
4335      for (i = 0; i < N_OLDREF_ACCS; i++) {
4336         Thr_n_RCEC* cand = &ref->accs[i];
4337         cand_rcec       = cand->rcec;
4338         cand_thrid      = cand->thrid;
4339         cand_isW        = (Bool)cand->isW;
4340         cand_szB        = 1 << cand->szLg2B;
4341         cand_locksHeldW = cand->locksHeldW;
4342
4343         if (cand_thrid == 0)
4344            /* This slot isn't in use.  Ignore it. */
4345            continue;
4346
4347         if (cand_thrid == thrid)
4348            /* This is an access by the same thread, but we're only
4349               interested in accesses from other threads.  Ignore. */
4350            continue;
4351
4352         if ((!cand_isW) && (!isW))
4353            /* We don't want to report a read racing against another
4354               read; that's stupid.  So in this case move on. */
4355            continue;
4356
4357         if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0)
4358            /* No overlap with the access we're asking about.  Ignore. */
4359            continue;
4360
4361         /* We have a match.  Stop searching. */
4362         break;
4363      }
4364
4365      tl_assert(i >= 0 && i <= N_OLDREF_ACCS);
4366
4367      if (i < N_OLDREF_ACCS) {
4368         Int n, maxNFrames;
4369         /* return with success */
4370         tl_assert(cand_thrid);
4371         tl_assert(cand_rcec);
4372         tl_assert(cand_rcec->magic == RCEC_MAGIC);
4373         tl_assert(cand_szB >= 1);
4374         /* Count how many non-zero frames we have. */
4375         maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
4376         for (n = 0; n < maxNFrames; n++) {
4377            if (0 == cand_rcec->frames[n]) break;
4378         }
4379         *resEC      = VG_(make_ExeContext_from_StackTrace)
4380                          (cand_rcec->frames, n);
4381         *resThr     = Thr__from_ThrID(cand_thrid);
4382         *resSzB     = cand_szB;
4383         *resIsW     = cand_isW;
4384         *locksHeldW = cand_locksHeldW;
4385         return True;
4386      }
4387
4388      /* consider next address in toCheck[] */
4389   } /* for (j = 0; j < nToCheck; j++) */
4390
4391   /* really didn't find anything. */
4392   return False;
4393}
4394
4395static void event_map_init ( void )
4396{
4397   Word i;
4398
4399   /* Context (RCEC) pool allocator */
4400   rcec_pool_allocator = VG_(newPA) (
4401                             sizeof(RCEC),
4402                             1000 /* RCECs per pool */,
4403                             HG_(zalloc),
4404                             "libhb.event_map_init.1 (RCEC pools)",
4405                             HG_(free)
4406                          );
4407
4408   /* Context table */
4409   tl_assert(!contextTab);
4410   contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
4411                             N_RCEC_TAB * sizeof(RCEC*) );
4412   tl_assert(contextTab);
4413   for (i = 0; i < N_RCEC_TAB; i++)
4414      contextTab[i] = NULL;
4415
4416   /* Oldref pool allocator */
4417   oldref_pool_allocator = VG_(newPA)(
4418                               sizeof(OldRef),
4419                               1000 /* OldRefs per pool */,
4420                               HG_(zalloc),
4421                               "libhb.event_map_init.3 (OldRef pools)",
4422                               HG_(free)
4423                            );
4424
4425   /* Oldref tree */
4426   tl_assert(!oldrefTree);
4427   oldrefTree = VG_(newSWA)(
4428                   HG_(zalloc),
4429                   "libhb.event_map_init.4 (oldref tree)",
4430                   HG_(free)
4431                );
4432   tl_assert(oldrefTree);
4433
4434   oldrefGen = 0;
4435   oldrefGenIncAt = 0;
4436   oldrefTreeN = 0;
4437}
4438
4439static void event_map__check_reference_counts ( Bool before )
4440{
4441   RCEC*   rcec;
4442   OldRef* oldref;
4443   Word    i;
4444   UWord   nEnts = 0;
4445   UWord   keyW, valW;
4446
4447   /* Set the 'check' reference counts to zero.  Also, optionally
4448      check that the real reference counts are non-zero.  We allow
4449      these to fall to zero before a GC, but the GC must get rid of
4450      all those that are zero, hence none should be zero after a
4451      GC. */
4452   for (i = 0; i < N_RCEC_TAB; i++) {
4453      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4454         nEnts++;
4455         tl_assert(rcec);
4456         tl_assert(rcec->magic == RCEC_MAGIC);
4457         if (!before)
4458            tl_assert(rcec->rc > 0);
4459         rcec->rcX = 0;
4460      }
4461   }
4462
4463   /* check that the stats are sane */
4464   tl_assert(nEnts == stats__ctxt_tab_curr);
4465   tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
4466
4467   /* visit all the referencing points, inc check ref counts */
4468   VG_(initIterSWA)( oldrefTree );
4469   while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4470      oldref = (OldRef*)valW;
4471      tl_assert(oldref->magic == OldRef_MAGIC);
4472      for (i = 0; i < N_OLDREF_ACCS; i++) {
4473         ThrID aThrID = oldref->accs[i].thrid;
4474         RCEC* aRef   = oldref->accs[i].rcec;
4475         if (aThrID != 0) {
4476            tl_assert(aRef);
4477            tl_assert(aRef->magic == RCEC_MAGIC);
4478            aRef->rcX++;
4479         } else {
4480            tl_assert(!aRef);
4481         }
4482      }
4483   }
4484
4485   /* compare check ref counts with actual */
4486   for (i = 0; i < N_RCEC_TAB; i++) {
4487      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4488         tl_assert(rcec->rc == rcec->rcX);
4489      }
4490   }
4491}
4492
4493__attribute__((noinline))
4494static void event_map_maybe_GC ( void )
4495{
4496   OldRef* oldref;
4497   UWord   keyW, valW, retained, maxGen;
4498   XArray* refs2del;
4499   Word    i, j, n2del;
4500
4501   UWord* genMap      = NULL;
4502   UWord  genMap_min  = 0;
4503   UWord  genMap_size = 0;
4504
4505   if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size)))
4506      return;
4507
4508   if (0)
4509      VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);
4510
4511   /* Check for sane command line params.  Limit values must match
4512      those in hg_process_cmd_line_option. */
4513   tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 );
4514   tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 );
4515
4516   /* Check our counting is sane (expensive) */
4517   if (CHECK_CEM)
4518      tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree ));
4519
4520   /* Check the reference counts (expensive) */
4521   if (CHECK_CEM)
4522      event_map__check_reference_counts( True/*before*/ );
4523
4524   /* Compute the distribution of generation values in the ref tree.
4525      There are likely only to be a few different generation numbers
4526      in the whole tree, but we don't know what they are.  Hence use a
4527      dynamically resized array of counters.  The array is genMap[0
4528      .. genMap_size-1], where genMap[0] is the count for the
4529      generation number genMap_min, genMap[1] is the count for
4530      genMap_min+1, etc.  If a new number is seen outside the range
4531      [genMap_min .. genMap_min + genMap_size - 1] then the array is
4532      copied into a larger array, and genMap_min and genMap_size are
4533      adjusted accordingly. */
4534
4535   /* genMap :: generation-number -> count-of-nodes-with-that-number */
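   /* For example (illustrative only): if the generations encountered are
      7, 5 and 9 in that order, the array starts with genMap_min == 7 and
      genMap_size == 1; seeing 5 grows it downwards to genMap_min == 5,
      genMap_size == 3; seeing 9 grows it upwards to genMap_size == 5, so
      it then covers generations 5..9 with counts {1,0,1,0,1}. */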
4536
4537   VG_(initIterSWA)( oldrefTree );
4538   while ( VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4539
4540       UWord ea, key;
4541       oldref = (OldRef*)valW;
4542       key = oldref->gen;
4543
4544      /* BEGIN find 'ea', which is the index in genMap holding the
4545         count for generation number 'key'. */
4546      if (UNLIKELY(genMap == NULL)) {
4547         /* deal with the first key to be seen, so that the following
4548            cases don't need to handle the complexity of a NULL count
4549            array. */
4550         genMap_min  = key;
4551         genMap_size = 1;
4552         genMap = HG_(zalloc)( "libhb.emmG.1a",
4553                                genMap_size * sizeof(UWord) );
4554         ea = 0;
4555         if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n",
4556                            key, genMap_min, genMap_min+genMap_size- 1 );
4557      }
4558      else
4559      if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) {
4560         /* this is the expected (almost-always-happens) case: 'key'
4561            is already mapped in the array. */
4562         ea = key - genMap_min;
4563      }
4564      else
4565      if (key < genMap_min) {
4566         /* 'key' appears before the start of the current array.
4567            Extend the current array by allocating a larger one and
4568            copying the current one to the upper end of it. */
4569         Word   more;
4570         UWord* map2;
4571         more = genMap_min - key;
4572         tl_assert(more > 0);
4573         map2 = HG_(zalloc)( "libhb.emmG.1b",
4574                             (genMap_size + more) * sizeof(UWord) );
4575         VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) );
4576         HG_(free)( genMap );
4577         genMap = map2;
4578         genMap_size += more;
4579         genMap_min -= more;
4580         ea = 0;
4581         tl_assert(genMap_min == key);
4582         if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n",
4583                            key, genMap_min,  genMap_min+genMap_size- 1 );
4584      }
4585      else {
4586         /* 'key' appears after the end of the current array.  Extend
4587            the current array by allocating a larger one and copying
4588            the current one to the lower end of it. */
4589         Word   more;
4590         UWord* map2;
4591         tl_assert(key >= genMap_min + genMap_size);
4592         more = key - (genMap_min + genMap_size) + 1;
4593         tl_assert(more > 0);
4594         map2 = HG_(zalloc)( "libhb.emmG.1c",
4595                             (genMap_size + more) * sizeof(UWord) );
4596         VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) );
4597         HG_(free)( genMap );
4598         genMap = map2;
4599         genMap_size += more;
4600         ea = genMap_size - 1;
4601         tl_assert(genMap_min + genMap_size - 1 == key);
4602         if (0) VG_(printf)("(%lu) case 3 [%lu .. %lu]\n",
4603                            key, genMap_min, genMap_min+genMap_size- 1 );
4604      }
4605      /* END find 'ea' from 'key' */
4606
4607      tl_assert(ea >= 0 && ea < genMap_size);
4608      /* and the whole point of this elaborate computation of 'ea' is .. */
4609      genMap[ea]++;
4610   }
4611
4612   tl_assert(genMap);
4613   tl_assert(genMap_size > 0);
4614
4615   /* Sanity check what we just computed */
4616   { UWord sum = 0;
4617     for (i = 0; i < genMap_size; i++) {
4618        if (0) VG_(printf)("  xxx: gen %ld has %lu\n",
4619                           i + genMap_min, genMap[i] );
4620        sum += genMap[i];
4621     }
4622     tl_assert(sum == oldrefTreeN);
4623   }
4624
4625   /* Figure out how many generations to throw away */
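   /* Illustrative example (numbers invented for this comment): suppose
      oldrefTreeN is 1000, the counts are gen 1 -> 400, gen 2 -> 300,
      gen 3 -> 300, and the threshold
      HG_(clo_conflict_cache_size) * EVENT_MAP_GC_DISCARD_FRACTION
      happens to be 500.  Then generation 1 can be discarded, since
      1000 - 400 = 600 > 500 entries would still remain; but discarding
      generation 2 as well would leave only 300, so the loop below stops
      with maxGen == 1 and retained == 600. */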
4626   retained = oldrefTreeN;
4627   maxGen = 0;
4628
4629   for (i = 0; i < genMap_size; i++) {
4630      keyW = i + genMap_min;
4631      valW = genMap[i];
4632      tl_assert(keyW > 0); /* can't allow a generation # 0 */
4633      if (0) VG_(printf)("  XXX: gen %lu has %lu\n", keyW, valW );
4634      tl_assert(keyW >= maxGen);
4635      tl_assert(retained >= valW);
4636      if (retained - valW
4637          > (UWord)(HG_(clo_conflict_cache_size)
4638                    * EVENT_MAP_GC_DISCARD_FRACTION)) {
4639         retained -= valW;
4640         maxGen = keyW;
4641      } else {
4642         break;
4643      }
4644   }
4645
4646   HG_(free)(genMap);
4647
4648   tl_assert(retained >= 0 && retained <= oldrefTreeN);
4649
4650   /* Now make up a big list of the oldrefTree entries we want to
4651      delete.  We can't simultaneously traverse the tree and delete
4652      stuff from it, so first we need to copy them off somewhere
4653      else. (sigh) */
4654   refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2",
4655                          HG_(free), sizeof(Addr) );
4656
4657   if (retained < oldrefTreeN) {
4658
4659      /* This is the normal (expected) case.  We discard any ref whose
4660         generation number <= maxGen. */
4661      VG_(initIterSWA)( oldrefTree );
4662      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4663         oldref = (OldRef*)valW;
4664         tl_assert(oldref->magic == OldRef_MAGIC);
4665         if (oldref->gen <= maxGen) {
4666            VG_(addToXA)( refs2del, &keyW );
4667         }
4668      }
4669      if (VG_(clo_stats)) {
4670         VG_(message)(Vg_DebugMsg,
4671            "libhb: EvM GC: delete generations %lu and below, "
4672            "retaining %lu entries\n",
4673            maxGen, retained );
4674      }
4675
4676   } else {
4677
4678      static UInt rand_seed = 0; /* leave as static */
4679
4680      /* Degenerate case: there's only one generation in the entire
4681         tree, so we need to have some other way of deciding which
4682         refs to throw away.  Just throw out half of them randomly. */
4683      tl_assert(retained == oldrefTreeN);
4684      VG_(initIterSWA)( oldrefTree );
4685      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4686         UInt n;
4687         oldref = (OldRef*)valW;
4688         tl_assert(oldref->magic == OldRef_MAGIC);
4689         n = VG_(random)( &rand_seed );
4690         if ((n & 0xFFF) < 0x800) {
4691            VG_(addToXA)( refs2del, &keyW );
4692            retained--;
4693         }
4694      }
4695      if (VG_(clo_stats)) {
4696         VG_(message)(Vg_DebugMsg,
4697            "libhb: EvM GC: randomly delete half the entries, "
4698            "retaining %lu entries\n",
4699            retained );
4700      }
4701
4702   }
4703
4704   n2del = VG_(sizeXA)( refs2del );
4705   tl_assert(n2del == (Word)(oldrefTreeN - retained));
4706
4707   if (0) VG_(printf)("%s","deleting entries\n");
4708   for (i = 0; i < n2del; i++) {
4709      Bool  b;
4710      Addr  ga2del = *(Addr*)VG_(indexXA)( refs2del, i );
4711      b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del );
4712      tl_assert(b);
4713      tl_assert(keyW == ga2del);
4714      oldref = (OldRef*)valW;
4715      for (j = 0; j < N_OLDREF_ACCS; j++) {
4716         ThrID aThrID = oldref->accs[j].thrid;
4717         RCEC* aRef   = oldref->accs[j].rcec;
4718         if (aRef) {
4719            tl_assert(aThrID != 0);
4720            stats__ctxt_rcdec3++;
4721            ctxt__rcdec( aRef );
4722         } else {
4723            tl_assert(aThrID == 0);
4724         }
4725      }
4726
4727      free_OldRef( oldref );
4728   }
4729
4730   VG_(deleteXA)( refs2del );
4731
4732   tl_assert( VG_(sizeSWA)( oldrefTree ) == retained );
4733
4734   oldrefTreeN = retained;
4735   oldrefGenIncAt = oldrefTreeN; /* start new gen right away */
4736
4737   /* Throw away all RCECs with zero reference counts */
4738   for (i = 0; i < N_RCEC_TAB; i++) {
4739      RCEC** pp = &contextTab[i];
4740      RCEC*  p  = *pp;
4741      while (p) {
4742         if (p->rc == 0) {
4743            *pp = p->next;
4744            free_RCEC(p);
4745            p = *pp;
4746            tl_assert(stats__ctxt_tab_curr > 0);
4747            stats__ctxt_tab_curr--;
4748         } else {
4749            pp = &p->next;
4750            p = p->next;
4751         }
4752      }
4753   }
4754
4755   /* Check the reference counts (expensive) */
4756   if (CHECK_CEM)
4757      event_map__check_reference_counts( False/*after*/ );
4758
4759   //if (0)
4760   //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
4761   //            VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));
4762
4763}
4764
4765
4766/////////////////////////////////////////////////////////
4767//                                                     //
4768// Core MSM                                            //
4769//                                                     //
4770/////////////////////////////////////////////////////////
4771
4772/* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
4773   Nov 08, and again after [...],
4774   June 09. */
4775
4776static ULong stats__msmcread         = 0;
4777static ULong stats__msmcread_change  = 0;
4778static ULong stats__msmcwrite        = 0;
4779static ULong stats__msmcwrite_change = 0;
4780
4781/* Some notes on the H1 history mechanism:
4782
4783   Transition rules are:
4784
4785   read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
4786   write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
4787
4788   After any access by a thread T to a location L, L's constraint pair
4789   (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
4790
4791   After a race by thread T conflicting with some previous access by
4792   some other thread U, for a location whose constraint (before
4793   processing the later access) was (Cr,Cw), Cw[U] is the segment in
4794   which the previous access lies.
4795
4796   Hence in record_race_info, we pass in Cfailed and Kfailed, which
4797   are compared so as to find out which thread(s) this access
4798   conflicts with.  Once that is established, we also require the
4799   pre-update Cw for the location, so we can index into it for those
4800   threads, to get the scalar clock values for the point at which the
4801   former accesses were made.  (In fact we only bother to do any of
4802   this for one arbitrarily chosen conflicting thread: that is
4803   simpler, it avoids flooding the user with vast amounts of mostly
4804   useless information, and the program is wrong if it contains any
4805   races at all -- so we don't really need to show all conflicting
4806   access pairs initially, so long as we show none only when none
4807   exist.)
4808
4809   ---
4810
4811   That requires the auxiliary proof that
4812
4813      (Cr `join` Kw)[T] == Kw[T]
4814
4815   Why should that be true?  Because for any thread T, Kw[T] >= the
4816   scalar clock value for T known by any other thread.  In other
4817   words, because T's value for its own scalar clock is at least as up
4818   to date as the value for it known by any other thread (that is true
4819   for both the R- and W- scalar clocks).  Hence no other thread will
4820   be able to feed in a value for that element (indirectly via a
4821   constraint) which will exceed Kw[T], and hence the join cannot
4822   cause that particular element to advance.
4823*/
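
/* A small concrete illustration of the auxiliary claim above (clock
   values invented for this comment): suppose thread T's W-clock is
   Kw = { T:5, U:2 } and the location's read-constraint is
   Cr = { T:3, U:4 }.  Since no other thread can know a value for T's
   own clock that exceeds what T itself knows, Cr[T] <= Kw[T] must
   hold (3 <= 5 here).  Hence (Cr `join` Kw) = { T:5, U:4 }, whose
   T-component is exactly Kw[T], as claimed. */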
4824
4825__attribute__((noinline))
4826static void record_race_info ( Thr* acc_thr,
4827                               Addr acc_addr, SizeT szB, Bool isWrite,
4828                               VtsID Cfailed,
4829                               VtsID Kfailed,
4830                               VtsID Cw )
4831{
4832   /* Call here to report a race.  We just hand it onwards to
4833      HG_(record_error_Race).  If that in turn discovers that the
4834      error is going to be collected, then, at history_level 2, that
4835      queries the conflicting-event map.  The alternative would be to
4836      query it right here.  But that causes a lot of pointless queries
4837      for errors which will shortly be discarded as duplicates, and
4838      can become a performance overhead; so we defer the query until
4839      we know the error is not a duplicate. */
4840
4841   /* Stacks for the bounds of the (or one of the) conflicting
4842      segment(s).  These are only set at history_level 1. */
4843   ExeContext* hist1_seg_start = NULL;
4844   ExeContext* hist1_seg_end   = NULL;
4845   Thread*     hist1_conf_thr  = NULL;
4846
4847   tl_assert(acc_thr);
4848   tl_assert(acc_thr->hgthread);
4849   tl_assert(acc_thr->hgthread->hbthr == acc_thr);
4850   tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
4851
4852   if (HG_(clo_history_level) == 1) {
4853      Bool found;
4854      Word firstIx, lastIx;
4855      ULong_n_EC key;
4856
4857      /* At history_level 1, we must round up the relevant stack-pair
4858         for the conflicting segment right now.  This is because
4859         deferring it is complex; we can't (easily) put Kfailed and
4860         Cfailed into the XError and wait for later without
4861         getting tied up in difficulties with VtsID reference
4862         counting.  So just do it now. */
4863      Thr*  confThr;
4864      ULong confTym = 0;
4865      /* Which thread are we in conflict with?  There may be more than
4866         one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
4867         (in fact it's the one with the lowest Thr* value). */
4868      confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
4869      /* This must exist, since if it were NULL then there would be no
4870         conflict (by the semantics of the return value of
4871         VtsID__findFirst_notLEQ), and msmc{read,write}, which has
4872         called us, just checked exactly this -- that there was in
4873         fact a race. */
4874      tl_assert(confThr);
4875
4876      /* Get the scalar clock value that the conflicting thread
4877         introduced into the constraint.  A careful examination of the
4878         base machine rules shows that this must be the same as the
4879         conflicting thread's scalar clock when it created this
4880         constraint.  Hence we know the scalar clock of the
4881         conflicting thread when the conflicting access was made. */
4882      confTym = VtsID__indexAt( Cfailed, confThr );
4883
4884      /* Using this scalar clock, index into the conflicting thread's
4885         collection of stack traces made each time its vector clock
4886         (hence its scalar clock) changed.  This gives the stack
4887         traces at the start and end of the conflicting segment (well,
4888         as per comment just above, of one of the conflicting
4889         segments, if there are more than one). */
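      /* For instance (entries invented for this comment): if the array
         holds the (scalar Kw, stack) pairs (8,ec0), (12,ec1), (15,ec2)
         and confTym is 12, the lookup below yields index 1, so the
         conflicting segment is taken to start at ec1 and, via the
         lastIx+1 entry, to end at ec2. */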
4890      key.ull = confTym;
4891      key.ec  = NULL;
4892      /* tl_assert(confThr); -- asserted just above */
4893      tl_assert(confThr->local_Kws_n_stacks);
4894      firstIx = lastIx = 0;
4895      found = VG_(lookupXA_UNSAFE)(
4896                 confThr->local_Kws_n_stacks,
4897                 &key, &firstIx, &lastIx,
4898                 (XACmpFn_t)cmp__ULong_n_EC__by_ULong
4899              );
4900      if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
4901                         "confTym %llu found %d (%lu,%lu)\n",
4902                         Cfailed, Kfailed, Cw,
4903                         confThr, confTym, found, firstIx, lastIx);
4904      /* We can't indefinitely collect stack traces at VTS
4905         transitions, since we'd eventually run out of memory.  Hence
4906         note_local_Kw_n_stack_for will eventually throw away old
4907         ones, which in turn means we might fail to find index value
4908         confTym in the array. */
4909      if (found) {
4910         ULong_n_EC *pair_start, *pair_end;
4911         pair_start
4912            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
4913         hist1_seg_start = pair_start->ec;
4914         if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
4915            pair_end
4916               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
4917                                            lastIx+1 );
4918            /* from properties of VG_(lookupXA) and the comparison fn used: */
4919            tl_assert(pair_start->ull < pair_end->ull);
4920            hist1_seg_end = pair_end->ec;
4921            /* Could do a bit better here.  It may be that pair_end
4922               doesn't have a stack, but the following entries in the
4923               array have the same scalar Kw and do have a stack.  So
4924               we should search a bit further along the array than
4925               lastIx+1 if hist1_seg_end is NULL. */
4926         } else {
4927            if (!confThr->llexit_done)
4928               hist1_seg_end = main_get_EC( confThr );
4929         }
4930         // seg_start could be NULL iff this is the first stack in the thread
4931         //if (seg_start) VG_(pp_ExeContext)(seg_start);
4932         //if (seg_end)   VG_(pp_ExeContext)(seg_end);
4933         hist1_conf_thr = confThr->hgthread;
4934      }
4935   }
4936
4937   HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
4938                           szB, isWrite,
4939                           hist1_conf_thr, hist1_seg_start, hist1_seg_end );
4940}
4941
4942static Bool is_sane_SVal_C ( SVal sv ) {
4943   Bool leq;
4944   if (!SVal__isC(sv)) return True;
4945   leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
4946   return leq;
4947}
4948
4949
4950/* Compute new state following a read */
4951static inline SVal msmcread ( SVal svOld,
4952                              /* The following are only needed for
4953                                 creating error reports. */
4954                              Thr* acc_thr,
4955                              Addr acc_addr, SizeT szB )
4956{
4957   SVal svNew = SVal_INVALID;
4958   stats__msmcread++;
4959
4960   /* Redundant sanity check on the constraints */
4961   if (CHECK_MSM) {
4962      tl_assert(is_sane_SVal_C(svOld));
4963   }
4964
4965   if (LIKELY(SVal__isC(svOld))) {
4966      VtsID tviR  = acc_thr->viR;
4967      VtsID tviW  = acc_thr->viW;
4968      VtsID rmini = SVal__unC_Rmin(svOld);
4969      VtsID wmini = SVal__unC_Wmin(svOld);
4970      Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
4971      if (LIKELY(leq)) {
4972         /* no race */
4973         /* Note: RWLOCK subtlety: use tviW, not tviR */
4974         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
4975         goto out;
4976      } else {
4977         /* assert on sanity of constraints. */
4978         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
4979         tl_assert(leqxx);
4980         // same as in non-race case
4981         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
4982         record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
4983                           rmini, /* Cfailed */
4984                           tviR,  /* Kfailed */
4985                           wmini  /* Cw */ );
4986         goto out;
4987      }
4988   }
4989   if (SVal__isA(svOld)) {
4990      /* reading no-access memory (sigh); leave unchanged */
4991      /* check for no pollution */
4992      tl_assert(svOld == SVal_NOACCESS);
4993      svNew = SVal_NOACCESS;
4994      goto out;
4995   }
4996   if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
4997   tl_assert(0);
4998
4999  out:
5000   if (CHECK_MSM) {
5001      tl_assert(is_sane_SVal_C(svNew));
5002   }
5003   if (UNLIKELY(svNew != svOld)) {
5004      tl_assert(svNew != SVal_INVALID);
5005      if (HG_(clo_history_level) >= 2
5006          && SVal__isC(svOld) && SVal__isC(svNew)) {
5007         event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
5008         stats__msmcread_change++;
5009      }
5010   }
5011   return svNew;
5012}
5013
5014
5015/* Compute new state following a write */
5016static inline SVal msmcwrite ( SVal svOld,
5017                              /* The following are only needed for
5018                                 creating error reports. */
5019                              Thr* acc_thr,
5020                              Addr acc_addr, SizeT szB )
5021{
5022   SVal svNew = SVal_INVALID;
5023   stats__msmcwrite++;
5024
5025   /* Redundant sanity check on the constraints */
5026   if (CHECK_MSM) {
5027      tl_assert(is_sane_SVal_C(svOld));
5028   }
5029
5030   if (LIKELY(SVal__isC(svOld))) {
5031      VtsID tviW  = acc_thr->viW;
5032      VtsID wmini = SVal__unC_Wmin(svOld);
5033      Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
5034      if (LIKELY(leq)) {
5035         /* no race */
5036         svNew = SVal__mkC( tviW, tviW );
5037         goto out;
5038      } else {
5039         VtsID rmini = SVal__unC_Rmin(svOld);
5040         /* assert on sanity of constraints. */
5041         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5042         tl_assert(leqxx);
5043         // same as in non-race case
5044         // proof: in the non-race case, we have
5045         //    rmini <= wmini (invar on constraints)
5046         //    tviW <= tviR (invar on thread clocks)
5047         //    wmini <= tviW (from run-time check)
5048         // hence from transitivity of <= we have
5049         //    rmini <= wmini <= tviW
5050         // and so join(rmini,tviW) == tviW
5051         // and    join(wmini,tviW) == tviW
5052         // qed.
5053         svNew = SVal__mkC( VtsID__join2(rmini, tviW),
5054                            VtsID__join2(wmini, tviW) );
5055         record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
5056                           wmini, /* Cfailed */
5057                           tviW,  /* Kfailed */
5058                           wmini  /* Cw */ );
5059         goto out;
5060      }
5061   }
5062   if (SVal__isA(svOld)) {
5063      /* writing no-access memory (sigh); leave unchanged */
5064      /* check for no pollution */
5065      tl_assert(svOld == SVal_NOACCESS);
5066      svNew = SVal_NOACCESS;
5067      goto out;
5068   }
5069   if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
5070   tl_assert(0);
5071
5072  out:
5073   if (CHECK_MSM) {
5074      tl_assert(is_sane_SVal_C(svNew));
5075   }
5076   if (UNLIKELY(svNew != svOld)) {
5077      tl_assert(svNew != SVal_INVALID);
5078      if (HG_(clo_history_level) >= 2
5079          && SVal__isC(svOld) && SVal__isC(svNew)) {
5080         event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
5081         stats__msmcwrite_change++;
5082      }
5083   }
5084   return svNew;
5085}
5086
5087
5088/////////////////////////////////////////////////////////
5089//                                                     //
5090// Apply core MSM to specific memory locations         //
5091//                                                     //
5092/////////////////////////////////////////////////////////
5093
5094/*------------- ZSM accesses: 8 bit sapply ------------- */
5095
5096static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
5097   CacheLine* cl;
5098   UWord      cloff, tno, toff;
5099   SVal       svOld, svNew;
5100   UShort     descr;
5101   stats__cline_cread08s++;
5102   cl    = get_cacheline(a);
5103   cloff = get_cacheline_offset(a);
5104   tno   = get_treeno(a);
5105   toff  = get_tree_offset(a); /* == 0 .. 7 */
5106   descr = cl->descrs[tno];
5107   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5108      SVal* tree = &cl->svals[tno << 3];
5109      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5110      if (CHECK_ZSM)
5111         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5112   }
5113   svOld = cl->svals[cloff];
5114   svNew = msmcread( svOld, thr,a,1 );
5115   if (CHECK_ZSM)
5116      tl_assert(svNew != SVal_INVALID);
5117   cl->svals[cloff] = svNew;
5118}
5119
5120static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
5121   CacheLine* cl;
5122   UWord      cloff, tno, toff;
5123   SVal       svOld, svNew;
5124   UShort     descr;
5125   stats__cline_cwrite08s++;
5126   cl    = get_cacheline(a);
5127   cloff = get_cacheline_offset(a);
5128   tno   = get_treeno(a);
5129   toff  = get_tree_offset(a); /* == 0 .. 7 */
5130   descr = cl->descrs[tno];
5131   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5132      SVal* tree = &cl->svals[tno << 3];
5133      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5134      if (CHECK_ZSM)
5135         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5136   }
5137   svOld = cl->svals[cloff];
5138   svNew = msmcwrite( svOld, thr,a,1 );
5139   if (CHECK_ZSM)
5140      tl_assert(svNew != SVal_INVALID);
5141   cl->svals[cloff] = svNew;
5142}
5143
5144/*------------- ZSM accesses: 16 bit sapply ------------- */
5145
5146static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
5147   CacheLine* cl;
5148   UWord      cloff, tno, toff;
5149   SVal       svOld, svNew;
5150   UShort     descr;
5151   stats__cline_cread16s++;
5152   if (UNLIKELY(!aligned16(a))) goto slowcase;
5153   cl    = get_cacheline(a);
5154   cloff = get_cacheline_offset(a);
5155   tno   = get_treeno(a);
5156   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5157   descr = cl->descrs[tno];
5158   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5159      if (valid_value_is_below_me_16(descr, toff)) {
5160         goto slowcase;
5161      } else {
5162         SVal* tree = &cl->svals[tno << 3];
5163         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5164      }
5165      if (CHECK_ZSM)
5166         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5167   }
5168   svOld = cl->svals[cloff];
5169   svNew = msmcread( svOld, thr,a,2 );
5170   if (CHECK_ZSM)
5171      tl_assert(svNew != SVal_INVALID);
5172   cl->svals[cloff] = svNew;
5173   return;
5174  slowcase: /* misaligned, or must go further down the tree */
5175   stats__cline_16to8splits++;
5176   zsm_sapply08__msmcread( thr, a + 0 );
5177   zsm_sapply08__msmcread( thr, a + 1 );
5178}
5179
5180static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
5181   CacheLine* cl;
5182   UWord      cloff, tno, toff;
5183   SVal       svOld, svNew;
5184   UShort     descr;
5185   stats__cline_cwrite16s++;
5186   if (UNLIKELY(!aligned16(a))) goto slowcase;
5187   cl    = get_cacheline(a);
5188   cloff = get_cacheline_offset(a);
5189   tno   = get_treeno(a);
5190   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5191   descr = cl->descrs[tno];
5192   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5193      if (valid_value_is_below_me_16(descr, toff)) {
5194         goto slowcase;
5195      } else {
5196         SVal* tree = &cl->svals[tno << 3];
5197         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5198      }
5199      if (CHECK_ZSM)
5200         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5201   }
5202   svOld = cl->svals[cloff];
5203   svNew = msmcwrite( svOld, thr,a,2 );
5204   if (CHECK_ZSM)
5205      tl_assert(svNew != SVal_INVALID);
5206   cl->svals[cloff] = svNew;
5207   return;
5208  slowcase: /* misaligned, or must go further down the tree */
5209   stats__cline_16to8splits++;
5210   zsm_sapply08__msmcwrite( thr, a + 0 );
5211   zsm_sapply08__msmcwrite( thr, a + 1 );
5212}
5213
5214/*------------- ZSM accesses: 32 bit sapply ------------- */
5215
5216static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
5217   CacheLine* cl;
5218   UWord      cloff, tno, toff;
5219   SVal       svOld, svNew;
5220   UShort     descr;
5221   stats__cline_cread32s++;
5222   if (UNLIKELY(!aligned32(a))) goto slowcase;
5223   cl    = get_cacheline(a);
5224   cloff = get_cacheline_offset(a);
5225   tno   = get_treeno(a);
5226   toff  = get_tree_offset(a); /* == 0 or 4 */
5227   descr = cl->descrs[tno];
5228   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5229      if (valid_value_is_above_me_32(descr, toff)) {
5230         SVal* tree = &cl->svals[tno << 3];
5231         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5232      } else {
5233         goto slowcase;
5234      }
5235      if (CHECK_ZSM)
5236         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5237   }
5238   svOld = cl->svals[cloff];
5239   svNew = msmcread( svOld, thr,a,4 );
5240   if (CHECK_ZSM)
5241      tl_assert(svNew != SVal_INVALID);
5242   cl->svals[cloff] = svNew;
5243   return;
5244  slowcase: /* misaligned, or must go further down the tree */
5245   stats__cline_32to16splits++;
5246   zsm_sapply16__msmcread( thr, a + 0 );
5247   zsm_sapply16__msmcread( thr, a + 2 );
5248}
5249
5250static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
5251   CacheLine* cl;
5252   UWord      cloff, tno, toff;
5253   SVal       svOld, svNew;
5254   UShort     descr;
5255   stats__cline_cwrite32s++;
5256   if (UNLIKELY(!aligned32(a))) goto slowcase;
5257   cl    = get_cacheline(a);
5258   cloff = get_cacheline_offset(a);
5259   tno   = get_treeno(a);
5260   toff  = get_tree_offset(a); /* == 0 or 4 */
5261   descr = cl->descrs[tno];
5262   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5263      if (valid_value_is_above_me_32(descr, toff)) {
5264         SVal* tree = &cl->svals[tno << 3];
5265         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5266      } else {
5267         goto slowcase;
5268      }
5269      if (CHECK_ZSM)
5270         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5271   }
5272   svOld = cl->svals[cloff];
5273   svNew = msmcwrite( svOld, thr,a,4 );
5274   if (CHECK_ZSM)
5275      tl_assert(svNew != SVal_INVALID);
5276   cl->svals[cloff] = svNew;
5277   return;
5278  slowcase: /* misaligned, or must go further down the tree */
5279   stats__cline_32to16splits++;
5280   zsm_sapply16__msmcwrite( thr, a + 0 );
5281   zsm_sapply16__msmcwrite( thr, a + 2 );
5282}
5283
5284/*------------- ZSM accesses: 64 bit sapply ------------- */
5285
5286static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
5287   CacheLine* cl;
5288   UWord      cloff, tno;
5289   //UWord      toff;
5290   SVal       svOld, svNew;
5291   UShort     descr;
5292   stats__cline_cread64s++;
5293   if (UNLIKELY(!aligned64(a))) goto slowcase;
5294   cl    = get_cacheline(a);
5295   cloff = get_cacheline_offset(a);
5296   tno   = get_treeno(a);
5297   //toff  = get_tree_offset(a); /* == 0, unused */
5298   descr = cl->descrs[tno];
5299   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5300      goto slowcase;
5301   }
5302   svOld = cl->svals[cloff];
5303   svNew = msmcread( svOld, thr,a,8 );
5304   if (CHECK_ZSM)
5305      tl_assert(svNew != SVal_INVALID);
5306   cl->svals[cloff] = svNew;
5307   return;
5308  slowcase: /* misaligned, or must go further down the tree */
5309   stats__cline_64to32splits++;
5310   zsm_sapply32__msmcread( thr, a + 0 );
5311   zsm_sapply32__msmcread( thr, a + 4 );
5312}
5313
5314static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
5315   CacheLine* cl;
5316   UWord      cloff, tno;
5317   //UWord      toff;
5318   SVal       svOld, svNew;
5319   UShort     descr;
5320   stats__cline_cwrite64s++;
5321   if (UNLIKELY(!aligned64(a))) goto slowcase;
5322   cl    = get_cacheline(a);
5323   cloff = get_cacheline_offset(a);
5324   tno   = get_treeno(a);
5325   //toff  = get_tree_offset(a); /* == 0, unused */
5326   descr = cl->descrs[tno];
5327   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5328      goto slowcase;
5329   }
5330   svOld = cl->svals[cloff];
5331   svNew = msmcwrite( svOld, thr,a,8 );
5332   if (CHECK_ZSM)
5333      tl_assert(svNew != SVal_INVALID);
5334   cl->svals[cloff] = svNew;
5335   return;
5336  slowcase: /* misaligned, or must go further down the tree */
5337   stats__cline_64to32splits++;
5338   zsm_sapply32__msmcwrite( thr, a + 0 );
5339   zsm_sapply32__msmcwrite( thr, a + 4 );
5340}
5341
5342/*--------------- ZSM accesses: 8 bit swrite --------------- */
5343
5344static
5345void zsm_swrite08 ( Addr a, SVal svNew ) {
5346   CacheLine* cl;
5347   UWord      cloff, tno, toff;
5348   UShort     descr;
5349   stats__cline_swrite08s++;
5350   cl    = get_cacheline(a);
5351   cloff = get_cacheline_offset(a);
5352   tno   = get_treeno(a);
5353   toff  = get_tree_offset(a); /* == 0 .. 7 */
5354   descr = cl->descrs[tno];
5355   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5356      SVal* tree = &cl->svals[tno << 3];
5357      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5358      if (CHECK_ZSM)
5359         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5360   }
5361   tl_assert(svNew != SVal_INVALID);
5362   cl->svals[cloff] = svNew;
5363}
5364
5365/*--------------- ZSM accesses: 16 bit swrite --------------- */
5366
5367static
5368void zsm_swrite16 ( Addr a, SVal svNew ) {
5369   CacheLine* cl;
5370   UWord      cloff, tno, toff;
5371   UShort     descr;
5372   stats__cline_swrite16s++;
5373   if (UNLIKELY(!aligned16(a))) goto slowcase;
5374   cl    = get_cacheline(a);
5375   cloff = get_cacheline_offset(a);
5376   tno   = get_treeno(a);
5377   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5378   descr = cl->descrs[tno];
5379   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5380      if (valid_value_is_below_me_16(descr, toff)) {
5381         /* Writing at this level.  Need to fix up 'descr'. */
5382         cl->descrs[tno] = pullup_descr_to_16(descr, toff);
5383         /* At this point, the tree does not match cl->descrs[tno] any
5384            more.  The assignments below will fix it up. */
5385      } else {
5386         /* We can't indiscriminately write on the w16 node as in the
5387            w64 case, as that might make the node inconsistent with
5388            its parent.  So first, pull down to this level. */
5389         SVal* tree = &cl->svals[tno << 3];
5390         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5391         if (CHECK_ZSM)
5392            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5393      }
5394   }
5395   tl_assert(svNew != SVal_INVALID);
5396   cl->svals[cloff + 0] = svNew;
5397   cl->svals[cloff + 1] = SVal_INVALID;
5398   return;
5399  slowcase: /* misaligned */
5400   stats__cline_16to8splits++;
5401   zsm_swrite08( a + 0, svNew );
5402   zsm_swrite08( a + 1, svNew );
5403}
5404
5405/*--------------- ZSM accesses: 32 bit swrite --------------- */
5406
5407static
5408void zsm_swrite32 ( Addr a, SVal svNew ) {
5409   CacheLine* cl;
5410   UWord      cloff, tno, toff;
5411   UShort     descr;
5412   stats__cline_swrite32s++;
5413   if (UNLIKELY(!aligned32(a))) goto slowcase;
5414   cl    = get_cacheline(a);
5415   cloff = get_cacheline_offset(a);
5416   tno   = get_treeno(a);
5417   toff  = get_tree_offset(a); /* == 0 or 4 */
5418   descr = cl->descrs[tno];
5419   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5420      if (valid_value_is_above_me_32(descr, toff)) {
5421         /* We can't indiscriminately write on the w32 node as in the
5422            w64 case, as that might make the node inconsistent with
5423            its parent.  So first, pull down to this level. */
5424         SVal* tree = &cl->svals[tno << 3];
5425         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5426         if (CHECK_ZSM)
5427            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5428      } else {
5429         /* Writing at this level.  Need to fix up 'descr'. */
5430         cl->descrs[tno] = pullup_descr_to_32(descr, toff);
5431         /* At this point, the tree does not match cl->descrs[tno] any
5432            more.  The assignments below will fix it up. */
5433      }
5434   }
5435   tl_assert(svNew != SVal_INVALID);
5436   cl->svals[cloff + 0] = svNew;
5437   cl->svals[cloff + 1] = SVal_INVALID;
5438   cl->svals[cloff + 2] = SVal_INVALID;
5439   cl->svals[cloff + 3] = SVal_INVALID;
5440   return;
5441  slowcase: /* misaligned */
5442   stats__cline_32to16splits++;
5443   zsm_swrite16( a + 0, svNew );
5444   zsm_swrite16( a + 2, svNew );
5445}
5446
5447/*--------------- ZSM accesses: 64 bit swrite --------------- */
5448
5449static
5450void zsm_swrite64 ( Addr a, SVal svNew ) {
5451   CacheLine* cl;
5452   UWord      cloff, tno;
5453   //UWord    toff;
5454   stats__cline_swrite64s++;
5455   if (UNLIKELY(!aligned64(a))) goto slowcase;
5456   cl    = get_cacheline(a);
5457   cloff = get_cacheline_offset(a);
5458   tno   = get_treeno(a);
5459   //toff  = get_tree_offset(a); /* == 0, unused */
5460   cl->descrs[tno] = TREE_DESCR_64;
5461   tl_assert(svNew != SVal_INVALID);
5462   cl->svals[cloff + 0] = svNew;
5463   cl->svals[cloff + 1] = SVal_INVALID;
5464   cl->svals[cloff + 2] = SVal_INVALID;
5465   cl->svals[cloff + 3] = SVal_INVALID;
5466   cl->svals[cloff + 4] = SVal_INVALID;
5467   cl->svals[cloff + 5] = SVal_INVALID;
5468   cl->svals[cloff + 6] = SVal_INVALID;
5469   cl->svals[cloff + 7] = SVal_INVALID;
5470   return;
5471  slowcase: /* misaligned */
5472   stats__cline_64to32splits++;
5473   zsm_swrite32( a + 0, svNew );
5474   zsm_swrite32( a + 4, svNew );
5475}
5476
5477/*------------- ZSM accesses: 8 bit sread/scopy ------------- */
5478
5479static
5480SVal zsm_sread08 ( Addr a ) {
5481   CacheLine* cl;
5482   UWord      cloff, tno, toff;
5483   UShort     descr;
5484   stats__cline_sread08s++;
5485   cl    = get_cacheline(a);
5486   cloff = get_cacheline_offset(a);
5487   tno   = get_treeno(a);
5488   toff  = get_tree_offset(a); /* == 0 .. 7 */
5489   descr = cl->descrs[tno];
5490   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5491      SVal* tree = &cl->svals[tno << 3];
5492      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5493   }
5494   return cl->svals[cloff];
5495}
5496
5497static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
5498   SVal       sv;
5499   stats__cline_scopy08s++;
5500   sv = zsm_sread08( src );
5501   zsm_swrite08( dst, sv );
5502}
5503
5504
5505/* Block-copy states (needed for implementing realloc()).  Note this
5506   doesn't change the filtering arrangements.  The caller of
5507   zsm_scopy_range needs to attend to that. */
5508
5509static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
5510{
5511   SizeT i;
5512   if (len == 0)
5513      return;
5514
5515   /* assert for non-overlappingness */
5516   tl_assert(src+len <= dst || dst+len <= src);
5517
5518   /* To keep it simple, just copy byte by byte.  But so as not to wreck
5519      performance for later accesses to dst[0 .. len-1], normalise
5520      destination lines as we finish with them, and also normalise the
5521      lines containing the first and last addresses. */
5522   for (i = 0; i < len; i++) {
5523      Bool normalise
5524         = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
5525           || i == 0       /* first in range */
5526           || i == len-1;  /* last in range */
5527      zsm_scopy08( src+i, dst+i, normalise );
5528   }
5529}
5530
5531
5532/* For setting address ranges to a given value.  Has considerable
5533   sophistication so as to avoid generating large numbers of pointless
5534   cache loads/writebacks for large ranges. */
5535
5536/* Do small ranges in-cache, in the obvious way. */
5537static
5538void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
5539{
5540   /* fast track a couple of common cases */
5541   if (len == 4 && aligned32(a)) {
5542      zsm_swrite32( a, svNew );
5543      return;
5544   }
5545   if (len == 8 && aligned64(a)) {
5546      zsm_swrite64( a, svNew );
5547      return;
5548   }
5549
5550   /* be completely general (but as efficient as possible) */
5551   if (len == 0) return;
5552
5553   if (!aligned16(a) && len >= 1) {
5554      zsm_swrite08( a, svNew );
5555      a += 1;
5556      len -= 1;
5557      tl_assert(aligned16(a));
5558   }
5559   if (len == 0) return;
5560
5561   if (!aligned32(a) && len >= 2) {
5562      zsm_swrite16( a, svNew );
5563      a += 2;
5564      len -= 2;
5565      tl_assert(aligned32(a));
5566   }
5567   if (len == 0) return;
5568
5569   if (!aligned64(a) && len >= 4) {
5570      zsm_swrite32( a, svNew );
5571      a += 4;
5572      len -= 4;
5573      tl_assert(aligned64(a));
5574   }
5575   if (len == 0) return;
5576
5577   if (len >= 8) {
5578      tl_assert(aligned64(a));
5579      while (len >= 8) {
5580         zsm_swrite64( a, svNew );
5581         a += 8;
5582         len -= 8;
5583      }
5584      tl_assert(aligned64(a));
5585   }
5586   if (len == 0) return;
5587
5588   if (len >= 4)
5589      tl_assert(aligned32(a));
5590   if (len >= 4) {
5591      zsm_swrite32( a, svNew );
5592      a += 4;
5593      len -= 4;
5594   }
5595   if (len == 0) return;
5596
5597   if (len >= 2)
5598      tl_assert(aligned16(a));
5599   if (len >= 2) {
5600      zsm_swrite16( a, svNew );
5601      a += 2;
5602      len -= 2;
5603   }
5604   if (len == 0) return;
5605
5606   if (len >= 1) {
5607      zsm_swrite08( a, svNew );
5608      //a += 1;
5609      len -= 1;
5610   }
5611   tl_assert(len == 0);
5612}
5613
5614
5615/* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
5616   for larger ranges, try to operate directly on the out-of-cache
5617   representation, rather than dragging lines into the cache,
5618   overwriting them, and forcing them out.  This turns out to be an
5619   important performance optimisation.
5620
5621   Note that this doesn't change the filtering arrangements.  The
5622   caller of zsm_sset_range needs to attend to that. */
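
/* Worked example (assuming, purely for illustration, that N_LINE_ARANGE
   is 64): a call with a = 0x1005 and len = 300 gives
      before_start = 0x1005, aligned_start = 0x1040,
      after_start  = 0x1100, a + len       = 0x1131
   so the 59-byte prefix and the 49-byte suffix go through
   zsm_sset_range_SMALL, while the 192-byte aligned middle (3 whole
   lines) is written either in-cache or directly into the LineZ
   representation, depending on whether each line is currently
   cached. */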
5623
5624static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
5625{
5626   tl_assert(svNew != SVal_INVALID);
5627   stats__cache_make_New_arange += (ULong)len;
5628
5629   if (0 && len > 500)
5630      VG_(printf)("make New      ( %#lx, %ld )\n", a, len );
5631
5632   if (0) {
5633      static UWord n_New_in_cache = 0;
5634      static UWord n_New_not_in_cache = 0;
5635      /* tag is 'a' with the in-line offset masked out,
5636         eg a[31]..a[4] 0000 */
5637      Addr       tag = a & ~(N_LINE_ARANGE - 1);
5638      UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
5639      if (LIKELY(tag == cache_shmem.tags0[wix])) {
5640         n_New_in_cache++;
5641      } else {
5642         n_New_not_in_cache++;
5643      }
5644      if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
5645         VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
5646                     n_New_in_cache, n_New_not_in_cache );
5647   }
5648
5649   if (LIKELY(len < 2 * N_LINE_ARANGE)) {
5650      zsm_sset_range_SMALL( a, len, svNew );
5651   } else {
5652      Addr  before_start  = a;
5653      Addr  aligned_start = cacheline_ROUNDUP(a);
5654      Addr  after_start   = cacheline_ROUNDDN(a + len);
5655      UWord before_len    = aligned_start - before_start;
5656      UWord aligned_len   = after_start - aligned_start;
5657      UWord after_len     = a + len - after_start;
5658      tl_assert(before_start <= aligned_start);
5659      tl_assert(aligned_start <= after_start);
5660      tl_assert(before_len < N_LINE_ARANGE);
5661      tl_assert(after_len < N_LINE_ARANGE);
5662      tl_assert(get_cacheline_offset(aligned_start) == 0);
5663      if (get_cacheline_offset(a) == 0) {
5664         tl_assert(before_len == 0);
5665         tl_assert(a == aligned_start);
5666      }
5667      if (get_cacheline_offset(a+len) == 0) {
5668         tl_assert(after_len == 0);
5669         tl_assert(after_start == a+len);
5670      }
5671      if (before_len > 0) {
5672         zsm_sset_range_SMALL( before_start, before_len, svNew );
5673      }
5674      if (after_len > 0) {
5675         zsm_sset_range_SMALL( after_start, after_len, svNew );
5676      }
5677      stats__cache_make_New_inZrep += (ULong)aligned_len;
5678
5679      while (1) {
5680         Addr tag;
5681         UWord wix;
5682         if (aligned_start >= after_start)
5683            break;
5684         tl_assert(get_cacheline_offset(aligned_start) == 0);
5685         tag = aligned_start & ~(N_LINE_ARANGE - 1);
5686         wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
5687         if (tag == cache_shmem.tags0[wix]) {
5688            UWord i;
5689            for (i = 0; i < N_LINE_ARANGE / 8; i++)
5690               zsm_swrite64( aligned_start + i * 8, svNew );
5691         } else {
5692            UWord i;
5693            Word zix;
5694            SecMap* sm;
5695            LineZ* lineZ;
5696            /* This line is not in the cache.  Do not force it in; instead
5697               modify it in-place. */
5698            /* find the Z line to write in and rcdec it or the
5699               associated F line. */
5700            find_Z_for_writing( &sm, &zix, tag );
5701            tl_assert(sm);
5702            tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
5703            lineZ = &sm->linesZ[zix];
5704            lineZ->dict[0] = svNew;
5705            lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
5706            for (i = 0; i < N_LINE_ARANGE/4; i++)
5707               lineZ->ix2s[i] = 0; /* all refer to dict[0] */
5708            rcinc_LineZ(lineZ);
5709         }
5710         aligned_start += N_LINE_ARANGE;
5711         aligned_len -= N_LINE_ARANGE;
5712      }
5713      tl_assert(aligned_start == after_start);
5714      tl_assert(aligned_len == 0);
5715   }
5716}
5717
5718
5719/////////////////////////////////////////////////////////
5720//                                                     //
5721// Front-filtering accesses                            //
5722//                                                     //
5723/////////////////////////////////////////////////////////
5724
5725static UWord stats__f_ac = 0;
5726static UWord stats__f_sk = 0;
5727
5728#if 0
5729#  define STATS__F_SHOW \
5730     do { \
5731        if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
5732           VG_(printf)("filters: ac %lu sk %lu\n",   \
5733           stats__f_ac, stats__f_sk); \
5734     } while (0)
5735#else
5736#  define STATS__F_SHOW /* */
5737#endif
5738
5739void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
5740   stats__f_ac++;
5741   STATS__F_SHOW;
5742   if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
5743      stats__f_sk++;
5744      return;
5745   }
5746   zsm_sapply08__msmcwrite(thr, a);
5747}
5748
5749void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
5750   stats__f_ac++;
5751   STATS__F_SHOW;
5752   if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
5753      stats__f_sk++;
5754      return;
5755   }
5756   zsm_sapply16__msmcwrite(thr, a);
5757}
5758
5759void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
5760   stats__f_ac++;
5761   STATS__F_SHOW;
5762   if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
5763      stats__f_sk++;
5764      return;
5765   }
5766   zsm_sapply32__msmcwrite(thr, a);
5767}
5768
5769void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
5770   stats__f_ac++;
5771   STATS__F_SHOW;
5772   if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
5773      stats__f_sk++;
5774      return;
5775   }
5776   zsm_sapply64__msmcwrite(thr, a);
5777}
5778
5779void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
5780{
5781   /* fast track a couple of common cases */
5782   if (len == 4 && aligned32(a)) {
5783      zsm_sapply32_f__msmcwrite( thr, a );
5784      return;
5785   }
5786   if (len == 8 && aligned64(a)) {
5787      zsm_sapply64_f__msmcwrite( thr, a );
5788      return;
5789   }
5790
5791   /* be completely general (but as efficient as possible) */
5792   if (len == 0) return;
5793
5794   if (!aligned16(a) && len >= 1) {
5795      zsm_sapply08_f__msmcwrite( thr, a );
5796      a += 1;
5797      len -= 1;
5798      tl_assert(aligned16(a));
5799   }
5800   if (len == 0) return;
5801
5802   if (!aligned32(a) && len >= 2) {
5803      zsm_sapply16_f__msmcwrite( thr, a );
5804      a += 2;
5805      len -= 2;
5806      tl_assert(aligned32(a));
5807   }
5808   if (len == 0) return;
5809
5810   if (!aligned64(a) && len >= 4) {
5811      zsm_sapply32_f__msmcwrite( thr, a );
5812      a += 4;
5813      len -= 4;
5814      tl_assert(aligned64(a));
5815   }
5816   if (len == 0) return;
5817
5818   if (len >= 8) {
5819      tl_assert(aligned64(a));
5820      while (len >= 8) {
5821         zsm_sapply64_f__msmcwrite( thr, a );
5822         a += 8;
5823         len -= 8;
5824      }
5825      tl_assert(aligned64(a));
5826   }
5827   if (len == 0) return;
5828
5829   if (len >= 4)
5830      tl_assert(aligned32(a));
5831   if (len >= 4) {
5832      zsm_sapply32_f__msmcwrite( thr, a );
5833      a += 4;
5834      len -= 4;
5835   }
5836   if (len == 0) return;
5837
5838   if (len >= 2)
5839      tl_assert(aligned16(a));
5840   if (len >= 2) {
5841      zsm_sapply16_f__msmcwrite( thr, a );
5842      a += 2;
5843      len -= 2;
5844   }
5845   if (len == 0) return;
5846
5847   if (len >= 1) {
5848      zsm_sapply08_f__msmcwrite( thr, a );
5849      //a += 1;
5850      len -= 1;
5851   }
5852   tl_assert(len == 0);
5853}
5854
5855void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
5856   stats__f_ac++;
5857   STATS__F_SHOW;
5858   if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
5859      stats__f_sk++;
5860      return;
5861   }
5862   zsm_sapply08__msmcread(thr, a);
5863}
5864
5865void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
5866   stats__f_ac++;
5867   STATS__F_SHOW;
5868   if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
5869      stats__f_sk++;
5870      return;
5871   }
5872   zsm_sapply16__msmcread(thr, a);
5873}
5874
5875void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
5876   stats__f_ac++;
5877   STATS__F_SHOW;
5878   if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
5879      stats__f_sk++;
5880      return;
5881   }
5882   zsm_sapply32__msmcread(thr, a);
5883}
5884
5885void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
5886   stats__f_ac++;
5887   STATS__F_SHOW;
5888   if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
5889      stats__f_sk++;
5890      return;
5891   }
5892   zsm_sapply64__msmcread(thr, a);
5893}
5894
5895void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
5896{
5897   /* fast track a couple of common cases */
5898   if (len == 4 && aligned32(a)) {
5899      zsm_sapply32_f__msmcread( thr, a );
5900      return;
5901   }
5902   if (len == 8 && aligned64(a)) {
5903      zsm_sapply64_f__msmcread( thr, a );
5904      return;
5905   }
5906
5907   /* be completely general (but as efficient as possible) */
5908   if (len == 0) return;
5909
5910   if (!aligned16(a) && len >= 1) {
5911      zsm_sapply08_f__msmcread( thr, a );
5912      a += 1;
5913      len -= 1;
5914      tl_assert(aligned16(a));
5915   }
5916   if (len == 0) return;
5917
5918   if (!aligned32(a) && len >= 2) {
5919      zsm_sapply16_f__msmcread( thr, a );
5920      a += 2;
5921      len -= 2;
5922      tl_assert(aligned32(a));
5923   }
5924   if (len == 0) return;
5925
5926   if (!aligned64(a) && len >= 4) {
5927      zsm_sapply32_f__msmcread( thr, a );
5928      a += 4;
5929      len -= 4;
5930      tl_assert(aligned64(a));
5931   }
5932   if (len == 0) return;
5933
5934   if (len >= 8) {
5935      tl_assert(aligned64(a));
5936      while (len >= 8) {
5937         zsm_sapply64_f__msmcread( thr, a );
5938         a += 8;
5939         len -= 8;
5940      }
5941      tl_assert(aligned64(a));
5942   }
5943   if (len == 0) return;
5944
5945   if (len >= 4)
5946      tl_assert(aligned32(a));
5947   if (len >= 4) {
5948      zsm_sapply32_f__msmcread( thr, a );
5949      a += 4;
5950      len -= 4;
5951   }
5952   if (len == 0) return;
5953
5954   if (len >= 2)
5955      tl_assert(aligned16(a));
5956   if (len >= 2) {
5957      zsm_sapply16_f__msmcread( thr, a );
5958      a += 2;
5959      len -= 2;
5960   }
5961   if (len == 0) return;
5962
5963   if (len >= 1) {
5964      zsm_sapply08_f__msmcread( thr, a );
5965      //a += 1;
5966      len -= 1;
5967   }
5968   tl_assert(len == 0);
5969}
5970
5971void libhb_Thr_resumes ( Thr* thr )
5972{
5973   if (0) VG_(printf)("resume %p\n", thr);
5974   tl_assert(thr);
5975   tl_assert(!thr->llexit_done);
5976   Filter__clear(thr->filter, "libhb_Thr_resumes");
5977   /* A kludge, but .. if this thread doesn't have any marker stacks
5978      at all, get one right now.  This is easier than figuring out
5979      exactly when at thread startup we can and can't take a stack
5980      snapshot. */
5981   if (HG_(clo_history_level) == 1) {
5982      tl_assert(thr->local_Kws_n_stacks);
5983      if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
5984         note_local_Kw_n_stack_for(thr);
5985   }
5986}
5987
5988
5989/////////////////////////////////////////////////////////
5990//                                                     //
5991// Synchronisation objects                             //
5992//                                                     //
5993/////////////////////////////////////////////////////////
5994
5995/* A double linked list of all the SO's. */
5996SO* admin_SO = NULL;
5997
5998static SO* SO__Alloc ( void )
5999{
6000   SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
6001   so->viR   = VtsID_INVALID;
6002   so->viW   = VtsID_INVALID;
6003   so->magic = SO_MAGIC;
6004   /* Add to double linked list */
6005   if (admin_SO) {
6006      tl_assert(admin_SO->admin_prev == NULL);
6007      admin_SO->admin_prev = so;
6008      so->admin_next = admin_SO;
6009   } else {
6010      so->admin_next = NULL;
6011   }
6012   so->admin_prev = NULL;
6013   admin_SO = so;
6014   /* */
6015   return so;
6016}
6017
6018static void SO__Dealloc ( SO* so )
6019{
6020   tl_assert(so);
6021   tl_assert(so->magic == SO_MAGIC);
6022   if (so->viR == VtsID_INVALID) {
6023      tl_assert(so->viW == VtsID_INVALID);
6024   } else {
6025      tl_assert(so->viW != VtsID_INVALID);
6026      VtsID__rcdec(so->viR);
6027      VtsID__rcdec(so->viW);
6028   }
6029   so->magic = 0;
6030   /* Del from double linked list */
6031   if (so->admin_prev)
6032      so->admin_prev->admin_next = so->admin_next;
6033   if (so->admin_next)
6034      so->admin_next->admin_prev = so->admin_prev;
6035   if (so == admin_SO)
6036      admin_SO = so->admin_next;
6037   /* */
6038   HG_(free)( so );
6039}
6040
6041
6042/////////////////////////////////////////////////////////
6043//                                                     //
6044// Top Level API                                       //
6045//                                                     //
6046/////////////////////////////////////////////////////////
6047
6048static void show_thread_state ( const HChar* str, Thr* t )
6049{
6050   if (1) return;
6051   if (t->viR == t->viW) {
6052      VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
6053      VtsID__pp( t->viR );
6054      VG_(printf)("%s","\n");
6055   } else {
6056      VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
6057      VtsID__pp( t->viR );
6058      VG_(printf)(" viW %u==", t->viW);
6059      VtsID__pp( t->viW );
6060      VG_(printf)("%s","\n");
6061   }
6062}
6063
6064
6065Thr* libhb_init (
6066        void        (*get_stacktrace)( Thr*, Addr*, UWord ),
6067        ExeContext* (*get_EC)( Thr* )
6068     )
6069{
6070   Thr*  thr;
6071   VtsID vi;
6072
6073   // We will have to store a large number of these,
6074   // so make sure they're the size we expect them to be.
6075   tl_assert(sizeof(ScalarTS) == 8);
6076
6077   /* because the first 1024 ThrIDs are unusable */
6078   tl_assert(SCALARTS_N_THRBITS >= 11);
6079   /* so as to fit in a UInt w/ 3 bits to spare (see defn of
6080      Thr_n_RCEC). */
6081   tl_assert(SCALARTS_N_THRBITS <= 29);
6082
6083   /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
6084      (32-bit).  It's not correctness-critical, but there are a lot of
6085      them, so it's important from a space viewpoint.  Unfortunately
6086      we simply can't pack it into 2 words on a 32-bit target. */
6087   if (sizeof(UWord) == 8) {
6088      tl_assert(sizeof(Thr_n_RCEC) == 16);
6089   } else {
6090      tl_assert(sizeof(Thr_n_RCEC) == 12);
6091   }
6092
6093   /* Word sets really are 32 bits.  Even on a 64 bit target. */
6094   tl_assert(sizeof(WordSetID) == 4);
6095   tl_assert(sizeof(WordSet) == sizeof(WordSetID));
6096
6097   tl_assert(get_stacktrace);
6098   tl_assert(get_EC);
6099   main_get_stacktrace   = get_stacktrace;
6100   main_get_EC           = get_EC;
6101
6102   // No need to initialise hg_wordfm.
6103   // No need to initialise hg_wordset.
6104
6105   /* Allocated once and never deallocated.  Used as a temporary in
6106      VTS singleton, tick and join operations. */
6107   temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
6108   temp_max_sized_VTS->id = VtsID_INVALID;
6109   verydead_thread_table_init();
6110   vts_set_init();
6111   vts_tab_init();
6112   event_map_init();
6113   VtsID__invalidate_caches();
6114
6115   // initialise shadow memory
6116   zsm_init( SVal__rcinc, SVal__rcdec );
6117
6118   thr = Thr__new();
6119   vi  = VtsID__mk_Singleton( thr, 1 );
6120   thr->viR = vi;
6121   thr->viW = vi;
6122   VtsID__rcinc(thr->viR);
6123   VtsID__rcinc(thr->viW);
6124
6125   show_thread_state("  root", thr);
6126   return thr;
6127}
6128
6129
6130Thr* libhb_create ( Thr* parent )
6131{
6132   /* The child's VTSs are copies of the parent's VTSs, but ticked at
6133      the child's index.  Since the child's index is guaranteed
6134      unique, it has never been seen before, so the implicit value
6135      before the tick is zero and after that is one. */
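   /* For example (clock values invented for this comment): if the
      parent's VTSs are { parent:4, other:7 }, then after the ticks
      below the child gets { parent:4, other:7, child:1 } while the
      parent advances to { parent:5, other:7 }; hence the asserts
      further down that the child's own index is exactly 1. */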
6136   Thr* child = Thr__new();
6137
6138   child->viR = VtsID__tick( parent->viR, child );
6139   child->viW = VtsID__tick( parent->viW, child );
6140   Filter__clear(child->filter, "libhb_create(child)");
6141   VtsID__rcinc(child->viR);
6142   VtsID__rcinc(child->viW);
6143   /* We need to do note_local_Kw_n_stack_for( child ), but it's too
6144      early for that - it may not have a valid TId yet.  So, let
6145      libhb_Thr_resumes pick it up the first time the thread runs. */
6146
6147   tl_assert(VtsID__indexAt( child->viR, child ) == 1);
6148   tl_assert(VtsID__indexAt( child->viW, child ) == 1);
6149
6150   /* and the parent has to move along too */
6151   VtsID__rcdec(parent->viR);
6152   VtsID__rcdec(parent->viW);
6153   parent->viR = VtsID__tick( parent->viR, parent );
6154   parent->viW = VtsID__tick( parent->viW, parent );
6155   Filter__clear(parent->filter, "libhb_create(parent)");
6156   VtsID__rcinc(parent->viR);
6157   VtsID__rcinc(parent->viW);
6158   note_local_Kw_n_stack_for( parent );
6159
6160   show_thread_state(" child", child);
6161   show_thread_state("parent", parent);
6162
6163   return child;
6164}
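
/* A worked example of the above (purely illustrative; clocks written
   as { thread: count }): if the parent enters libhb_create with
   viR == viW == { P:3 }, then afterwards the child has { P:3, C:1 }
   and the parent has { P:4 }.  Hence the two indexAt assertions
   above.  Everything the parent did up to the create therefore
   happens-before everything the child does, whereas the parent's
   subsequent events (at P:4 and later) remain unordered with respect
   to the child until some later synchronisation. */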
6165
6166/* Shut down the library, and print stats (in fact that's _all_
6167   this is for). */
6168void libhb_shutdown ( Bool show_stats )
6169{
6170   if (show_stats) {
6171      VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
6172      VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
6173                  stats__secmaps_allocd,
6174                  stats__secmap_ga_space_covered);
6175      VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
6176                  stats__secmap_linesZ_allocd,
6177                  stats__secmap_linesZ_bytes);
6178      VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)\n",
6179                  stats__secmap_linesF_allocd,
6180                  stats__secmap_linesF_bytes);
6181      VG_(printf)(" secmaps: %'10lu iterator steppings\n",
6182                  stats__secmap_iterator_steppings);
6183      VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
6184                  stats__secmaps_search, stats__secmaps_search_slow);
6185
6186      VG_(printf)("%s","\n");
6187      VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
6188                  stats__cache_totrefs, stats__cache_totmisses );
6189      VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
6190                  stats__cache_Z_fetches, stats__cache_F_fetches );
6191      VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
6192                  stats__cache_Z_wbacks, stats__cache_F_wbacks );
6193      VG_(printf)("   cache: %'14lu invals,     %'14lu flushes\n",
6194                  stats__cache_invals, stats__cache_flushes );
6195      VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
6196                  stats__cache_make_New_arange,
6197                  stats__cache_make_New_inZrep);
6198
6199      VG_(printf)("%s","\n");
6200      VG_(printf)("   cline: %'10lu normalises\n",
6201                  stats__cline_normalises );
6202      VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6203                  stats__cline_cread64s,
6204                  stats__cline_cread32s,
6205                  stats__cline_cread16s,
6206                  stats__cline_cread08s );
6207      VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6208                  stats__cline_cwrite64s,
6209                  stats__cline_cwrite32s,
6210                  stats__cline_cwrite16s,
6211                  stats__cline_cwrite08s );
6212      VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6213                  stats__cline_swrite64s,
6214                  stats__cline_swrite32s,
6215                  stats__cline_swrite16s,
6216                  stats__cline_swrite08s );
6217      VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
6218                  stats__cline_sread08s, stats__cline_scopy08s );
6219      VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
6220                 stats__cline_64to32splits,
6221                 stats__cline_32to16splits,
6222                 stats__cline_16to8splits );
6223      VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
6224                 stats__cline_64to32pulldown,
6225                 stats__cline_32to16pulldown,
6226                 stats__cline_16to8pulldown );
6227      if (0)
6228      VG_(printf)("   cline: sizeof(CacheLineZ) %ld, covers %ld bytes of arange\n",
6229                  (Word)sizeof(LineZ), (Word)N_LINE_ARANGE);
6230
6231      VG_(printf)("%s","\n");
6232
6233      VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
6234                  stats__msmcread, stats__msmcread_change);
6235      VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
6236                  stats__msmcwrite, stats__msmcwrite_change);
6237      VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
6238                  stats__cmpLEQ_queries, stats__cmpLEQ_misses);
6239      VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
6240                  stats__join2_queries, stats__join2_misses);
6241
6242      VG_(printf)("%s","\n");
6243      VG_(printf)( "   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
6244                   stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
6245      VG_(printf)( "   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
6246                   stats__vts__cmp_structural, stats__vts__cmp_structural_slow );
6247      VG_(printf)( "   libhb: VTSset: find__or__clone_and_add %'lu (%'lu allocd)\n",
6248                   stats__vts_set__focaa, stats__vts_set__focaa_a );
6249      VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
6250                   stats__vts__indexat_slow );
6251
6252      VG_(printf)("%s","\n");
6253      VG_(printf)(
6254         "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
6255         VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
6256      );
6257      VG_(printf)( "   libhb: %lu entries in vts_set\n",
6258                   VG_(sizeFM)( vts_set ) );
6259
6260      VG_(printf)("%s","\n");
6261      VG_(printf)( "   libhb: ctxt__rcdec: 1=%lu(%lu eq), 2=%lu, 3=%lu\n",
6262                   stats__ctxt_rcdec1, stats__ctxt_rcdec1_eq,
6263                   stats__ctxt_rcdec2,
6264                   stats__ctxt_rcdec3 );
6265      VG_(printf)( "   libhb: ctxt__rcdec: calls %lu, discards %lu\n",
6266                   stats__ctxt_rcdec_calls, stats__ctxt_rcdec_discards);
6267      VG_(printf)( "   libhb: contextTab: %lu slots, %lu max ents\n",
6268                   (UWord)N_RCEC_TAB,
6269                   stats__ctxt_tab_curr );
6270      VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
6271                   stats__ctxt_tab_qs,
6272                   stats__ctxt_tab_cmps );
6273#if 0
6274      VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
6275      VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
6276      VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
6277      VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
6278      VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
6279      VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
6280      VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
6281      VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
6282      VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
6283      VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
6284      VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
6285      VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
6286      VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
6287      VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));
6288
6289      VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
6290      VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
6291      VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
6292      VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
6293#endif
6294
6295      VG_(printf)("%s","<<< END libhb stats >>>\n");
6296      VG_(printf)("%s","\n");
6297
6298   }
6299}
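
/* Illustrative usage note (an assumption about the client, not
   something this file mandates): a client would typically call this
   exactly once from its shutdown hook, passing its own statistics
   flag, e.g. libhb_shutdown( VG_(clo_stats) ). */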
6300
6301/* Receive notification that a thread has low-level exited.  The
6302   significance here is that we do not expect to see any more memory
6303   references from it. */
6304void libhb_async_exit ( Thr* thr )
6305{
6306   tl_assert(thr);
6307   tl_assert(!thr->llexit_done);
6308   thr->llexit_done = True;
6309
6310   /* free up Filter and local_Kws_n_stacks (well, actually not the
6311      latter ..) */
6312   tl_assert(thr->filter);
6313   HG_(free)(thr->filter);
6314   thr->filter = NULL;
6315
6316   /* Tell the VTS mechanism this thread has exited, so it can
6317      participate in VTS pruning.  Note this can only happen once the
6318      thread has both ll_exited and been joined with. */
6319   if (thr->joinedwith_done)
6320      VTS__declare_thread_very_dead(thr);
6321
6322   /* Another space-accuracy tradeoff.  Do we want to be able to show
6323      H1 history for conflicts in threads which have since exited?  If
6324      yes, then we had better not free up thr->local_Kws_n_stacks.  The
6325      downside is a potential per-thread leak of up to
6326      N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
6327      XArray average overcommit factor is (1.5 I'd guess). */
6328   // hence:
6329   // VG_(deleteXA)(thr->local_Kws_n_stacks);
6330   // thr->local_Kws_n_stacks = NULL;
6331}
6332
6333/* Receive notification that a thread has been joined with.  The
6334   significance here is that we do not expect to see any further
6335   references to its vector clocks (Thr::viR and Thr::viW). */
6336void libhb_joinedwith_done ( Thr* thr )
6337{
6338   tl_assert(thr);
6339   /* Caller must ensure that this is only ever called once per Thr. */
6340   tl_assert(!thr->joinedwith_done);
6341   thr->joinedwith_done = True;
6342   if (thr->llexit_done)
6343      VTS__declare_thread_very_dead(thr);
6344}
6345
6346
6347/* Both Segs and SOs point to VTSs.  However, there is no sharing, so
6348   a Seg that points at a VTS is its one-and-only owner, and ditto for
6349   an SO that points at a VTS. */
6350
6351SO* libhb_so_alloc ( void )
6352{
6353   return SO__Alloc();
6354}
6355
6356void libhb_so_dealloc ( SO* so )
6357{
6358   tl_assert(so);
6359   tl_assert(so->magic == SO_MAGIC);
6360   SO__Dealloc(so);
6361}
6362
6363/* See comments in libhb.h for details on the meaning of
6364   strong vs weak sends and strong vs weak receives. */
6365void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
6366{
6367   /* Copy the VTSs from 'thr' into the sync object, and then move
6368      the thread along one step. */
6369
6370   tl_assert(so);
6371   tl_assert(so->magic == SO_MAGIC);
6372
6373   /* stay sane .. a thread's read-clock must always lead or be the
6374      same as its write-clock */
6375   { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
6376     tl_assert(leq);
6377   }
6378
6379   /* since we're overwriting the VtsIDs in the SO, we need to drop
6380      any references made by the previous contents thereof */
6381   if (so->viR == VtsID_INVALID) {
6382      tl_assert(so->viW == VtsID_INVALID);
6383      so->viR = thr->viR;
6384      so->viW = thr->viW;
6385      VtsID__rcinc(so->viR);
6386      VtsID__rcinc(so->viW);
6387   } else {
6388      /* In a strong send, we dump any previous VC in the SO and
6389         install the sending thread's VC instead.  For a weak send we
6390         must join2 with what's already there. */
6391      tl_assert(so->viW != VtsID_INVALID);
6392      VtsID__rcdec(so->viR);
6393      VtsID__rcdec(so->viW);
6394      so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
6395      so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
6396      VtsID__rcinc(so->viR);
6397      VtsID__rcinc(so->viW);
6398   }
6399
6400   /* move both parent clocks along */
6401   VtsID__rcdec(thr->viR);
6402   VtsID__rcdec(thr->viW);
6403   thr->viR = VtsID__tick( thr->viR, thr );
6404   thr->viW = VtsID__tick( thr->viW, thr );
6405   if (!thr->llexit_done) {
6406      Filter__clear(thr->filter, "libhb_so_send");
6407      note_local_Kw_n_stack_for(thr);
6408   }
6409   VtsID__rcinc(thr->viR);
6410   VtsID__rcinc(thr->viW);
6411
6412   if (strong_send)
6413      show_thread_state("s-send", thr);
6414   else
6415      show_thread_state("w-send", thr);
6416}
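
/* A worked example (purely illustrative; clocks written as
   { thread: count }): suppose so->viR is currently { A:2 } and the
   sending thread's viR is { B:5 }.  A strong send overwrites, leaving
   so->viR == { B:5 }; a weak send joins, leaving
   so->viR == { A:2, B:5 }.  Either way the sender's own clocks are
   then ticked, so its viR becomes { B:6 }. */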
6417
6418void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
6419{
6420   tl_assert(so);
6421   tl_assert(so->magic == SO_MAGIC);
6422
6423   if (so->viR != VtsID_INVALID) {
6424      tl_assert(so->viW != VtsID_INVALID);
6425
6426      /* Weak receive (basically, an R-acquisition of an R-W lock).
6427         This advances the read-clock of the receiver, but not the
6428         write-clock. */
6429      VtsID__rcdec(thr->viR);
6430      thr->viR = VtsID__join2( thr->viR, so->viR );
6431      VtsID__rcinc(thr->viR);
6432
6433      /* At one point (r10589) it seemed safest to tick the clocks for
6434         the receiving thread after the join.  But on reflection, I
6435         wonder if that might cause it to 'overtake' constraints,
6436         which could lead to missing races.  So, back out that part of
6437         r10589. */
6438      //VtsID__rcdec(thr->viR);
6439      //thr->viR = VtsID__tick( thr->viR, thr );
6440      //VtsID__rcinc(thr->viR);
6441
6442      /* For a strong receive, we also advance the receiver's write
6443         clock, which means the receive as a whole is essentially
6444         equivalent to a W-acquisition of an R-W lock. */
6445      if (strong_recv) {
6446         VtsID__rcdec(thr->viW);
6447         thr->viW = VtsID__join2( thr->viW, so->viW );
6448         VtsID__rcinc(thr->viW);
6449
6450         /* See comment just above, re r10589. */
6451         //VtsID__rcdec(thr->viW);
6452         //thr->viW = VtsID__tick( thr->viW, thr );
6453         //VtsID__rcinc(thr->viW);
6454      }
6455
6456      if (thr->filter)
6457         Filter__clear(thr->filter, "libhb_so_recv");
6458      note_local_Kw_n_stack_for(thr);
6459
6460      if (strong_recv)
6461         show_thread_state("s-recv", thr);
6462      else
6463         show_thread_state("w-recv", thr);
6464
6465   } else {
6466      tl_assert(so->viW == VtsID_INVALID);
6467      /* Deal with degenerate case: 'so' has no vts, so there has been
6468         no message posted to it.  Just ignore this case. */
6469      show_thread_state("d-recv", thr);
6470   }
6471}
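
/* Continuing the example above (purely illustrative): if the receiving
   thread's viR is { C:7 } and so->viR is { A:2, B:5 }, then after the
   receive its viR is { A:2, B:5, C:7 }: everything that happened
   before the send now also happens-before the receiver's subsequent
   events.  A strong receive applies the same join to viW as well; a
   weak receive leaves viW unchanged. */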
6472
6473Bool libhb_so_everSent ( SO* so )
6474{
6475   if (so->viR == VtsID_INVALID) {
6476      tl_assert(so->viW == VtsID_INVALID);
6477      return False;
6478   } else {
6479      tl_assert(so->viW != VtsID_INVALID);
6480      return True;
6481   }
6482}
6483
6484#define XXX1 0 // 0x67a106c
6485#define XXX2 0
6486
6487static inline Bool TRACEME(Addr a, SizeT szB) {
6488   if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
6489   if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
6490   return False;
6491}
6492static void trace ( Thr* thr, Addr a, SizeT szB, const HChar* s )
6493{
6494  SVal sv = zsm_sread08(a);
6495  VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
6496  show_thread_state("", thr);
6497  VG_(printf)("%s","\n");
6498}
6499
6500void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
6501{
6502   SVal sv = SVal__mkC(thr->viW, thr->viW);
6503   tl_assert(is_sane_SVal_C(sv));
6504   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
6505   zsm_sset_range( a, szB, sv );
6506   Filter__clear_range( thr->filter, a, szB );
6507   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
6508}
6509
6510void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
6511{
6512   /* do nothing */
6513}
6514
6515void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
6516{
6517   /* This really does put the requested range in NoAccess.  It's
6518      expensive though. */
6519   SVal sv = SVal_NOACCESS;
6520   tl_assert(is_sane_SVal_C(sv));
6521   zsm_sset_range( a, szB, sv );
6522   Filter__clear_range( thr->filter, a, szB );
6523}
6524
6525void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
6526{
6527   SVal sv = SVal_NOACCESS;
6528   tl_assert(is_sane_SVal_C(sv));
6529   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
6530   zsm_sset_range( a, szB, sv );
6531   Filter__clear_range( thr->filter, a, szB );
6532   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
6533}
6534
6535Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
6536   tl_assert(thr);
6537   return thr->hgthread;
6538}
6539
6540void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
6541   tl_assert(thr);
6542   thr->hgthread = hgthread;
6543}
6544
6545void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
6546{
6547   zsm_scopy_range(src, dst, len);
6548   Filter__clear_range( thr->filter, dst, len );
6549}
6550
6551void libhb_maybe_GC ( void )
6552{
6553   event_map_maybe_GC();
6554   /* If there are still freelist entries available, no need for a
6555      GC. */
6556   if (vts_tab_freelist != VtsID_INVALID)
6557      return;
6558   /* So all the table entries are full, and we're having to expand
6559      the table.  But did we hit the threshold point yet? */
6560   if (VG_(sizeXA)( vts_tab ) < vts_next_GC_at)
6561      return;
6562   vts_tab__do_GC( False/*don't show stats*/ );
6563}
6564
6565
6566/////////////////////////////////////////////////////////////////
6567/////////////////////////////////////////////////////////////////
6568//                                                             //
6569// SECTION END main library                                    //
6570//                                                             //
6571/////////////////////////////////////////////////////////////////
6572/////////////////////////////////////////////////////////////////
6573
6574/*--------------------------------------------------------------------*/
6575/*--- end                                             libhb_main.c ---*/
6576/*--------------------------------------------------------------------*/
6577