
/*--------------------------------------------------------------------*/
/*--- LibHB: a library for implementing and checking               ---*/
/*--- the happens-before relationship in concurrent programs.      ---*/
/*---                                                 libhb_main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of LibHB, a library for implementing and checking
   the happens-before relationship in concurrent programs.

   Copyright (C) 2008-2011 OpenWorks Ltd
      info@open-works.co.uk

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "pub_tool_basics.h"
#include "pub_tool_libcassert.h"
#include "pub_tool_libcbase.h"
#include "pub_tool_libcprint.h"
#include "pub_tool_mallocfree.h"
#include "pub_tool_wordfm.h"
#include "pub_tool_sparsewa.h"
#include "pub_tool_xarray.h"
#include "pub_tool_oset.h"
#include "pub_tool_threadstate.h"
#include "pub_tool_aspacemgr.h"
#include "pub_tool_execontext.h"
#include "pub_tool_errormgr.h"
#include "pub_tool_options.h"        // VG_(clo_stats)
#include "hg_basics.h"
#include "hg_wordset.h"
#include "hg_lock_n_thread.h"
#include "hg_errors.h"

#include "libhb.h"


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Debugging #defines                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Check the sanity of shadow values in the core memory state
   machine.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_MSM 1
#else
#  define CHECK_MSM 0
#endif


/* Check sanity (reference counts, etc) in the conflicting access
   machinery.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_CEM 1
#else
#  define CHECK_CEM 0
#endif


/* Check sanity in the compressed shadow memory machinery,
   particularly in its caching innards.  Unfortunately there's no
   almost-zero-cost way to make them selectable at run time.  Hence
   set the #if 0 to #if 1 and rebuild if you want them. */
#if 0
#  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
#  define inline __attribute__((noinline))
   /* probably want to ditch -fomit-frame-pointer too */
#else
#  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
#endif


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: VtsID                                           //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
   bits, since they have to be packed into the lowest 30 bits of an
   SVal. */
typedef  UInt  VtsID;
#define VtsID_INVALID 0xFFFFFFFF



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SVal                                            //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

typedef  ULong  SVal;

/* This value has special significance to the implementation, and callers
   may not store it in the shadow memory. */
#define SVal_INVALID (3ULL << 62)

/* This is the default value for shadow memory.  Initially the shadow
   memory contains no accessible areas and so all reads produce this
   value.  TODO: make this caller-definable. */
#define SVal_NOACCESS (2ULL << 62)
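
/* Illustrative sketch (not compiled): the two special values are
   distinguished by the top two bits of the 64-bit SVal; the remaining
   encodings are free for ordinary shadow values, which (per the VtsID
   comments above) only need 2 x 30 bits for their read- and
   write-clock IDs.  The checks below merely restate the definitions. */
#if 0
static void example_SVal_specials ( void )
{
   tl_assert( (SVal_INVALID  >> 62) == 3 );
   tl_assert( (SVal_NOACCESS >> 62) == 2 );
   tl_assert( SVal_INVALID != SVal_NOACCESS );
}
#endif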



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: ScalarTS                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Scalar Timestamp.  We have to store a lot of these, so there is
   some effort to make them as small as possible.  Logically they are
   a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
   We pack it into 64 bits by representing the Thr* using a ThrID, a
   small integer (18 bits), and a 46 bit integer for the timestamp
   number.  The 46/18 split is arbitrary, but has the effect that
   Helgrind can only handle programs that create 2^18 or fewer threads
   over their entire lifetime, and have no more than 2^46 timestamp
   ticks (synchronisation operations on the same thread).

   This doesn't seem like much of a limitation.  2^46 ticks is
   7.06e+13, and if each tick (optimistically) takes the machine 1000
   cycles to process, then the minimum time to process that many ticks
   at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
   but VTS ticks, which isn't realistic.

   NB1: SCALARTS_N_THRBITS must be 29 or lower.  The obvious limit is
   32 since a ThrID is a UInt.  29 comes from the fact that
   'Thr_n_RCEC', which records information about old accesses, packs
   not only a ThrID but also 2+1 other bits (access size and
   writeness) in a UInt, hence limiting size to 32-(2+1) == 29.

   NB2: thrid values are issued upwards from 1024, and values less
   than that aren't valid.  This isn't per se necessary (any order
   will do, so long as they are unique), but it does help ensure they
   are less likely to get confused with the various other kinds of
   small-integer thread ids drifting around (eg, TId).  See also NB5.

   NB3: this probably also relies on the fact that Thr's are never
   deallocated -- they exist forever.  Hence the 1-1 mapping from
   Thr's to thrid values (set up in Thr__new) persists forever.

   NB4: temp_max_sized_VTS is allocated at startup and never freed.
   It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
   ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
   making the memory use for this go sky-high.  With
   SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
   like an OK tradeoff.  If more than 256k threads need to be
   supported, we could change SCALARTS_N_THRBITS to 20, which would
   facilitate supporting 1 million threads at the cost of 8MB storage
   for temp_max_sized_VTS.

   NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
   ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
   must never be a valid ThrID.  Given NB2 that's OK.
*/
#define SCALARTS_N_THRBITS 18  /* valid range: 11 to 29 inclusive */

#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
typedef
   struct {
      ThrID thrid : SCALARTS_N_THRBITS;
      ULong tym   : SCALARTS_N_TYMBITS;
   }
   ScalarTS;

#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
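
/* Illustrative sketch (not compiled): the whole point of the bitfield
   layout above is that a (thread, tick) pair fits in one 64-bit word.
   The particular values used here are only examples. */
#if 0
static void example_ScalarTS_packing ( void )
{
   ScalarTS st;
   tl_assert(sizeof(ScalarTS) == 8);  /* 18 + 46 bits pack into 64 */
   st.thrid = 1024;                   /* first ThrID ever issued, per NB2 */
   st.tym   = 1;                      /* that thread's first tick */
   tl_assert(st.thrid <= ThrID_MAX_VALID);
}
#endif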



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Filter                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// baseline: 5, 9
#define FI_LINE_SZB_LOG2  5
#define FI_NUM_LINES_LOG2 10

#define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
#define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)

#define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
#define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)

#define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
                             & (Addr)(FI_NUM_LINES-1) )


/* In the lines, each group of 8 bytes is treated individually, and is
   mapped to a UShort.  Regardless of the endianness of the underlying
   machine, bits 1 and 0 pertain to the lowest address and bits 15 and
   14 to the highest address.

   Of each bit pair, the higher numbered bit is set if an R (read) has
   been seen and the lower numbered bit if a W (write) has been seen,
   so the actual layout is:

   15 14             ...  01 00

   R  W  for addr+7  ...  R  W  for addr+0

   So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
*/

/* tags are separated from lines.  tags are Addrs and are
   the base address of the line. */
typedef
   struct {
      UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
   }
   FiLine;

typedef
   struct {
      Addr   tags[FI_NUM_LINES];
      FiLine lines[FI_NUM_LINES];
   }
   Filter;
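
/* Illustrative sketch (not compiled): locating the two filter bits for
   a one-byte access at address 'a' in the direct-mapped structure
   above.  The helper name is made up purely for illustration; the real
   fast-path code appears later in this file. */
#if 0
static UShort example_filter_bits_for_byte ( Filter* fi, Addr a )
{
   UWord  lineno = FI_GET_LINENO(a);          /* which line */
   Addr   atag   = FI_GET_TAG(a);             /* base address of the line */
   UWord  loff   = (a - atag) >> 3;           /* which UShort in the line */
   UWord  byteno = a & 7;                     /* which byte in that group */
   UShort u16, wbit, rbit;
   if (fi->tags[lineno] != atag)
      return 0;                               /* line not present */
   u16  = fi->lines[lineno].u16s[loff];
   wbit = 1 << (2 * byteno + 0);              /* W bit; 0x5555 overall */
   rbit = 1 << (2 * byteno + 1);              /* R bit; 0xAAAA overall */
   return u16 & (rbit | wbit);
}
#endif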



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Thr, ULong_n_EC                                 //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// Records stacks for H1 history mechanism (DRD-style)
typedef
   struct { ULong ull; ExeContext* ec; }
   ULong_n_EC;


/* How many of the above records to collect for each thread?  Older
   ones are dumped when we run out of space.  62.5k requires 1MB per
   thread, since each ULong_n_EC record is 16 bytes long.  When more
   than N_KWs_N_STACKs_PER_THREAD are present, the older half are
   deleted to make space.  Hence in the worst case we will be able to
   produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
   Kw transitions (segments in this thread).  For the current setting
   that gives a guaranteed stack for at least the last 31.25k
   segments. */
#define N_KWs_N_STACKs_PER_THREAD 62500


struct _Thr {
   /* Current VTSs for this thread.  They change as we go along.  viR
      is the VTS to be used for reads, viW for writes.  Usually they
      are the same, but can differ when we deal with reader-writer
      locks.  It is always the case that
         VtsID__cmpLEQ(viW,viR) == True
      that is, viW must be the same as, or lag behind, viR. */
   VtsID viR;
   VtsID viW;

   /* Is initially False, and is set to True after the thread really
      has done a low-level exit.  When True, we expect to never see
      any more memory references done by this thread. */
   Bool llexit_done;

   /* Is initially False, and is set to True after the thread has been
      joined with (reaped by some other thread).  After this point, we
      do not expect to see any uses of .viR or .viW, so it is safe to
      set them to VtsID_INVALID. */
   Bool joinedwith_done;

   /* A small integer giving a unique identity to this Thr.  See
      comments on the definition of ScalarTS for details. */
   ThrID thrid : SCALARTS_N_THRBITS;

   /* A filter that removes references for which we believe that
      msmcread/msmcwrite will not change the state, nor report a
      race. */
   Filter* filter;

   /* A pointer back to the top level Thread structure.  There is a
      1-1 mapping between Thread and Thr structures -- each Thr points
      at its corresponding Thread, and vice versa.  Really, Thr and
      Thread should be merged into a single structure. */
   Thread* hgthread;

   /* The ULongs (scalar Kws) in this accumulate in strictly
      increasing order, without duplicates.  This is important because
      we need to be able to find a given scalar Kw in this array
      later, by binary search. */
   XArray* /* ULong_n_EC */ local_Kws_n_stacks;
};



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SO                                              //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// (UInt) `echo "Synchronisation object" | md5sum`
#define SO_MAGIC 0x56b3c5b0U

struct _SO {
   struct _SO* admin_prev;
   struct _SO* admin_next;
   VtsID viR; /* r-clock of sender */
   VtsID viW; /* w-clock of sender */
   UInt  magic;
};



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Forward declarations                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* fwds for
   Globals needed by other parts of the library.  These are set
   once at startup and then never changed. */
static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
static ExeContext* (*main_get_EC)( Thr* ) = NULL;

/* misc fn and data fwdses */
static void VtsID__rcinc ( VtsID ii );
static void VtsID__rcdec ( VtsID ii );

static inline Bool SVal__isC ( SVal s );
static inline VtsID SVal__unC_Rmin ( SVal s );
static inline VtsID SVal__unC_Wmin ( SVal s );
static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );

/* A double linked list of all the SO's. */
SO* admin_SO;



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN compressed shadow memory                      //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

#ifndef __HB_ZSM_H
#define __HB_ZSM_H

/* Initialise the library.  Once initialised, it will (or may) call
   rcinc and rcdec in response to all the calls below, in order to
   allow the user to do reference counting on the SVals stored herein.
   It is important to understand, however, that due to internal
   caching, the reference counts are in general inaccurate, and can be
   both above or below the true reference count for an item.  In
   particular, the library may indicate that the reference count for
   an item is zero, when in fact it is not.

   To make the reference counting exact and therefore non-pointless,
   call zsm_flush_cache.  Immediately after it returns, the reference
   counts for all items, as deduced by the caller by observing calls
   to rcinc and rcdec, will be correct, and so any items with a zero
   reference count may be freed (or at least considered to be
   unreferenced by this library).
*/
static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );

static void zsm_sset_range  ( Addr, SizeT, SVal );
static void zsm_scopy_range ( Addr, Addr, SizeT );
static void zsm_flush_cache ( void );

#endif /* ! __HB_ZSM_H */
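
/* Illustrative sketch (not compiled): how a caller might use the
   interface declared above.  The callback bodies and the GC policy
   hinted at are made up for illustration; only zsm_init, the range
   operations and zsm_flush_cache are real. */
#if 0
static void example_rcinc ( SVal s ) { /* bump caller-side refcount of s */ }
static void example_rcdec ( SVal s ) { /* drop caller-side refcount of s */ }

static void example_zsm_usage ( void )
{
   zsm_init( example_rcinc, example_rcdec );
   /* ... use zsm_sset_range / zsm_scopy_range as normal ... */
   /* Before trusting the observed refcounts (eg, in order to free
      unreferenced items), force them to become exact: */
   zsm_flush_cache();
}
#endif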


/* Round a up to the next multiple of N.  N must be a power of 2 */
#define ROUNDUP(a, N)   ((a + N - 1) & ~(N-1))
/* Round a down to the next multiple of N.  N must be a power of 2 */
#define ROUNDDN(a, N)   ((a) & ~(N-1))
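/* Worked example, as a quick sanity check of the two macros above:
   ROUNDUP(13, 8) == 16 and ROUNDDN(13, 8) == 8. */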



/* ------ User-supplied RC functions ------ */
static void(*rcinc)(SVal) = NULL;
static void(*rcdec)(SVal) = NULL;


/* ------ CacheLine ------ */

#define N_LINE_BITS      6 /* must be >= 3 */
#define N_LINE_ARANGE    (1 << N_LINE_BITS)
#define N_LINE_TREES     (N_LINE_ARANGE >> 3)

typedef
   struct {
      UShort descrs[N_LINE_TREES];
      SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
   }
   CacheLine;

#define TREE_DESCR_16_0 (1<<0)
#define TREE_DESCR_32_0 (1<<1)
#define TREE_DESCR_16_1 (1<<2)
#define TREE_DESCR_64   (1<<3)
#define TREE_DESCR_16_2 (1<<4)
#define TREE_DESCR_32_1 (1<<5)
#define TREE_DESCR_16_3 (1<<6)
#define TREE_DESCR_8_0  (1<<7)
#define TREE_DESCR_8_1  (1<<8)
#define TREE_DESCR_8_2  (1<<9)
#define TREE_DESCR_8_3  (1<<10)
#define TREE_DESCR_8_4  (1<<11)
#define TREE_DESCR_8_5  (1<<12)
#define TREE_DESCR_8_6  (1<<13)
#define TREE_DESCR_8_7  (1<<14)
#define TREE_DESCR_DTY  (1<<15)

typedef
   struct {
      SVal  dict[4]; /* can represent up to 4 diff values in the line */
      UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
                                      dict indexes */
      /* if dict[0] == SVal_INVALID then dict[1] is the index of the
         LineF to use, and dict[2..] are also SVal_INVALID. */
   }
   LineZ; /* compressed rep for a cache line */

typedef
   struct {
      Bool inUse;
      SVal w64s[N_LINE_ARANGE];
   }
   LineF; /* full rep for a cache line */
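
/* Illustrative sketch (not compiled): a line in which all 64 bytes
   carry the same shadow value compresses to a LineZ whose dict[0]
   holds that value and whose ix2s[] entries are all zero; a line with
   five or more distinct values cannot be expressed this way and must
   fall back to a LineF.  This mirrors the initialisation done in
   shmem__alloc_SecMap below. */
#if 0
static void example_init_LineZ_all_one_value ( LineZ* lz, SVal sv )
{
   UWord j;
   tl_assert(sv != SVal_INVALID);
   lz->dict[0] = sv;
   lz->dict[1] = lz->dict[2] = lz->dict[3] = SVal_INVALID;
   for (j = 0; j < N_LINE_ARANGE/4; j++)
      lz->ix2s[j] = 0;   /* every byte refers to dict[0] */
}
#endif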

/* Shadow memory.
   Primary map is a WordFM Addr SecMap*.
   SecMaps cover some page-size-ish section of address space and hold
     a compressed representation.
   CacheLine-sized chunks of SecMaps are copied into a Cache, being
   decompressed when moved into the cache and recompressed on the
   way out.  Because of this, the cache must operate as a writeback
   cache, not a writethrough one.

   Each SecMap must hold a power-of-2 number of CacheLines.  Hence
   N_SECMAP_BITS must be >= N_LINE_BITS.
*/
#define N_SECMAP_BITS   13
#define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)

// # CacheLines held by a SecMap
#define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)

/* The data in the SecMap is held in the array of LineZs.  Each LineZ
   either carries the required data directly, in a compressed
   representation, or (when .dict[0] == SVal_INVALID) it holds in
   .dict[1] an index to the LineF in .linesF that holds the full
   representation.

   Currently-unused LineF's have their .inUse bit set to zero.
   Since each in-use LineF is referred to by exactly one LineZ,
   the number of .linesZ[] that refer to .linesF should equal
   the number of .linesF[] that have .inUse == True.

   RC obligations: the RCs presented to the user include exactly
   the values in:
   * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
   * F reps that are in use (.inUse == True)

   Hence the following actions at the following transitions are required:

   F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
   F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
   Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
   Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
*/
typedef
   struct {
      UInt   magic;
      LineZ  linesZ[N_SECMAP_ZLINES];
      LineF* linesF;
      UInt   linesF_size;
   }
   SecMap;

#define SecMap_MAGIC   0x571e58cbU

static inline Bool is_sane_SecMap ( SecMap* sm ) {
   return sm != NULL && sm->magic == SecMap_MAGIC;
}

/* ------ Cache ------ */

#define N_WAY_BITS 16
#define N_WAY_NENT (1 << N_WAY_BITS)

/* Each tag is the address of the associated CacheLine, rounded down
   to a CacheLine address boundary.  A CacheLine size must be a power
   of 2 and must be 8 or more.  Hence an easy way to initialise the
   cache so it is empty is to set all the tag values to any value % 8
   != 0, eg 1.  This means all queries in the cache initially miss.
   It does however require us to detect, and not write back, any line
   with a bogus tag. */
typedef
   struct {
      CacheLine lyns0[N_WAY_NENT];
      Addr      tags0[N_WAY_NENT];
   }
   Cache;

static inline Bool is_valid_scache_tag ( Addr tag ) {
   /* a valid tag should be naturally aligned to the start of
      a CacheLine. */
   return 0 == (tag & (N_LINE_ARANGE - 1));
}


/* --------- Primary data structures --------- */

/* Shadow memory primary map */
static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
static Cache   cache_shmem;


static UWord stats__secmaps_search       = 0; // # SM finds
static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
static UWord stats__cache_F_fetches      = 0; // # F lines fetched
static UWord stats__cache_F_wbacks       = 0; // # F lines written back
static UWord stats__cache_invals         = 0; // # cache invals
static UWord stats__cache_flushes        = 0; // # cache flushes
static UWord stats__cache_totrefs        = 0; // # total accesses
static UWord stats__cache_totmisses      = 0; // # misses
static ULong stats__cache_make_New_arange = 0; // total arange made New
static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
static UWord stats__cline_sread08s       = 0; // # calls to s_m_get8
static UWord stats__cline_swrite08s      = 0; // # calls to s_m_set8
static UWord stats__cline_swrite16s      = 0; // # calls to s_m_set16
static UWord stats__cline_swrite32s      = 0; // # calls to s_m_set32
static UWord stats__cline_swrite64s      = 0; // # calls to s_m_set64
static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
static UWord stats__vts__tick            = 0; // # calls to VTS__tick
static UWord stats__vts__join            = 0; // # calls to VTS__join
static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural

// # calls to VTS__cmp_structural w/ slow case
static UWord stats__vts__cmp_structural_slow = 0;

// # calls to VTS__indexAt_SLOW
static UWord stats__vts__indexat_slow = 0;

// # calls to vts_set__find__or__clone_and_add
static UWord stats__vts_set__focaa    = 0;

// # calls to vts_set__find__or__clone_and_add that lead to an
// allocation
static UWord stats__vts_set__focaa_a  = 0;


static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
   return a & ~(N_SECMAP_ARANGE - 1);
}
static inline UWord shmem__get_SecMap_offset ( Addr a ) {
   return a & (N_SECMAP_ARANGE - 1);
}


/*----------------------------------------------------------------*/
/*--- map_shmem :: WordFM Addr SecMap                          ---*/
/*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
/*----------------------------------------------------------------*/

/*--------------- SecMap allocation --------------- */

static HChar* shmem__bigchunk_next = NULL;
static HChar* shmem__bigchunk_end1 = NULL;

static void* shmem__bigchunk_alloc ( SizeT n )
{
   const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
   tl_assert(n > 0);
   n = VG_ROUNDUP(n, 16);
   tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
   tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
             <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
   if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
      if (0)
      VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
                  (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
      shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
      if (shmem__bigchunk_next == NULL)
         VG_(out_of_memory_NORETURN)(
            "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
      shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
   }
   tl_assert(shmem__bigchunk_next);
   tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
   tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
   shmem__bigchunk_next += n;
   return shmem__bigchunk_next - n;
}

static SecMap* shmem__alloc_SecMap ( void )
{
   Word    i, j;
   SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
   if (0) VG_(printf)("alloc_SecMap %p\n",sm);
   tl_assert(sm);
   sm->magic = SecMap_MAGIC;
   for (i = 0; i < N_SECMAP_ZLINES; i++) {
      sm->linesZ[i].dict[0] = SVal_NOACCESS;
      sm->linesZ[i].dict[1] = SVal_INVALID;
      sm->linesZ[i].dict[2] = SVal_INVALID;
      sm->linesZ[i].dict[3] = SVal_INVALID;
      for (j = 0; j < N_LINE_ARANGE/4; j++)
         sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
   }
   sm->linesF      = NULL;
   sm->linesF_size = 0;
   stats__secmaps_allocd++;
   stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
   stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
   stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
   return sm;
}

typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };

static SecMap* shmem__find_SecMap ( Addr ga )
{
   SecMap* sm    = NULL;
   Addr    gaKey = shmem__round_to_SecMap_base(ga);
   // Cache
   stats__secmaps_search++;
   if (LIKELY(gaKey == smCache[0].gaKey))
      return smCache[0].sm;
   if (LIKELY(gaKey == smCache[1].gaKey)) {
      SMCacheEnt tmp = smCache[0];
      smCache[0] = smCache[1];
      smCache[1] = tmp;
      return smCache[0].sm;
   }
   if (gaKey == smCache[2].gaKey) {
      SMCacheEnt tmp = smCache[1];
      smCache[1] = smCache[2];
      smCache[2] = tmp;
      return smCache[1].sm;
   }
   // end Cache
   stats__secmaps_search_slow++;
   if (VG_(lookupFM)( map_shmem,
                      NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
      tl_assert(sm != NULL);
      smCache[2] = smCache[1];
      smCache[1] = smCache[0];
      smCache[0].gaKey = gaKey;
      smCache[0].sm    = sm;
   } else {
      tl_assert(sm == NULL);
   }
   return sm;
}

static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
{
   SecMap* sm = shmem__find_SecMap ( ga );
   if (LIKELY(sm)) {
      return sm;
   } else {
      /* create a new one */
      Addr gaKey = shmem__round_to_SecMap_base(ga);
      sm = shmem__alloc_SecMap();
      tl_assert(sm);
      VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
      return sm;
   }
}


/* ------------ LineF and LineZ related ------------ */

static void rcinc_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcinc(lineF->w64s[i]);
}

static void rcdec_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcdec(lineF->w64s[i]);
}

static void rcinc_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcinc(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
}

static void rcdec_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcdec(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
}

inline
static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
   Word bix, shft, mask, prep;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   mask = 3 << shft;
   prep = b2 << shft;
   arr[bix] = (arr[bix] & ~mask) | prep;
}

inline
static UWord read_twobit_array ( UChar* arr, UWord ix ) {
   Word bix, shft;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   return (arr[bix] >> shft) & 3;
}
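
/* Illustrative sketch (not compiled): round-tripping a dict index
   through the packed two-bit array above.  Four byte positions share
   each UChar, so byte index 5 lands in arr[1], bits 3:2. */
#if 0
static void example_twobit_roundtrip ( void )
{
   UChar arr[N_LINE_ARANGE/4];
   VG_(memset)(arr, 0, sizeof(arr));
   write_twobit_array( arr, 5/*byte index*/, 2/*dict index*/ );
   tl_assert(arr[1] == (2 << 2));
   tl_assert(read_twobit_array( arr, 5 ) == 2);
}
#endif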

/* Given address 'tag', find either the Z or F line containing relevant
   data, so it can be read into the cache.
*/
static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
                                  /*OUT*/LineF** fp, Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      lineZ = NULL;
   }
   *zp = lineZ;
   *fp = lineF;
}

/* Given address 'tag', return the relevant SecMap and the index of
   the LineZ within it, in the expectation that the line is to be
   overwritten.  Regardless of whether 'tag' is currently associated
   with a Z or F representation, do an rcdec on the current
   representation, in recognition of the fact that the contents are
   just about to be overwritten. */
static __attribute__((noinline))
void find_Z_for_writing ( /*OUT*/SecMap** smp,
                          /*OUT*/Word* zixp,
                          Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   /* re RCs, we are freeing up this LineZ/LineF so that new data can
      be parked in it.  Hence have to rcdec it accordingly. */
   /* If lineZ has an associated lineF, free it up. */
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      rcdec_LineF(lineF);
      lineF->inUse = False;
   } else {
      rcdec_LineZ(lineZ);
   }
   *smp  = sm;
   *zixp = zix;
}

static __attribute__((noinline))
void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
   UInt        i, new_size;
   LineF* nyu;

   if (sm->linesF) {
      tl_assert(sm->linesF_size > 0);
   } else {
      tl_assert(sm->linesF_size == 0);
   }

   if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         if (!sm->linesF[i].inUse) {
            *fixp = (Word)i;
            return;
         }
      }
   }

   /* No free F line found.  Expand existing array and try again. */
   new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
   nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
                           new_size * sizeof(LineF) );
   tl_assert(nyu);

   stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
   stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
                                  * sizeof(LineF);

   if (0)
   VG_(printf)("SM %p: expand F array from %d to %d\n",
               sm, (Int)sm->linesF_size, new_size);

   for (i = 0; i < new_size; i++)
      nyu[i].inUse = False;

   if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         tl_assert(sm->linesF[i].inUse);
         nyu[i] = sm->linesF[i];
      }
      VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
      HG_(free)(sm->linesF);
   }

   sm->linesF      = nyu;
   sm->linesF_size = new_size;

   for (i = 0; i < sm->linesF_size; i++) {
      if (!sm->linesF[i].inUse) {
         *fixp = (Word)i;
         return;
      }
    }

    /*NOTREACHED*/
    tl_assert(0);
}


/* ------------ CacheLine and implicit-tree related ------------ */

__attribute__((unused))
static void pp_CacheLine ( CacheLine* cl ) {
   Word i;
   if (!cl) {
      VG_(printf)("%s","pp_CacheLine(NULL)\n");
      return;
   }
   for (i = 0; i < N_LINE_TREES; i++)
      VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
   for (i = 0; i < N_LINE_ARANGE; i++)
      VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
}

static UChar descr_to_validbits ( UShort descr )
{
   /* a.k.a Party Time for gcc's constant folder */
#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
                          ( (b16_0) << 0) ) )

#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
                         ( (bit5) << 5) | ( (bit4) << 4) | \
                         ( (bit3) << 3) | ( (bit2) << 2) | \
                         ( (bit1) << 1) | ( (bit0) << 0) ) )

   /* these should all get folded out at compile time */
   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);

   switch (descr) {
   /*
              +--------------------------------- TREE_DESCR_8_7
              |             +------------------- TREE_DESCR_8_0
              |             |  +---------------- TREE_DESCR_16_3
              |             |  | +-------------- TREE_DESCR_32_1
              |             |  | | +------------ TREE_DESCR_16_2
              |             |  | | |  +--------- TREE_DESCR_64
              |             |  | | |  |  +------ TREE_DESCR_16_1
              |             |  | | |  |  | +---- TREE_DESCR_32_0
              |             |  | | |  |  | | +-- TREE_DESCR_16_0
              |             |  | | |  |  | | |
              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
                                                 return BYTE(1,1,1,1,1,1,1,1);
   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
                                                 return BYTE(1,1,0,1,1,1,1,1);
   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
                                                 return BYTE(0,1,1,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
                                                 return BYTE(0,1,0,1,1,1,1,1);

   case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
                                                 return BYTE(1,1,1,1,1,1,0,1);
   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
                                                 return BYTE(1,1,0,1,1,1,0,1);
   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
                                                 return BYTE(0,1,1,1,1,1,0,1);
   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
                                                 return BYTE(0,1,0,1,1,1,0,1);

   case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
                                                 return BYTE(1,1,1,1,0,1,1,1);
   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
                                                 return BYTE(1,1,0,1,0,1,1,1);
   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
                                                 return BYTE(0,1,1,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
                                                 return BYTE(0,1,0,1,0,1,1,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
                                                 return BYTE(1,1,1,1,0,1,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
                                                 return BYTE(1,1,0,1,0,1,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
                                                 return BYTE(0,1,1,1,0,1,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
                                                 return BYTE(0,1,0,1,0,1,0,1);

   case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
                                                 return BYTE(0,0,0,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
                                                 return BYTE(0,0,0,1,1,1,0,1);
   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
                                                 return BYTE(0,0,0,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
                                                 return BYTE(0,0,0,1,0,1,0,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
                                                 return BYTE(1,1,1,1,0,0,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
                                                 return BYTE(1,1,0,1,0,0,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
                                                 return BYTE(0,1,1,1,0,0,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
                                                 return BYTE(0,1,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
                                                 return BYTE(0,0,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
                                                 return BYTE(0,0,0,0,0,0,0,1);

   default: return BYTE(0,0,0,0,0,0,0,0);
                   /* INVALID - any valid descr produces at least one
                      valid bit in tree[0..7]*/
   }
   /* NOTREACHED*/
   tl_assert(0);

#  undef DESCR
#  undef BYTE
}

__attribute__((unused))
static Bool is_sane_Descr ( UShort descr ) {
   return descr_to_validbits(descr) != 0;
}

static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
   VG_(sprintf)(dst,
                "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
                (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
   );
}
static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
   VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
                     (Int)((byte & 128) ? 1 : 0),
                     (Int)((byte &  64) ? 1 : 0),
                     (Int)((byte &  32) ? 1 : 0),
                     (Int)((byte &  16) ? 1 : 0),
                     (Int)((byte &   8) ? 1 : 0),
                     (Int)((byte &   4) ? 1 : 0),
                     (Int)((byte &   2) ? 1 : 0),
                     (Int)((byte &   1) ? 1 : 0)
   );
}

static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
   Word  i;
   UChar validbits = descr_to_validbits(descr);
   HChar buf[128], buf2[128];
   if (validbits == 0)
      goto bad;
   for (i = 0; i < 8; i++) {
      if (validbits & (1<<i)) {
         if (tree[i] == SVal_INVALID)
            goto bad;
      } else {
         if (tree[i] != SVal_INVALID)
            goto bad;
      }
   }
   return True;
  bad:
   sprintf_Descr( buf, descr );
   sprintf_Byte( buf2, validbits );
   VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
   VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
   VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
   for (i = 0; i < 8; i++)
      VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
   VG_(printf)("%s","}\n");
   return 0;
}

static Bool is_sane_CacheLine ( CacheLine* cl )
{
   Word tno, cloff;

   if (!cl) goto bad;

   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = cl->descrs[tno];
      SVal*  tree  = &cl->svals[cloff];
      if (!is_sane_Descr_and_Tree(descr, tree))
         goto bad;
   }
   tl_assert(cloff == N_LINE_ARANGE);
   return True;
  bad:
   pp_CacheLine(cl);
   return False;
}

static UShort normalise_tree ( /*MOD*/SVal* tree )
{
   UShort descr;
   /* pre: incoming tree[0..7] does not have any invalid shvals, in
      particular no zeroes. */
   if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
                || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
                || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
                || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
      tl_assert(0);

   descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
           | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
           | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   /* build 16-bit layer */
   if (tree[1] == tree[0]) {
      tree[1] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
      descr |= TREE_DESCR_16_0;
   }
   if (tree[3] == tree[2]) {
      tree[3] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
      descr |= TREE_DESCR_16_1;
   }
   if (tree[5] == tree[4]) {
      tree[5] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
      descr |= TREE_DESCR_16_2;
   }
   if (tree[7] == tree[6]) {
      tree[7] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
      descr |= TREE_DESCR_16_3;
   }
   /* build 32-bit layer */
   if (tree[2] == tree[0]
       && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
      tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
      descr |= TREE_DESCR_32_0;
   }
   if (tree[6] == tree[4]
       && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
      tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
      descr |= TREE_DESCR_32_1;
   }
   /* build 64-bit layer */
   if (tree[4] == tree[0]
       && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
      tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
      descr |= TREE_DESCR_64;
   }
   return descr;
}
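
/* Illustrative sketch (not compiled): a group of 8 identical leaf
   values collapses all the way to a single 64-bit node, leaving only
   tree[0] valid and descr == TREE_DESCR_64. */
#if 0
static void example_normalise_uniform_tree ( SVal sv )
{
   SVal   tree[8];
   Word   i;
   UShort descr;
   tl_assert(sv != SVal_INVALID);
   for (i = 0; i < 8; i++) tree[i] = sv;
   descr = normalise_tree( tree );
   tl_assert(descr == TREE_DESCR_64);
   tl_assert(tree[0] == sv);   /* tree[1..7] are now SVal_INVALID */
}
#endif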

/* This takes a cacheline where all the data is at the leaves
   (w8[..]) and builds a correctly normalised tree. */
static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
{
   Word tno, cloff;
   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      SVal* tree = &cl->svals[cloff];
      cl->descrs[tno] = normalise_tree( tree );
   }
   tl_assert(cloff == N_LINE_ARANGE);
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   stats__cline_normalises++;
}


typedef struct { UChar count; SVal sval; } CountedSVal;

static
void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
                               /*OUT*/Word* dstUsedP,
                               Word nDst, CacheLine* src )
{
   Word  tno, cloff, dstUsed;

   tl_assert(nDst == N_LINE_ARANGE);
   dstUsed = 0;

   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = src->descrs[tno];
      SVal*  tree  = &src->svals[cloff];

      /* sequentialise the tree described by (descr,tree). */
#     define PUT(_n,_v)                                \
         do { dst[dstUsed  ].count = (_n);             \
              dst[dstUsed++].sval  = (_v);             \
         } while (0)

      /* byte 0 */
      if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
      if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
      if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
      if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
      /* byte 1 */
      if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
      /* byte 2 */
      if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
      if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
      /* byte 3 */
      if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
      /* byte 4 */
      if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
      if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
      if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
      /* byte 5 */
      if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
      /* byte 6 */
      if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
      if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
      /* byte 7 */
      if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);

#     undef PUT
      /* END sequentialise the tree described by (descr,tree). */

   }
   tl_assert(cloff == N_LINE_ARANGE);
   tl_assert(dstUsed <= nDst);

   *dstUsedP = dstUsed;
}
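
/* Illustrative sketch (not compiled): for a cache line whose eight
   trees have all collapsed to single 64-bit nodes, sequentialisation
   produces just N_LINE_TREES entries, each with a count of 8. */
#if 0
static void example_sequentialise_uniform ( CacheLine* cl )
{
   /* Assumes every cl->descrs[] entry is exactly TREE_DESCR_64. */
   CountedSVal csvals[N_LINE_ARANGE];
   Word        used = 0;
   sequentialise_CacheLine( csvals, &used, N_LINE_ARANGE, cl );
   tl_assert(used == N_LINE_TREES);
   tl_assert(csvals[0].count == 8);
}
#endif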
1264
1265/* Write the cacheline 'wix' to backing store.  Where it ends up
1266   is determined by its tag field. */
1267static __attribute__((noinline)) void cacheline_wback ( UWord wix )
1268{
1269   Word        i, j, k, m;
1270   Addr        tag;
1271   SecMap*     sm;
1272   CacheLine*  cl;
1273   LineZ* lineZ;
1274   LineF* lineF;
1275   Word        zix, fix, csvalsUsed;
1276   CountedSVal csvals[N_LINE_ARANGE];
1277   SVal        sv;
1278
1279   if (0)
1280   VG_(printf)("scache wback line %d\n", (Int)wix);
1281
1282   tl_assert(wix >= 0 && wix < N_WAY_NENT);
1283
1284   tag =  cache_shmem.tags0[wix];
1285   cl  = &cache_shmem.lyns0[wix];
1286
1287   /* The cache line may have been invalidated; if so, ignore it. */
1288   if (!is_valid_scache_tag(tag))
1289      return;
1290
1291   /* Where are we going to put it? */
1292   sm         = NULL;
1293   lineZ      = NULL;
1294   lineF      = NULL;
1295   zix = fix = -1;
1296
1297   /* find the Z line to write in and rcdec it or the associated F
1298      line. */
1299   find_Z_for_writing( &sm, &zix, tag );
1300
1301   tl_assert(sm);
1302   tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
1303   lineZ = &sm->linesZ[zix];
1304
1305   /* Generate the data to be stored */
1306   if (CHECK_ZSM)
1307      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1308
1309   csvalsUsed = -1;
1310   sequentialise_CacheLine( csvals, &csvalsUsed,
1311                            N_LINE_ARANGE, cl );
1312   tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
1313   if (0) VG_(printf)("%lu ", csvalsUsed);
1314
1315   lineZ->dict[0] = lineZ->dict[1]
1316                  = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1317
1318   /* i indexes actual shadow values, k is cursor in csvals */
1319   i = 0;
1320   for (k = 0; k < csvalsUsed; k++) {
1321
1322      sv = csvals[k].sval;
1323      if (CHECK_ZSM)
1324         tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1325      /* do we already have it? */
1326      if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
1327      if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
1328      if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
1329      if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
1330      /* no.  look for a free slot. */
1331      if (CHECK_ZSM)
1332         tl_assert(sv != SVal_INVALID);
1333      if (lineZ->dict[0]
1334          == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
1335      if (lineZ->dict[1]
1336          == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
1337      if (lineZ->dict[2]
1338          == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
1339      if (lineZ->dict[3]
1340          == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
1341      break; /* we'll have to use the f rep */
1342     dict_ok:
1343      m = csvals[k].count;
1344      if (m == 8) {
1345         write_twobit_array( lineZ->ix2s, i+0, j );
1346         write_twobit_array( lineZ->ix2s, i+1, j );
1347         write_twobit_array( lineZ->ix2s, i+2, j );
1348         write_twobit_array( lineZ->ix2s, i+3, j );
1349         write_twobit_array( lineZ->ix2s, i+4, j );
1350         write_twobit_array( lineZ->ix2s, i+5, j );
1351         write_twobit_array( lineZ->ix2s, i+6, j );
1352         write_twobit_array( lineZ->ix2s, i+7, j );
1353         i += 8;
1354      }
1355      else if (m == 4) {
1356         write_twobit_array( lineZ->ix2s, i+0, j );
1357         write_twobit_array( lineZ->ix2s, i+1, j );
1358         write_twobit_array( lineZ->ix2s, i+2, j );
1359         write_twobit_array( lineZ->ix2s, i+3, j );
1360         i += 4;
1361      }
1362      else if (m == 1) {
1363         write_twobit_array( lineZ->ix2s, i+0, j );
1364         i += 1;
1365      }
1366      else if (m == 2) {
1367         write_twobit_array( lineZ->ix2s, i+0, j );
1368         write_twobit_array( lineZ->ix2s, i+1, j );
1369         i += 2;
1370      }
1371      else {
1372         tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
1373      }
1374
1375   }
1376
1377   if (LIKELY(i == N_LINE_ARANGE)) {
1378      /* Construction of the compressed representation was
1379         successful. */
1380      rcinc_LineZ(lineZ);
1381      stats__cache_Z_wbacks++;
1382   } else {
1383      /* Cannot use the compressed(z) representation.  Use the full(f)
1384         rep instead. */
1385      tl_assert(i >= 0 && i < N_LINE_ARANGE);
1386      alloc_F_for_writing( sm, &fix );
1387      tl_assert(sm->linesF);
1388      tl_assert(sm->linesF_size > 0);
1389      tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
1390      lineF = &sm->linesF[fix];
1391      tl_assert(!lineF->inUse);
1392      lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1393      lineZ->dict[1] = (SVal)fix;
1394      lineF->inUse = True;
1395      i = 0;
1396      for (k = 0; k < csvalsUsed; k++) {
1397         if (CHECK_ZSM)
1398            tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1399         sv = csvals[k].sval;
1400         if (CHECK_ZSM)
1401            tl_assert(sv != SVal_INVALID);
1402         for (m = csvals[k].count; m > 0; m--) {
1403            lineF->w64s[i] = sv;
1404            i++;
1405         }
1406      }
1407      tl_assert(i == N_LINE_ARANGE);
1408      rcinc_LineF(lineF);
1409      stats__cache_F_wbacks++;
1410   }
1411}
1412
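/* A minimal illustrative sketch, not part of the library, of the
   compressed (Z) line format that cacheline_wback builds above: up to
   four distinct shadow values live in .dict[0..3], and .ix2s holds one
   2-bit dictionary index per byte of the line.  Guarded out; the
   shadow value 0x31337 is an arbitrary stand-in, and the sketch
   assumes write/read_twobit_array simply store and return the 2-bit
   index at the given byte position. */
#if 0
static void example_Z_rep_roundtrip ( void )
{
   Word  i;
   LineZ lz;
   lz.dict[0] = (SVal)0x31337;   /* arbitrary example value */
   lz.dict[1] = lz.dict[2] = lz.dict[3] = SVal_INVALID;
   /* Mark every byte of the line as having shadow value .dict[0]. */
   for (i = 0; i < N_LINE_ARANGE; i++)
      write_twobit_array( lz.ix2s, i, 0 );
   /* Decoding reverses the mapping: index -> dictionary -> SVal. */
   for (i = 0; i < N_LINE_ARANGE; i++)
      tl_assert( lz.dict[ read_twobit_array(lz.ix2s, i) ]
                 == (SVal)0x31337 );
}
#endif
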
1413/* Fetch the cacheline 'wix' from the backing store.  The tag
1414   associated with 'wix' is assumed to have already been filled in;
1415   hence that is used to determine where in the backing store to read
1416   from. */
1417static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
1418{
1419   Word       i;
1420   Addr       tag;
1421   CacheLine* cl;
1422   LineZ*     lineZ;
1423   LineF*     lineF;
1424
1425   if (0)
1426      VG_(printf)("scache fetch line %d\n", (Int)wix);
1427
1428   tl_assert(wix >= 0 && wix < N_WAY_NENT);
1429
1430   tag =  cache_shmem.tags0[wix];
1431   cl  = &cache_shmem.lyns0[wix];
1432
1433   /* reject nonsense requests */
1434   tl_assert(is_valid_scache_tag(tag));
1435
1436   lineZ = NULL;
1437   lineF = NULL;
1438   find_ZF_for_reading( &lineZ, &lineF, tag );
1439   tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
1440
1441   /* expand the data into the bottom layer of the tree, then get
1442      cacheline_normalise to build the descriptor array. */
1443   if (lineF) {
1444      tl_assert(lineF->inUse);
1445      for (i = 0; i < N_LINE_ARANGE; i++) {
1446         cl->svals[i] = lineF->w64s[i];
1447      }
1448      stats__cache_F_fetches++;
1449   } else {
1450      for (i = 0; i < N_LINE_ARANGE; i++) {
1451         SVal sv;
1452         UWord ix = read_twobit_array( lineZ->ix2s, i );
1453         /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
1454         sv = lineZ->dict[ix];
1455         tl_assert(sv != SVal_INVALID);
1456         cl->svals[i] = sv;
1457      }
1458      stats__cache_Z_fetches++;
1459   }
1460   normalise_CacheLine( cl );
1461}
1462
1463static void shmem__invalidate_scache ( void ) {
1464   Word wix;
1465   if (0) VG_(printf)("%s","scache inval\n");
1466   tl_assert(!is_valid_scache_tag(1));
1467   for (wix = 0; wix < N_WAY_NENT; wix++) {
1468      cache_shmem.tags0[wix] = 1/*INVALID*/;
1469   }
1470   stats__cache_invals++;
1471}
1472
1473static void shmem__flush_and_invalidate_scache ( void ) {
1474   Word wix;
1475   Addr tag;
1476   if (0) VG_(printf)("%s","scache flush and invalidate\n");
1477   tl_assert(!is_valid_scache_tag(1));
1478   for (wix = 0; wix < N_WAY_NENT; wix++) {
1479      tag = cache_shmem.tags0[wix];
1480      if (tag == 1/*INVALID*/) {
1481         /* already invalid; nothing to do */
1482      } else {
1483         tl_assert(is_valid_scache_tag(tag));
1484         cacheline_wback( wix );
1485      }
1486      cache_shmem.tags0[wix] = 1/*INVALID*/;
1487   }
1488   stats__cache_flushes++;
1489   stats__cache_invals++;
1490}
1491
1492
1493static inline Bool aligned16 ( Addr a ) {
1494   return 0 == (a & 1);
1495}
1496static inline Bool aligned32 ( Addr a ) {
1497   return 0 == (a & 3);
1498}
1499static inline Bool aligned64 ( Addr a ) {
1500   return 0 == (a & 7);
1501}
1502static inline UWord get_cacheline_offset ( Addr a ) {
1503   return (UWord)(a & (N_LINE_ARANGE - 1));
1504}
1505static inline Addr cacheline_ROUNDUP ( Addr a ) {
1506   return ROUNDUP(a, N_LINE_ARANGE);
1507}
1508static inline Addr cacheline_ROUNDDN ( Addr a ) {
1509   return ROUNDDN(a, N_LINE_ARANGE);
1510}
1511static inline UWord get_treeno ( Addr a ) {
1512   return get_cacheline_offset(a) >> 3;
1513}
1514static inline UWord get_tree_offset ( Addr a ) {
1515   return a & 7;
1516}
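
/* An illustrative, guarded-out sketch of how the helpers above
   decompose an address: the low bits select a byte within its cache
   line, which in turn splits into an 8-byte tree number and a byte
   offset within that tree.  'a' is an arbitrary caller-supplied
   address. */
#if 0
static void example_addr_decomposition ( Addr a )
{
   UWord off    = get_cacheline_offset(a);  /* 0 .. N_LINE_ARANGE-1 */
   UWord treeno = get_treeno(a);            /* which 8-byte tree */
   UWord toff   = get_tree_offset(a);       /* byte within that tree */
   tl_assert(off == 8 * treeno + toff);
   if (aligned64(a))
      tl_assert(toff == 0);
}
#endif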
1517
1518static __attribute__((noinline))
1519       CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
1520static inline CacheLine* get_cacheline ( Addr a )
1521{
1522   /* tag is 'a' with the in-line offset masked out,
1523      eg a[31]..a[4] 0000 */
1524   Addr       tag = a & ~(N_LINE_ARANGE - 1);
1525   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1526   stats__cache_totrefs++;
1527   if (LIKELY(tag == cache_shmem.tags0[wix])) {
1528      return &cache_shmem.lyns0[wix];
1529   } else {
1530      return get_cacheline_MISS( a );
1531   }
1532}
1533
1534static __attribute__((noinline))
1535       CacheLine* get_cacheline_MISS ( Addr a )
1536{
1537   /* tag is 'a' with the in-line offset masked out,
1538      eg a[31]..a[4] 0000 */
1539
1540   CacheLine* cl;
1541   Addr*      tag_old_p;
1542   Addr       tag = a & ~(N_LINE_ARANGE - 1);
1543   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1544
1545   tl_assert(tag != cache_shmem.tags0[wix]);
1546
1547   /* Dump the old line into the backing store. */
1548   stats__cache_totmisses++;
1549
1550   cl        = &cache_shmem.lyns0[wix];
1551   tag_old_p = &cache_shmem.tags0[wix];
1552
1553   if (is_valid_scache_tag( *tag_old_p )) {
1554      /* EXPENSIVE and REDUNDANT: callee does it */
1555      if (CHECK_ZSM)
1556         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1557      cacheline_wback( wix );
1558   }
1559   /* and reload the new one */
1560   *tag_old_p = tag;
1561   cacheline_fetch( wix );
1562   if (CHECK_ZSM)
1563      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1564   return cl;
1565}
1566
1567static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1568   stats__cline_64to32pulldown++;
1569   switch (toff) {
1570      case 0: case 4:
1571         tl_assert(descr & TREE_DESCR_64);
1572         tree[4] = tree[0];
1573         descr &= ~TREE_DESCR_64;
1574         descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
1575         break;
1576      default:
1577         tl_assert(0);
1578   }
1579   return descr;
1580}
1581
1582static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1583   stats__cline_32to16pulldown++;
1584   switch (toff) {
1585      case 0: case 2:
1586         if (!(descr & TREE_DESCR_32_0)) {
1587            descr = pulldown_to_32(tree, 0, descr);
1588         }
1589         tl_assert(descr & TREE_DESCR_32_0);
1590         tree[2] = tree[0];
1591         descr &= ~TREE_DESCR_32_0;
1592         descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
1593         break;
1594      case 4: case 6:
1595         if (!(descr & TREE_DESCR_32_1)) {
1596            descr = pulldown_to_32(tree, 4, descr);
1597         }
1598         tl_assert(descr & TREE_DESCR_32_1);
1599         tree[6] = tree[4];
1600         descr &= ~TREE_DESCR_32_1;
1601         descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
1602         break;
1603      default:
1604         tl_assert(0);
1605   }
1606   return descr;
1607}
1608
1609static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1610   stats__cline_16to8pulldown++;
1611   switch (toff) {
1612      case 0: case 1:
1613         if (!(descr & TREE_DESCR_16_0)) {
1614            descr = pulldown_to_16(tree, 0, descr);
1615         }
1616         tl_assert(descr & TREE_DESCR_16_0);
1617         tree[1] = tree[0];
1618         descr &= ~TREE_DESCR_16_0;
1619         descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
1620         break;
1621      case 2: case 3:
1622         if (!(descr & TREE_DESCR_16_1)) {
1623            descr = pulldown_to_16(tree, 2, descr);
1624         }
1625         tl_assert(descr & TREE_DESCR_16_1);
1626         tree[3] = tree[2];
1627         descr &= ~TREE_DESCR_16_1;
1628         descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
1629         break;
1630      case 4: case 5:
1631         if (!(descr & TREE_DESCR_16_2)) {
1632            descr = pulldown_to_16(tree, 4, descr);
1633         }
1634         tl_assert(descr & TREE_DESCR_16_2);
1635         tree[5] = tree[4];
1636         descr &= ~TREE_DESCR_16_2;
1637         descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
1638         break;
1639      case 6: case 7:
1640         if (!(descr & TREE_DESCR_16_3)) {
1641            descr = pulldown_to_16(tree, 6, descr);
1642         }
1643         tl_assert(descr & TREE_DESCR_16_3);
1644         tree[7] = tree[6];
1645         descr &= ~TREE_DESCR_16_3;
1646         descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
1647         break;
1648      default:
1649         tl_assert(0);
1650   }
1651   return descr;
1652}
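
/* A worked example, guarded out and illustrative only, of the pulldown
   machinery above.  Starting from a tree whose descriptor says "one
   valid 64-bit value at tree[0]", asking for byte 3 splits that value
   through the 32-bit and 16-bit levels until an 8-bit leaf covers the
   requested offset: bytes 4-7 end up under a single 32-bit leaf, bytes
   0-1 under a 16-bit leaf, and bytes 2 and 3 under 8-bit leaves. */
#if 0
static void example_pulldown_64_to_8 ( void )
{
   SVal   tree[8];
   UShort descr = TREE_DESCR_64;  /* only tree[0] is meaningful */
   tree[0] = SVal_INVALID;        /* arbitrary stand-in value */
   descr = pulldown_to_8( tree, 3/*byte offset*/, descr );
   tl_assert(descr == (TREE_DESCR_32_1 | TREE_DESCR_16_0
                       | TREE_DESCR_8_3 | TREE_DESCR_8_2));
}
#endif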
1653
1654
1655static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
1656   UShort mask;
1657   switch (toff) {
1658      case 0:
1659         mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
1660         tl_assert( (descr & mask) == mask );
1661         descr &= ~mask;
1662         descr |= TREE_DESCR_16_0;
1663         break;
1664      case 2:
1665         mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
1666         tl_assert( (descr & mask) == mask );
1667         descr &= ~mask;
1668         descr |= TREE_DESCR_16_1;
1669         break;
1670      case 4:
1671         mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
1672         tl_assert( (descr & mask) == mask );
1673         descr &= ~mask;
1674         descr |= TREE_DESCR_16_2;
1675         break;
1676      case 6:
1677         mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
1678         tl_assert( (descr & mask) == mask );
1679         descr &= ~mask;
1680         descr |= TREE_DESCR_16_3;
1681         break;
1682      default:
1683         tl_assert(0);
1684   }
1685   return descr;
1686}
1687
1688static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
1689   UShort mask;
1690   switch (toff) {
1691      case 0:
1692         if (!(descr & TREE_DESCR_16_0))
1693            descr = pullup_descr_to_16(descr, 0);
1694         if (!(descr & TREE_DESCR_16_1))
1695            descr = pullup_descr_to_16(descr, 2);
1696         mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
1697         tl_assert( (descr & mask) == mask );
1698         descr &= ~mask;
1699         descr |= TREE_DESCR_32_0;
1700         break;
1701      case 4:
1702         if (!(descr & TREE_DESCR_16_2))
1703            descr = pullup_descr_to_16(descr, 4);
1704         if (!(descr & TREE_DESCR_16_3))
1705            descr = pullup_descr_to_16(descr, 6);
1706         mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
1707         tl_assert( (descr & mask) == mask );
1708         descr &= ~mask;
1709         descr |= TREE_DESCR_32_1;
1710         break;
1711      default:
1712         tl_assert(0);
1713   }
1714   return descr;
1715}
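
/* The reverse direction, also as a guarded-out illustration: when all
   eight bytes are individually valid, pulling offset 0 up to 32-bit
   granularity merges bytes 0-3 into a single 32-bit leaf while leaving
   bytes 4-7 as 8-bit leaves.  Note the pullup functions only rewrite
   the descriptor; the tree values themselves are untouched. */
#if 0
static void example_pullup_8s_to_32 ( void )
{
   UShort descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
                  | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
                  | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   descr = pullup_descr_to_32( descr, 0/*byte offset*/ );
   tl_assert(descr == (TREE_DESCR_32_0 | TREE_DESCR_8_7 | TREE_DESCR_8_6
                       | TREE_DESCR_8_5 | TREE_DESCR_8_4));
}
#endif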
1716
1717static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
1718   switch (toff) {
1719      case 0: case 4:
1720         return 0 != (descr & TREE_DESCR_64);
1721      default:
1722         tl_assert(0);
1723   }
1724}
1725
1726static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
1727   switch (toff) {
1728      case 0:
1729         return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
1730      case 2:
1731         return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
1732      case 4:
1733         return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
1734      case 6:
1735         return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
1736      default:
1737         tl_assert(0);
1738   }
1739}
1740
1741/* ------------ Cache management ------------ */
1742
1743static void zsm_flush_cache ( void )
1744{
1745   shmem__flush_and_invalidate_scache();
1746}
1747
1748
1749static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
1750{
1751   tl_assert( sizeof(UWord) == sizeof(Addr) );
1752
1753   rcinc = p_rcinc;
1754   rcdec = p_rcdec;
1755
1756   tl_assert(map_shmem == NULL);
1757   map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
1758                           HG_(free),
1759                           NULL/*unboxed UWord cmp*/);
1760   tl_assert(map_shmem != NULL);
1761   shmem__invalidate_scache();
1762
1763   /* a SecMap must contain an integral number of CacheLines */
1764   tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
1765   /* also ... a CacheLine holds an integral number of trees */
1766   tl_assert(0 == (N_LINE_ARANGE % 8));
1767}
1768
1769/////////////////////////////////////////////////////////////////
1770/////////////////////////////////////////////////////////////////
1771//                                                             //
1772// SECTION END compressed shadow memory                        //
1773//                                                             //
1774/////////////////////////////////////////////////////////////////
1775/////////////////////////////////////////////////////////////////
1776
1777
1778
1779/////////////////////////////////////////////////////////////////
1780/////////////////////////////////////////////////////////////////
1781//                                                             //
1782// SECTION BEGIN vts primitives                                //
1783//                                                             //
1784/////////////////////////////////////////////////////////////////
1785/////////////////////////////////////////////////////////////////
1786
1787
1788/* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
1789   being compact stand-ins for Thr*'s.  Use these functions to map
1790   between them. */
1791static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
1792static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */
1793
1794__attribute__((noreturn))
1795static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
1796{
1797   if (due_to_nThrs) {
1798      HChar* s =
1799         "\n"
1800         "Helgrind: cannot continue, run aborted: too many threads.\n"
1801         "Sorry.  Helgrind can only handle programs that create\n"
1802         "%'llu or fewer threads over their entire lifetime.\n"
1803         "\n";
1804      VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
1805   } else {
1806      HChar* s =
1807         "\n"
1808         "Helgrind: cannot continue, run aborted: too many\n"
1809         "synchronisation events.  Sorry. Helgrind can only handle\n"
1810         "programs which perform %'llu or fewer\n"
1811         "inter-thread synchronisation events (locks, unlocks, etc).\n"
1812         "\n";
1813      VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
1814   }
1815   VG_(exit)(1);
1816   /*NOTREACHED*/
1817   tl_assert(0); /*wtf?!*/
1818}
1819
1820
1821/* The dead thread (ThrID, actually) table.  A thread may only be
1822   listed here if we have been notified thereof by libhb_async_exit.
1823   New entries are added at the end.  The order isn't important, but
1824   the ThrID values must be unique.  This table lists the identity of
1825   all threads that have ever died -- none are ever removed.  We keep
1826   this table so as to be able to prune entries from VTSs.  We don't
1827   actually need to keep the set of threads that have ever died --
1828   only the threads that have died since the previous round of
1829   pruning.  But it's useful for sanity check purposes to keep the
1830   entire set, so we do. */
1831static XArray* /* of ThrID */ verydead_thread_table = NULL;
1832
1833/* Arbitrary total ordering on ThrIDs. */
1834static Int cmp__ThrID ( void* v1, void* v2 ) {
1835   ThrID id1 = *(ThrID*)v1;
1836   ThrID id2 = *(ThrID*)v2;
1837   if (id1 < id2) return -1;
1838   if (id1 > id2) return 1;
1839   return 0;
1840}
1841
1842static void verydead_thread_table_init ( void )
1843{
1844   tl_assert(!verydead_thread_table);
1845   verydead_thread_table
1846     = VG_(newXA)( HG_(zalloc),
1847                   "libhb.verydead_thread_table_init.1",
1848                   HG_(free), sizeof(ThrID) );
1849   tl_assert(verydead_thread_table);
1850   VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
1851}
1852
1853
1854/* A VTS contains .ts, its vector clock, and also .id, a field to hold
1855   a backlink for the caller's convenience.  Since we have no idea
1856   what to set that to in the library, it always gets set to
1857   VtsID_INVALID. */
1858typedef
1859   struct {
1860      VtsID    id;
1861      UInt     usedTS;
1862      UInt     sizeTS;
1863      ScalarTS ts[0];
1864   }
1865   VTS;
1866
1867/* Allocate a VTS capable of storing 'sizeTS' entries. */
1868static VTS* VTS__new ( HChar* who, UInt sizeTS );
1869
1870/* Make a clone of 'vts', sizing the new array to exactly match the
1871   number of ScalarTSs present. */
1872static VTS* VTS__clone ( HChar* who, VTS* vts );
1873
1874/* Make a clone of 'vts' with the thrids in 'thrids' removed.  The new
1875   array is sized exactly to hold the number of required elements.
1876   'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
1877   must be in strictly increasing order. */
1878static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel );
1879
1880/* Delete this VTS in its entirety. */
1881static void VTS__delete ( VTS* vts );
1882
1883/* Create a new singleton VTS in 'out'.  Caller must have
1884   pre-allocated 'out' sufficiently big to hold the result in all
1885   possible cases. */
1886static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
1887
1888/* Create in 'out' a VTS which is the same as 'vts' except with
1889   vts[me]++, so to speak.  Caller must have pre-allocated 'out'
1890   sufficiently big to hold the result in all possible cases. */
1891static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
1892
1893/* Create in 'out' a VTS which is the join (max) of 'a' and
1894   'b'. Caller must have pre-allocated 'out' sufficiently big to hold
1895   the result in all possible cases. */
1896static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
1897
1898/* Compute the partial ordering relation of the two args.  Although we
1899   could be completely general and return an enumeration value (EQ,
1900   LT, GT, UN), in fact we only need LEQ, and so we may as well
1901   hardwire that fact.
1902
1903   Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
1904   invalid ThrID).  In the latter case, the returned ThrID indicates
1905   a point (thread) at which LEQ fails to hold.  There may be more
1906   than one such point, but we only care about seeing one of them, not
1907   all of them.  This rather strange convention is used because
1908   sometimes we want to know the actual index at which they first
1909   differ. */
1910static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
1911
1912/* Compute an arbitrary structural (total) ordering on the two args,
1913   based on their VCs, so they can be looked up in a table, tree, etc.
1914   Returns -1, 0 or 1. */
1915static Word VTS__cmp_structural ( VTS* a, VTS* b );
1916
1917/* Debugging only.  Display the given VTS in the buffer. */
1918static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
1919
1920/* Debugging only.  Return vts[index], so to speak. */
1921static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
1922
1923/* Notify the VTS machinery that a thread has been declared
1924   comprehensively dead: that is, it has done an async exit AND it has
1925   been joined with.  This should ensure that its local clocks (.viR
1926   and .viW) will never again change, and so all mentions of this
1927   thread from all VTSs in the system may be removed. */
1928static void VTS__declare_thread_very_dead ( Thr* thr );
1929
1930/*--------------- to do with Vector Timestamps ---------------*/
1931
1932static Bool is_sane_VTS ( VTS* vts )
1933{
1934   UWord     i, n;
1935   ScalarTS  *st1, *st2;
1936   if (!vts) return False;
1937   if (!vts->ts) return False;
1938   if (vts->usedTS > vts->sizeTS) return False;
1939   n = vts->usedTS;
1940   if (n == 1) {
1941      st1 = &vts->ts[0];
1942      if (st1->tym == 0)
1943         return False;
1944   }
1945   else
1946   if (n >= 2) {
1947      for (i = 0; i < n-1; i++) {
1948         st1 = &vts->ts[i];
1949         st2 = &vts->ts[i+1];
1950         if (st1->thrid >= st2->thrid)
1951            return False;
1952         if (st1->tym == 0 || st2->tym == 0)
1953            return False;
1954      }
1955   }
1956   return True;
1957}
1958
1959
1960/* Create a new, empty VTS.
1961*/
1962static VTS* VTS__new ( HChar* who, UInt sizeTS )
1963{
1964   VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
1965   tl_assert(vts->usedTS == 0);
1966   vts->sizeTS = sizeTS;
1967   *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
1968   return vts;
1969}
1970
1971/* Clone this VTS.
1972*/
1973static VTS* VTS__clone ( HChar* who, VTS* vts )
1974{
1975   tl_assert(vts);
1976   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
1977   UInt nTS = vts->usedTS;
1978   VTS* clone = VTS__new(who, nTS);
1979   clone->id = vts->id;
1980   clone->sizeTS = nTS;
1981   clone->usedTS = nTS;
1982   UInt i;
1983   for (i = 0; i < nTS; i++) {
1984      clone->ts[i] = vts->ts[i];
1985   }
1986   tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
1987   return clone;
1988}
1989
1990
1991/* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
1992   must be in strictly increasing order.  We could obviously do this
1993   much more efficiently (in linear time) if necessary.
1994*/
1995static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel )
1996{
1997   UInt i, j;
1998   tl_assert(vts);
1999   tl_assert(thridsToDel);
2000   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2001   UInt nTS = vts->usedTS;
2002   /* Figure out how many ScalarTSs will remain in the output. */
2003   UInt nReq = nTS;
2004   for (i = 0; i < nTS; i++) {
2005      ThrID thrid = vts->ts[i].thrid;
2006      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2007         nReq--;
2008   }
2009   tl_assert(nReq <= nTS);
2010   /* Copy the ones that will remain. */
2011   VTS* res = VTS__new(who, nReq);
2012   j = 0;
2013   for (i = 0; i < nTS; i++) {
2014      ThrID thrid = vts->ts[i].thrid;
2015      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2016         continue;
2017      res->ts[j++] = vts->ts[i];
2018   }
2019   tl_assert(j == nReq);
2020   tl_assert(j == res->sizeTS);
2021   res->usedTS = j;
2022   tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
2023   return res;
2024}
2025
2026
2027/* Delete this VTS in its entirety.
2028*/
2029static void VTS__delete ( VTS* vts )
2030{
2031   tl_assert(vts);
2032   tl_assert(vts->usedTS <= vts->sizeTS);
2033   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2034   HG_(free)(vts);
2035}
2036
2037
2038/* Create a new singleton VTS.
2039*/
2040static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
2041{
2042   tl_assert(thr);
2043   tl_assert(tym >= 1);
2044   tl_assert(out);
2045   tl_assert(out->usedTS == 0);
2046   tl_assert(out->sizeTS >= 1);
2047   UInt hi = out->usedTS++;
2048   out->ts[hi].thrid = Thr__to_ThrID(thr);
2049   out->ts[hi].tym   = tym;
2050}
2051
2052
2053/* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
2054   not modified.
2055*/
2056static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
2057{
2058   UInt      i, n;
2059   ThrID     me_thrid;
2060   Bool      found = False;
2061
2062   stats__vts__tick++;
2063
2064   tl_assert(out);
2065   tl_assert(out->usedTS == 0);
2066   if (vts->usedTS >= ThrID_MAX_VALID)
2067      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2068   tl_assert(out->sizeTS >= 1 + vts->usedTS);
2069
2070   tl_assert(me);
2071   me_thrid = Thr__to_ThrID(me);
2072   tl_assert(is_sane_VTS(vts));
2073   n = vts->usedTS;
2074
2075   /* Copy all entries which precede 'me'. */
2076   for (i = 0; i < n; i++) {
2077      ScalarTS* here = &vts->ts[i];
2078      if (UNLIKELY(here->thrid >= me_thrid))
2079         break;
2080      UInt hi = out->usedTS++;
2081      out->ts[hi] = *here;
2082   }
2083
2084   /* 'i' now indicates the next entry to copy, if any.
2085       There are 3 possibilities:
2086       (a) there is no next entry (we used them all up already):
2087           add (me_thrid,1) to the output, and quit
2088       (b) there is a next entry, and its thrid > me_thrid:
2089           add (me_thrid,1) to the output, then copy the remaining entries
2090       (c) there is a next entry, and its thrid == me_thrid:
2091           copy it to the output but increment its timestamp value.
2092           Then copy the remaining entries.  (c) is the common case.
2093   */
2094   tl_assert(i >= 0 && i <= n);
2095   if (i == n) { /* case (a) */
2096      UInt hi = out->usedTS++;
2097      out->ts[hi].thrid = me_thrid;
2098      out->ts[hi].tym   = 1;
2099   } else {
2100      /* cases (b) and (c) */
2101      ScalarTS* here = &vts->ts[i];
2102      if (me_thrid == here->thrid) { /* case (c) */
2103         if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
2104            /* We're hosed.  We have to stop. */
2105            scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
2106         }
2107         UInt hi = out->usedTS++;
2108         out->ts[hi].thrid = here->thrid;
2109         out->ts[hi].tym   = here->tym + 1;
2110         i++;
2111         found = True;
2112      } else { /* case (b) */
2113         UInt hi = out->usedTS++;
2114         out->ts[hi].thrid = me_thrid;
2115         out->ts[hi].tym   = 1;
2116      }
2117      /* And copy any remaining entries. */
2118      for (/*keepgoing*/; i < n; i++) {
2119         ScalarTS* here2 = &vts->ts[i];
2120         UInt hi = out->usedTS++;
2121         out->ts[hi] = *here2;
2122      }
2123   }
2124
2125   tl_assert(is_sane_VTS(out));
2126   tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
2127   tl_assert(out->usedTS <= out->sizeTS);
2128}
2129
2130
2131/* Return a new VTS constructed as the join (max) of the 2 args.
2132   Neither arg is modified.
2133*/
2134static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
2135{
2136   UInt     ia, ib, useda, usedb;
2137   ULong    tyma, tymb, tymMax;
2138   ThrID    thrid;
2139   UInt     ncommon = 0;
2140
2141   stats__vts__join++;
2142
2143   tl_assert(a);
2144   tl_assert(b);
2145   useda = a->usedTS;
2146   usedb = b->usedTS;
2147
2148   tl_assert(out);
2149   tl_assert(out->usedTS == 0);
2150   /* overly conservative test, but doing better involves comparing
2151      the two VTSs, which we don't want to do at this point. */
2152   if (useda + usedb >= ThrID_MAX_VALID)
2153      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2154   tl_assert(out->sizeTS >= useda + usedb);
2155
2156   ia = ib = 0;
2157
2158   while (1) {
2159
2160      /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
2161         from a and b in order, where thrid is the next ThrID
2162         occurring in either a or b, and tyma/b are the relevant
2163         scalar timestamps, taking into account implicit zeroes. */
2164      tl_assert(ia >= 0 && ia <= useda);
2165      tl_assert(ib >= 0 && ib <= usedb);
2166
2167      if        (ia == useda && ib == usedb) {
2168         /* both empty - done */
2169         break;
2170
2171      } else if (ia == useda && ib != usedb) {
2172         /* a empty, use up b */
2173         ScalarTS* tmpb = &b->ts[ib];
2174         thrid = tmpb->thrid;
2175         tyma  = 0;
2176         tymb  = tmpb->tym;
2177         ib++;
2178
2179      } else if (ia != useda && ib == usedb) {
2180         /* b empty, use up a */
2181         ScalarTS* tmpa = &a->ts[ia];
2182         thrid = tmpa->thrid;
2183         tyma  = tmpa->tym;
2184         tymb  = 0;
2185         ia++;
2186
2187      } else {
2188         /* both not empty; extract lowest-ThrID'd triple */
2189         ScalarTS* tmpa = &a->ts[ia];
2190         ScalarTS* tmpb = &b->ts[ib];
2191         if (tmpa->thrid < tmpb->thrid) {
2192            /* a has the lowest unconsidered ThrID */
2193            thrid = tmpa->thrid;
2194            tyma  = tmpa->tym;
2195            tymb  = 0;
2196            ia++;
2197         } else if (tmpa->thrid > tmpb->thrid) {
2198            /* b has the lowest unconsidered ThrID */
2199            thrid = tmpb->thrid;
2200            tyma  = 0;
2201            tymb  = tmpb->tym;
2202            ib++;
2203         } else {
2204            /* they both next mention the same ThrID */
2205            tl_assert(tmpa->thrid == tmpb->thrid);
2206            thrid = tmpa->thrid; /* == tmpb->thrid */
2207            tyma  = tmpa->tym;
2208            tymb  = tmpb->tym;
2209            ia++;
2210            ib++;
2211            ncommon++;
2212         }
2213      }
2214
2215      /* having laboriously determined (thr, tyma, tymb), do something
2216         useful with it. */
2217      tymMax = tyma > tymb ? tyma : tymb;
2218      if (tymMax > 0) {
2219         UInt hi = out->usedTS++;
2220         out->ts[hi].thrid = thrid;
2221         out->ts[hi].tym   = tymMax;
2222      }
2223
2224   }
2225
2226   tl_assert(is_sane_VTS(out));
2227   tl_assert(out->usedTS <= out->sizeTS);
2228   tl_assert(out->usedTS == useda + usedb - ncommon);
2229}
2230
2231
2232/* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
2233   they are, or the first ThrID for which they are not (no valid ThrID
2234   has the value zero).  This rather strange convention is used
2235   because sometimes we want to know the actual index at which they
2236   first differ. */
2237static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
2238{
2239   Word  ia, ib, useda, usedb;
2240   ULong tyma, tymb;
2241
2242   stats__vts__cmpLEQ++;
2243
2244   tl_assert(a);
2245   tl_assert(b);
2246   useda = a->usedTS;
2247   usedb = b->usedTS;
2248
2249   ia = ib = 0;
2250
2251   while (1) {
2252
2253      /* This logic enumerates pairs (tyma, tymb) drawn from a and b
2254         in ThrID order, where tyma/b are the relevant scalar
2255         timestamps, taking into account implicit zeroes. */
2256      ThrID thrid;
2257
2258      tl_assert(ia >= 0 && ia <= useda);
2259      tl_assert(ib >= 0 && ib <= usedb);
2260
2261      if        (ia == useda && ib == usedb) {
2262         /* both empty - done */
2263         break;
2264
2265      } else if (ia == useda && ib != usedb) {
2266         /* a empty, use up b */
2267         ScalarTS* tmpb = &b->ts[ib];
2268         tyma  = 0;
2269         tymb  = tmpb->tym;
2270         thrid = tmpb->thrid;
2271         ib++;
2272
2273      } else if (ia != useda && ib == usedb) {
2274         /* b empty, use up a */
2275         ScalarTS* tmpa = &a->ts[ia];
2276         tyma  = tmpa->tym;
2277         thrid = tmpa->thrid;
2278         tymb  = 0;
2279         ia++;
2280
2281      } else {
2282         /* both not empty; extract lowest-ThrID'd triple */
2283         ScalarTS* tmpa = &a->ts[ia];
2284         ScalarTS* tmpb = &b->ts[ib];
2285         if (tmpa->thrid < tmpb->thrid) {
2286            /* a has the lowest unconsidered ThrID */
2287            tyma  = tmpa->tym;
2288            thrid = tmpa->thrid;
2289            tymb  = 0;
2290            ia++;
2291         }
2292         else
2293         if (tmpa->thrid > tmpb->thrid) {
2294            /* b has the lowest unconsidered ThrID */
2295            tyma  = 0;
2296            tymb  = tmpb->tym;
2297            thrid = tmpb->thrid;
2298            ib++;
2299         } else {
2300            /* they both next mention the same ThrID */
2301            tl_assert(tmpa->thrid == tmpb->thrid);
2302            tyma  = tmpa->tym;
2303            thrid = tmpa->thrid;
2304            tymb  = tmpb->tym;
2305            ia++;
2306            ib++;
2307         }
2308      }
2309
2310      /* having laboriously determined (tyma, tymb), do something
2311         useful with it. */
2312      if (tyma > tymb) {
2313         /* not LEQ at this index.  Quit, since the answer is
2314            determined already. */
2315         tl_assert(thrid >= 1024);
2316         return thrid;
2317      }
2318   }
2319
2320   return 0; /* all points are LEQ => return an invalid ThrID */
2321}
2322
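/* A guarded-out sketch, illustrative only, tying the primitives above
   together.  't1' and 't2' are assumed to be two distinct, live Thr*s
   supplied by the caller, and the "example.*" allocation tags are
   arbitrary. */
#if 0
static void example_vts_usage ( Thr* t1, Thr* t2 )
{
   VTS *a, *b, *a2, *j;
   /* a = { t1:1 },  b = { t2:1 } */
   a = VTS__new( "example.a", 1 );   VTS__singleton( a, t1, 1 );
   b = VTS__new( "example.b", 1 );   VTS__singleton( b, t2, 1 );
   /* a2 = a with t1's clock advanced: { t1:2 } */
   a2 = VTS__new( "example.a2", a->usedTS + 1 );
   VTS__tick( a2, t1, a );
   /* j = join(a2, b) = { t1:2, t2:1 } */
   j = VTS__new( "example.j", a2->usedTS + b->usedTS );
   VTS__join( j, a2, b );
   /* a <= j in the partial order, but j <= a does not hold. */
   tl_assert( VTS__cmpLEQ( a, j ) == 0 );
   tl_assert( VTS__cmpLEQ( j, a ) != 0 );
   VTS__delete(a);  VTS__delete(b);  VTS__delete(a2);  VTS__delete(j);
}
#endif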
2323
2324/* Compute an arbitrary structural (total) ordering on the two args,
2325   based on their VCs, so they can be looked up in a table, tree, etc.
2326   Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
2327   performance critical so there is some effort expended to make it as
2328   fast as possible.
2329*/
2330Word VTS__cmp_structural ( VTS* a, VTS* b )
2331{
2332   /* We just need to generate an arbitrary total ordering based on
2333      a->ts and b->ts.  Preferably do it in a way which detects likely
2334      differences relatively quickly. */
2335   Word     i;
2336   Word     useda = 0,    usedb = 0;
2337   ScalarTS *ctsa = NULL, *ctsb = NULL;
2338
2339   stats__vts__cmp_structural++;
2340
2341   tl_assert(a);
2342   tl_assert(b);
2343
2344   ctsa = &a->ts[0]; useda = a->usedTS;
2345   ctsb = &b->ts[0]; usedb = b->usedTS;
2346
2347   if (LIKELY(useda == usedb)) {
2348      ScalarTS *tmpa = NULL, *tmpb = NULL;
2349      stats__vts__cmp_structural_slow++;
2350      /* Same length vectors.  Find the first difference, if any, as
2351         fast as possible. */
2352      for (i = 0; i < useda; i++) {
2353         tmpa = &ctsa[i];
2354         tmpb = &ctsb[i];
2355         if (LIKELY(tmpa->tym == tmpb->tym
2356                    && tmpa->thrid == tmpb->thrid))
2357            continue;
2358         else
2359            break;
2360      }
2361      if (UNLIKELY(i == useda)) {
2362         /* They're identical. */
2363         return 0;
2364      } else {
2365         tl_assert(i >= 0 && i < useda);
2366         if (tmpa->tym < tmpb->tym) return -1;
2367         if (tmpa->tym > tmpb->tym) return 1;
2368         if (tmpa->thrid < tmpb->thrid) return -1;
2369         if (tmpa->thrid > tmpb->thrid) return 1;
2370         /* we just established them as non-identical, hence: */
2371      }
2372      /*NOTREACHED*/
2373      tl_assert(0);
2374   }
2375
2376   if (useda < usedb) return -1;
2377   if (useda > usedb) return 1;
2378   /*NOTREACHED*/
2379   tl_assert(0);
2380}
2381
2382
2383/* Debugging only.  Display the given VTS in the buffer.
2384*/
2385void VTS__show ( HChar* buf, Int nBuf, VTS* vts )
2386{
2387   ScalarTS* st;
2388   HChar     unit[64];
2389   Word      i, n;
2390   Int       avail = nBuf;
2391   tl_assert(vts && vts->ts);
2392   tl_assert(nBuf > 16);
2393   buf[0] = '[';
2394   buf[1] = 0;
2395   n =  vts->usedTS;
2396   for (i = 0; i < n; i++) {
2397      tl_assert(avail >= 40);
2398      st = &vts->ts[i];
2399      VG_(memset)(unit, 0, sizeof(unit));
2400      VG_(sprintf)(unit, i < n-1 ? "%u:%llu " : "%u:%llu",
2401                         st->thrid, (ULong)st->tym);
2402      if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
2403         VG_(strcat)(buf, " ...]");
2404         buf[nBuf-1] = 0;
2405         return;
2406      }
2407      VG_(strcat)(buf, unit);
2408      avail -= VG_(strlen)(unit);
2409   }
2410   VG_(strcat)(buf, "]");
2411   buf[nBuf-1] = 0;
2412}
2413
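/* Guarded-out usage sketch for the debug printer above; the 128-byte
   buffer is an arbitrary size that comfortably satisfies the
   function's minimum-space assertions. */
#if 0
static void example_show_vts ( VTS* vts )
{
   HChar buf[128];
   VTS__show( buf, sizeof(buf), vts );
   VG_(printf)("VTS is %s\n", buf);
}
#endif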
2414
2415/* Debugging only.  Return vts[index], so to speak.
2416*/
2417ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
2418{
2419   UWord i, n;
2420   ThrID idx_thrid = Thr__to_ThrID(idx);
2421   stats__vts__indexat_slow++;
2422   tl_assert(vts && vts->ts);
2423   n = vts->usedTS;
2424   for (i = 0; i < n; i++) {
2425      ScalarTS* st = &vts->ts[i];
2426      if (st->thrid == idx_thrid)
2427         return st->tym;
2428   }
2429   return 0;
2430}
2431
2432
2433/* See comment on prototype above.
2434*/
2435static void VTS__declare_thread_very_dead ( Thr* thr )
2436{
2437   if (0) VG_(printf)("VTQ:  tae %p\n", thr);
2438
2439   tl_assert(thr->llexit_done);
2440   tl_assert(thr->joinedwith_done);
2441
2442   ThrID nyu;
2443   nyu = Thr__to_ThrID(thr);
2444   VG_(addToXA)( verydead_thread_table, &nyu );
2445
2446   /* We can only get here if we're assured that we'll never again
2447      need to look at this thread's ::viR or ::viW.  Set them to
2448      VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
2449      mostly so that we don't wind up pruning them (as that would be
2450      nonsensical: the only interesting ScalarTS entry for a dead
2451      thread is its own index, and the pruning will remove that). */
2452   VtsID__rcdec(thr->viR);
2453   VtsID__rcdec(thr->viW);
2454   thr->viR = VtsID_INVALID;
2455   thr->viW = VtsID_INVALID;
2456}
2457
2458
2459/////////////////////////////////////////////////////////////////
2460/////////////////////////////////////////////////////////////////
2461//                                                             //
2462// SECTION END vts primitives                                  //
2463//                                                             //
2464/////////////////////////////////////////////////////////////////
2465/////////////////////////////////////////////////////////////////
2466
2467
2468
2469/////////////////////////////////////////////////////////////////
2470/////////////////////////////////////////////////////////////////
2471//                                                             //
2472// SECTION BEGIN main library                                  //
2473//                                                             //
2474/////////////////////////////////////////////////////////////////
2475/////////////////////////////////////////////////////////////////
2476
2477
2478/////////////////////////////////////////////////////////
2479//                                                     //
2480// VTS set                                             //
2481//                                                     //
2482/////////////////////////////////////////////////////////
2483
2484static WordFM* /* WordFM VTS* void */ vts_set = NULL;
2485
2486static void vts_set_init ( void )
2487{
2488   tl_assert(!vts_set);
2489   vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
2490                         HG_(free),
2491                         (Word(*)(UWord,UWord))VTS__cmp_structural );
2492   tl_assert(vts_set);
2493}
2494
2495/* Given a VTS, look in vts_set to see if we already have a
2496   structurally identical one.  If yes, return the pair (True, pointer
2497   to the existing one).  If no, clone this one, add the clone to the
2498   set, and return (False, pointer to the clone). */
2499static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
2500{
2501   UWord keyW, valW;
2502   stats__vts_set__focaa++;
2503   tl_assert(cand->id == VtsID_INVALID);
2504   /* lookup cand (by value) */
2505   if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
2506      /* found it */
2507      tl_assert(valW == 0);
2508      /* if this fails, cand (by ref) was already present (!) */
2509      tl_assert(keyW != (UWord)cand);
2510      *res = (VTS*)keyW;
2511      return True;
2512   } else {
2513      /* not present.  Clone, add and return address of clone. */
2514      stats__vts_set__focaa_a++;
2515      VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
2516      tl_assert(clone != cand);
2517      VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
2518      *res = clone;
2519      return False;
2520   }
2521}
2522
2523
2524/////////////////////////////////////////////////////////
2525//                                                     //
2526// VTS table                                           //
2527//                                                     //
2528/////////////////////////////////////////////////////////
2529
2530static void VtsID__invalidate_caches ( void ); /* fwds */
2531
2532/* A type to hold VTS table entries.  Invariants:
2533   If .vts == NULL, then this entry is not in use, so:
2534   - .rc == 0
2535   - this entry is on the freelist (unfortunately, does not imply
2536     any constraints on value for .freelink)
2537   If .vts != NULL, then this entry is in use:
2538   - .vts is findable in vts_set
2539   - .vts->id == this entry number
2540   - no specific value for .rc (even 0 is OK)
2541   - this entry is not on freelist, so .freelink == VtsID_INVALID
2542*/
2543typedef
2544   struct {
2545      VTS*  vts;      /* vts, in vts_set */
2546      UWord rc;       /* reference count - enough for entire aspace */
2547      VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
2548      VtsID remap;    /* used only during pruning */
2549   }
2550   VtsTE;
2551
2552/* The VTS table. */
2553static XArray* /* of VtsTE */ vts_tab = NULL;
2554
2555/* An index into the VTS table, indicating the start of the list of
2556   free (available for use) entries.  If the list is empty, this is
2557   VtsID_INVALID. */
2558static VtsID vts_tab_freelist = VtsID_INVALID;
2559
2560/* Do a GC of vts_tab when the freelist becomes empty AND the size of
2561   vts_tab equals or exceeds this size.  After GC, the value here is
2562   set appropriately so as to check for the next GC point. */
2563static Word vts_next_GC_at = 1000;
2564
2565static void vts_tab_init ( void )
2566{
2567   vts_tab
2568      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
2569                    HG_(free), sizeof(VtsTE) );
2570   vts_tab_freelist
2571      = VtsID_INVALID;
2572   tl_assert(vts_tab);
2573}
2574
2575/* Add ii to the free list, checking that it looks out-of-use. */
2576static void add_to_free_list ( VtsID ii )
2577{
2578   VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2579   tl_assert(ie->vts == NULL);
2580   tl_assert(ie->rc == 0);
2581   tl_assert(ie->freelink == VtsID_INVALID);
2582   ie->freelink = vts_tab_freelist;
2583   vts_tab_freelist = ii;
2584}
2585
2586/* Get an entry from the free list.  This will return VtsID_INVALID if
2587   the free list is empty. */
2588static VtsID get_from_free_list ( void )
2589{
2590   VtsID  ii;
2591   VtsTE* ie;
2592   if (vts_tab_freelist == VtsID_INVALID)
2593      return VtsID_INVALID;
2594   ii = vts_tab_freelist;
2595   ie = VG_(indexXA)( vts_tab, ii );
2596   tl_assert(ie->vts == NULL);
2597   tl_assert(ie->rc == 0);
2598   vts_tab_freelist = ie->freelink;
2599   return ii;
2600}
2601
2602/* Produce a new VtsID that can be used, either by getting it from
2603   the freelist, or, if that is empty, by expanding vts_tab. */
2604static VtsID get_new_VtsID ( void )
2605{
2606   VtsID ii;
2607   VtsTE te;
2608   ii = get_from_free_list();
2609   if (ii != VtsID_INVALID)
2610      return ii;
2611   te.vts = NULL;
2612   te.rc = 0;
2613   te.freelink = VtsID_INVALID;
2614   te.remap    = VtsID_INVALID;
2615   ii = (VtsID)VG_(addToXA)( vts_tab, &te );
2616   return ii;
2617}
2618
2619
2620/* Indirect callback from lib_zsm. */
2621static void VtsID__rcinc ( VtsID ii )
2622{
2623   VtsTE* ie;
2624   /* VG_(indexXA) does a range check for us */
2625   ie = VG_(indexXA)( vts_tab, ii );
2626   tl_assert(ie->vts); /* else it's not in use */
2627   tl_assert(ie->rc < ~0UL); /* else we can't continue */
2628   tl_assert(ie->vts->id == ii);
2629   ie->rc++;
2630}
2631
2632/* Indirect callback from lib_zsm. */
2633static void VtsID__rcdec ( VtsID ii )
2634{
2635   VtsTE* ie;
2636   /* VG_(indexXA) does a range check for us */
2637   ie = VG_(indexXA)( vts_tab, ii );
2638   tl_assert(ie->vts); /* else it's not in use */
2639   tl_assert(ie->rc > 0); /* else RC snafu */
2640   tl_assert(ie->vts->id == ii);
2641   ie->rc--;
2642}
2643
2644
2645/* Look up 'cand' in our collection of VTSs.  If present, return the
2646   VtsID for the pre-existing version.  If not present, clone it, add
2647   the clone to both vts_tab and vts_set, allocate a fresh VtsID for
2648   it, and return that. */
2649static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
2650{
2651   VTS* in_tab = NULL;
2652   tl_assert(cand->id == VtsID_INVALID);
2653   Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
2654   tl_assert(in_tab);
2655   if (already_have) {
2656      /* We already have a copy of 'cand'.  Use that. */
2657      VtsTE* ie;
2658      tl_assert(in_tab->id != VtsID_INVALID);
2659      ie = VG_(indexXA)( vts_tab, in_tab->id );
2660      tl_assert(ie->vts == in_tab);
2661      return in_tab->id;
2662   } else {
2663      VtsID  ii = get_new_VtsID();
2664      VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2665      ie->vts = in_tab;
2666      ie->rc = 0;
2667      ie->freelink = VtsID_INVALID;
2668      in_tab->id = ii;
2669      return ii;
2670   }
2671}
2672
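/* A guarded-out sketch, illustrative only, of how a caller might
   intern a freshly built VTS and take a reference on the result.
   'cand' is assumed to be a caller-owned scratch VTS whose .id is
   still VtsID_INVALID; the table stores its own clone, so 'cand'
   remains the caller's to reuse or delete afterwards. */
#if 0
static VtsID example_intern_and_ref ( VTS* cand )
{
   VtsID vi = vts_tab__find__or__clone_and_add( cand );
   VtsID__rcinc( vi );  /* the caller now holds one reference */
   return vi;
}
#endif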
2673
2674static void show_vts_stats ( HChar* caller )
2675{
2676   UWord nSet, nTab, nLive;
2677   ULong totrc;
2678   UWord n, i;
2679   nSet = VG_(sizeFM)( vts_set );
2680   nTab = VG_(sizeXA)( vts_tab );
2681   totrc = 0;
2682   nLive = 0;
2683   n = VG_(sizeXA)( vts_tab );
2684   for (i = 0; i < n; i++) {
2685      VtsTE* ie = VG_(indexXA)( vts_tab, i );
2686      if (ie->vts) {
2687         nLive++;
2688         totrc += (ULong)ie->rc;
2689      } else {
2690         tl_assert(ie->rc == 0);
2691      }
2692   }
2693   VG_(printf)("  show_vts_stats %s\n", caller);
2694   VG_(printf)("    vts_tab size %4lu\n", nTab);
2695   VG_(printf)("    vts_tab live %4lu\n", nLive);
2696   VG_(printf)("    vts_set size %4lu\n", nSet);
2697   VG_(printf)("        total rc %4llu\n", totrc);
2698}
2699
2700
2701/* --- Helpers for VtsID pruning --- */
2702
2703static
2704void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2705                   /*MOD*/XArray* /* of VtsTE */ new_tab,
2706                   VtsID* ii )
2707{
2708   VtsTE *old_te, *new_te;
2709   VtsID old_id, new_id;
2710   /* We're relying here on VG_(indexXA)'s range checking to assert on
2711      any stupid values, in particular *ii == VtsID_INVALID. */
2712   old_id = *ii;
2713   old_te = VG_(indexXA)( old_tab, old_id );
2714   old_te->rc--;
2715   new_id = old_te->remap;
2716   new_te = VG_(indexXA)( new_tab, new_id );
2717   new_te->rc++;
2718   *ii = new_id;
2719}
2720
2721static
2722void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2723                            /*MOD*/XArray* /* of VtsTE */ new_tab,
2724                            SVal* s )
2725{
2726   SVal old_sv, new_sv;
2727   old_sv = *s;
2728   if (SVal__isC(old_sv)) {
2729      VtsID rMin, wMin;
2730      rMin = SVal__unC_Rmin(old_sv);
2731      wMin = SVal__unC_Wmin(old_sv);
2732      remap_VtsID( old_tab, new_tab, &rMin );
2733      remap_VtsID( old_tab, new_tab, &wMin );
2734      new_sv = SVal__mkC( rMin, wMin );
2735      *s = new_sv;
2736   }
2737}
2738
2739
2740/* NOT TO BE CALLED FROM WITHIN libzsm. */
2741__attribute__((noinline))
2742static void vts_tab__do_GC ( Bool show_stats )
2743{
2744   UWord i, nTab, nLive, nFreed;
2745
2746   /* ---------- BEGIN VTS GC ---------- */
2747   /* check this is actually necessary. */
2748   tl_assert(vts_tab_freelist == VtsID_INVALID);
2749
2750   /* empty the caches for partial order checks and binary joins.  We
2751      could do better and prune out the entries to be deleted, but it
2752      ain't worth the hassle. */
2753   VtsID__invalidate_caches();
2754
2755   /* First, make the reference counts up to date. */
2756   zsm_flush_cache();
2757
2758   nTab = VG_(sizeXA)( vts_tab );
2759
2760   if (show_stats) {
2761      VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
2762      show_vts_stats("before GC");
2763   }
2764
2765   /* Now we can inspect the entire vts_tab.  Any entries with zero
2766      .rc fields are now no longer in use and can be put back on the
2767      free list, removed from vts_set, and deleted. */
2768   nFreed = 0;
2769   for (i = 0; i < nTab; i++) {
2770      Bool present;
2771      UWord oldK = 0, oldV = 12345;
2772      VtsTE* te = VG_(indexXA)( vts_tab, i );
2773      if (te->vts == NULL) {
2774         tl_assert(te->rc == 0);
2775         continue; /* already on the free list (presumably) */
2776      }
2777      if (te->rc > 0)
2778         continue; /* in use */
2779      /* Ok, we got one we can free. */
2780      tl_assert(te->vts->id == i);
2781      /* first, remove it from vts_set. */
2782      present = VG_(delFromFM)( vts_set,
2783                                &oldK, &oldV, (UWord)te->vts );
2784      tl_assert(present); /* else it isn't in vts_set ?! */
2785      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
2786      tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
2787      /* now free the VTS itself */
2788      VTS__delete(te->vts);
2789      te->vts = NULL;
2790      /* and finally put this entry on the free list */
2791      tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
2792      add_to_free_list( i );
2793      nFreed++;
2794   }
2795
2796   /* Now figure out when the next GC should be.  We'll allow the
2797      number of VTSs to double before GCing again.  Except of course
2798      that since we can't (or, at least, don't) shrink vts_tab, we
2799      can't set the threshold value smaller than it. */
2800   tl_assert(nFreed <= nTab);
2801   nLive = nTab - nFreed;
2802   tl_assert(nLive >= 0 && nLive <= nTab);
2803   vts_next_GC_at = 2 * nLive;
2804   if (vts_next_GC_at < nTab)
2805      vts_next_GC_at = nTab;
2806
2807   if (show_stats) {
2808      show_vts_stats("after GC");
2809      VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
2810   }
2811
2812   if (VG_(clo_stats)) {
2813      static UInt ctr = 1;
2814      tl_assert(nTab > 0);
2815      VG_(message)(Vg_DebugMsg,
2816                  "libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
2817                  ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
2818   }
2819   /* ---------- END VTS GC ---------- */
2820
2821   /* Decide whether to do VTS pruning.  We have one of three
2822      settings. */
2823   static UInt pruning_auto_ctr = 0; /* do not make non-static */
2824
2825   Bool do_pruning = False;
2826   switch (HG_(clo_vts_pruning)) {
2827      case 0: /* never */
2828         break;
2829      case 1: /* auto */
2830         do_pruning = (++pruning_auto_ctr % 5) == 0;
2831         break;
2832      case 2: /* always */
2833         do_pruning = True;
2834         break;
2835      default:
2836         tl_assert(0);
2837   }
2838
2839   /* The rest of this routine only handles pruning, so we can
2840      quit at this point if it is not to be done. */
2841   if (!do_pruning)
2842      return;
2843
2844   /* ---------- BEGIN VTS PRUNING ---------- */
2845   /* We begin by sorting the backing table on its .thr values, so as
2846      to (1) check they are unique [else something has gone wrong,
2847      since it means we must have seen some Thr* exiting more than
2848      once, which can't happen], and (2) to let us quickly look
2849      up the dead-thread entries as we work through the VTSs. */
2850   VG_(sortXA)( verydead_thread_table );
2851   /* Sanity check: check for unique ThrID values. */
2852   UWord nBT = VG_(sizeXA)( verydead_thread_table );
2853   if (nBT > 0) {
2854      ThrID thrid1, thrid2;
2855      thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
2856      for (i = 1; i < nBT; i++) {
2857         thrid1 = thrid2;
2858         thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
2859         tl_assert(thrid1 < thrid2);
2860      }
2861   }
2862   /* Ok, so the dead thread table has unique and in-order keys. */
2863
2864   /* We will run through the old table, and create a new table and
2865      set, at the same time setting the .remap entries in the old
2866      table to point to the new entries.  Then, visit every VtsID in
2867      the system, and replace all of them with new ones, using the
2868      .remap entries in the old table.  Finally, we can delete the old
2869      table and set. */
2870
2871   XArray* /* of VtsTE */ new_tab
2872      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
2873                    HG_(free), sizeof(VtsTE) );
2874
2875   /* WordFM VTS* void */
2876   WordFM* new_set
2877      = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
2878                    HG_(free),
2879                    (Word(*)(UWord,UWord))VTS__cmp_structural );
2880
2881   /* Visit each old VTS.  For each one:
2882
2883      * make a pruned version
2884
2885      * search new_set for the pruned version, yielding either
2886        Nothing (not present) or the new VtsID for it.
2887
2888      * if not present, allocate a new VtsID for it, insert (pruned
2889        VTS, new VtsID) in the tree, and set
2890        remap_table[old VtsID] = new VtsID.
2891
2892      * if present, set remap_table[old VtsID] = new VtsID, where
2893        new VtsID was determined by the tree lookup.  Then free up
2894        the clone.
2895   */
2896
2897   UWord nBeforePruning = 0, nAfterPruning = 0;
2898   UWord nSTSsBefore = 0, nSTSsAfter = 0;
2899   VtsID new_VtsID_ctr = 0;
2900
2901   for (i = 0; i < nTab; i++) {
2902
2903      /* For each old VTS .. */
2904      VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
2905      VTS*   old_vts = old_te->vts;
2906      tl_assert(old_te->remap == VtsID_INVALID);
2907
2908      /* Skip it if not in use */
2909      if (old_te->rc == 0) {
2910         tl_assert(old_vts == NULL);
2911         continue;
2912      }
2913      tl_assert(old_vts != NULL);
2914      tl_assert(old_vts->id == i);
2915      tl_assert(old_vts->ts != NULL);
2916
2917      /* It is in use. Make a pruned version. */
2918      nBeforePruning++;
2919      nSTSsBefore += old_vts->usedTS;
2920      VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
2921                                   old_vts, verydead_thread_table);
2922      tl_assert(new_vts->sizeTS == new_vts->usedTS);
2923      tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
2924                == 0x0ddC0ffeeBadF00dULL);
2925
2926      /* Get rid of the old VTS and the tree entry.  It's a bit more
2927         complex to incrementally delete the VTSs now than to nuke
2928         them all after we're done, but the upside is that we don't
2929         wind up temporarily storing potentially two complete copies
2930         of each VTS and hence spiking memory use. */
2931      UWord oldK = 0, oldV = 12345;
2932      Bool  present = VG_(delFromFM)( vts_set,
2933                                      &oldK, &oldV, (UWord)old_vts );
2934      tl_assert(present); /* else it isn't in vts_set ?! */
2935      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
2936      tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
2937      /* now free the VTS itself */
2938      VTS__delete(old_vts);
2939      old_te->vts = NULL;
2940      old_vts = NULL;
2941
2942      /* NO MENTIONS of old_vts allowed beyond this point. */
2943
2944      /* Ok, we have the pruned copy in new_vts.  See if a
2945         structurally identical version is already present in new_set.
2946         If so, delete the one we just made and move on; if not, add
2947         it. */
2948      VTS*  identical_version = NULL;
2949      UWord valW = 12345;
2950      if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
2951                        (UWord)new_vts)) {
2952         // already have it
2953         tl_assert(valW == 0);
2954         tl_assert(identical_version != NULL);
2955         tl_assert(identical_version != new_vts);
2956         VTS__delete(new_vts);
2957         new_vts = identical_version;
2958         tl_assert(new_vts->id != VtsID_INVALID);
2959      } else {
2960         tl_assert(valW == 12345);
2961         tl_assert(identical_version == NULL);
2962         new_vts->id = new_VtsID_ctr++;
2963         Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
2964         tl_assert(!b);
2965         VtsTE new_te;
2966         new_te.vts      = new_vts;
2967         new_te.rc       = 0;
2968         new_te.freelink = VtsID_INVALID;
2969         new_te.remap    = VtsID_INVALID;
2970         Word j = VG_(addToXA)( new_tab, &new_te );
2971         tl_assert(j <= i);
2972         tl_assert(j == new_VtsID_ctr - 1);
2973         // stats
2974         nAfterPruning++;
2975         nSTSsAfter += new_vts->usedTS;
2976      }
2977      old_te->remap = new_vts->id;
2978
2979   } /* for (i = 0; i < nTab; i++) */
2980
2981   /* At this point, we have:
2982      * the old VTS table, with its .remap entries set,
2983        and with all .vts == NULL.
2984      * the old VTS tree should be empty, since it and the old VTSs
2985        it contained have been incrementally deleted as we worked
2986        through the old table.
2987      * the new VTS table, with all .rc == 0, all .freelink and .remap
2988        == VtsID_INVALID.
2989      * the new VTS tree.
2990   */
2991   tl_assert( VG_(sizeFM)(vts_set) == 0 );
2992
2993   /* Now actually apply the mapping. */
2994   /* Visit all the VtsIDs in the entire system.  Where do we expect
2995      to find them?
2996      (a) in shadow memory -- the LineZs and LineFs
2997      (b) in our collection of struct _Thrs.
2998      (c) in our collection of struct _SOs.
2999      Nowhere else, AFAICS.  Not in the zsm cache, because that just
3000      got invalidated.
3001
3002      Using the .remap fields in vts_tab, map each old VtsID to a new
3003      VtsID.  For each old VtsID, dec its rc; and for each new one,
3004      inc it.  This sets up the new refcounts, and it also gives a
3005      cheap sanity check of the old ones: all old refcounts should be
3006      zero after this operation.
3007   */
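
   /* Concretely (illustrative only): if the old table entry at index
      17 ends up with .remap == 3, then every VtsID 17 encountered in
      (a), (b) or (c) is rewritten to 3; per the scheme above,
      remap_VtsID is expected to decrement vts_tab[17].rc and
      increment new_tab[3].rc, so that by the end the old refcounts
      have all drained to zero and the new ones are correct. */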
3008
3009   /* Do the mappings for (a) above: iterate over the Primary shadow
3010      mem map (WordFM Addr SecMap*). */
3011   UWord secmapW = 0;
3012   VG_(initIterFM)( map_shmem );
3013   while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
3014      UWord   j;
3015      SecMap* sm = (SecMap*)secmapW;
3016      tl_assert(sm->magic == SecMap_MAGIC);
3017      /* Deal with the LineZs */
3018      for (i = 0; i < N_SECMAP_ZLINES; i++) {
3019         LineZ* lineZ = &sm->linesZ[i];
3020         if (lineZ->dict[0] == SVal_INVALID)
3021            continue; /* not in use -- data is in F rep instead */
3022         for (j = 0; j < 4; j++)
3023            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
3024      }
3025      /* Deal with the LineFs */
3026      for (i = 0; i < sm->linesF_size; i++) {
3027         LineF* lineF = &sm->linesF[i];
3028         if (!lineF->inUse)
3029            continue;
3030         for (j = 0; j < N_LINE_ARANGE; j++)
3031            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
3032      }
3033   }
3034   VG_(doneIterFM)( map_shmem );
3035
3036   /* Do the mappings for (b) above: visit our collection of struct
3037      _Thrs. */
3038   Thread* hgthread = get_admin_threads();
3039   tl_assert(hgthread);
3040   while (hgthread) {
3041      Thr* hbthr = hgthread->hbthr;
3042      tl_assert(hbthr);
3043      /* Threads that are listed in the prunable set have their viR
3044         and viW set to VtsID_INVALID, so we can't mess with them. */
3045      if (hbthr->llexit_done && hbthr->joinedwith_done) {
3046         tl_assert(hbthr->viR == VtsID_INVALID);
3047         tl_assert(hbthr->viW == VtsID_INVALID);
3048         hgthread = hgthread->admin;
3049         continue;
3050      }
3051      remap_VtsID( vts_tab, new_tab, &hbthr->viR );
3052      remap_VtsID( vts_tab, new_tab, &hbthr->viW );
3053      hgthread = hgthread->admin;
3054   }
3055
3056   /* Do the mappings for (c) above: visit the struct _SOs. */
3057   SO* so = admin_SO;
3058   while (so) {
3059      if (so->viR != VtsID_INVALID)
3060         remap_VtsID( vts_tab, new_tab, &so->viR );
3061      if (so->viW != VtsID_INVALID)
3062         remap_VtsID( vts_tab, new_tab, &so->viW );
3063      so = so->admin_next;
3064   }
3065
   /* So, we're nearly done (with this incredibly complex operation).
      Check that the refcounts for the old VtsIDs have all fallen to
      zero, as expected.  Any failure is serious. */
3069   for (i = 0; i < nTab; i++) {
3070      VtsTE* te = VG_(indexXA)( vts_tab, i );
3071      tl_assert(te->vts == NULL);
3072      /* This is the assert proper.  Note we're also asserting
3073         zeroness for old entries which are unmapped (hence have
3074         .remap == VtsID_INVALID).  That's OK. */
3075      tl_assert(te->rc == 0);
3076   }
3077
3078   /* Install the new table and set. */
3079   VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
3080   vts_set = new_set;
3081   VG_(deleteXA)( vts_tab );
3082   vts_tab = new_tab;
3083
3084   /* The freelist of vts_tab entries is empty now, because we've
3085      compacted all of the live entries at the low end of the
3086      table. */
3087   vts_tab_freelist = VtsID_INVALID;
3088
3089   /* Sanity check vts_set and vts_tab. */
3090
3091   /* Because all the live entries got slid down to the bottom of vts_tab: */
3092   tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
3093
3094   /* Assert that the vts_tab and vts_set entries point at each other
3095      in the required way */
3096   UWord wordK = 0, wordV = 0;
3097   VG_(initIterFM)( vts_set );
3098   while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
3099      tl_assert(wordK != 0);
3100      tl_assert(wordV == 0);
3101      VTS* vts = (VTS*)wordK;
3102      tl_assert(vts->id != VtsID_INVALID);
3103      VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
3104      tl_assert(te->vts == vts);
3105   }
3106   VG_(doneIterFM)( vts_set );
3107
3108   /* Also iterate over the table, and check each entry is
3109      plausible. */
3110   nTab = VG_(sizeXA)( vts_tab );
3111   for (i = 0; i < nTab; i++) {
3112      VtsTE* te = VG_(indexXA)( vts_tab, i );
3113      tl_assert(te->vts);
3114      tl_assert(te->vts->id == i);
3115      tl_assert(te->rc > 0); /* 'cos we just GC'd */
3116      tl_assert(te->freelink == VtsID_INVALID); /* in use */
3117      tl_assert(te->remap == VtsID_INVALID); /* not relevant */
3118   }
3119
3120   /* And we're done.  Bwahahaha. Ha. Ha. Ha. */
3121   if (VG_(clo_stats)) {
3122      static UInt ctr = 1;
3123      tl_assert(nTab > 0);
3124      VG_(message)(
3125         Vg_DebugMsg,
3126         "libhb: VTS PR: #%u  before %lu (avg sz %lu)  "
3127            "after %lu (avg sz %lu)\n",
3128         ctr++,
3129         nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
3130         nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
3131      );
3132   }
3133   if (0)
3134   VG_(printf)("VTQ: before pruning %lu (avg sz %lu), "
3135               "after pruning %lu (avg sz %lu)\n",
3136               nBeforePruning, nSTSsBefore / nBeforePruning,
3137               nAfterPruning, nSTSsAfter / nAfterPruning);
3138   /* ---------- END VTS PRUNING ---------- */
3139}
3140
3141
3142/////////////////////////////////////////////////////////
3143//                                                     //
3144// Vts IDs                                             //
3145//                                                     //
3146/////////////////////////////////////////////////////////
3147
3148//////////////////////////
/* A max-sized scratch VTS, used as the first (result) argument in
   VTS__singleton, VTS__tick and VTS__join operations. */
3151static VTS* temp_max_sized_VTS = NULL;
3152
3153//////////////////////////
3154static ULong stats__cmpLEQ_queries = 0;
3155static ULong stats__cmpLEQ_misses  = 0;
3156static ULong stats__join2_queries  = 0;
3157static ULong stats__join2_misses   = 0;
3158
3159static inline UInt ROL32 ( UInt w, Int n ) {
3160   w = (w << n) | (w >> (32-n));
3161   return w;
3162}
3163static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
3164   UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
3165   return hash % nTab;
3166}
3167
3168#define N_CMPLEQ_CACHE 1023
3169static
3170   struct { VtsID vi1; VtsID vi2; Bool leq; }
3171   cmpLEQ_cache[N_CMPLEQ_CACHE];
3172
3173#define N_JOIN2_CACHE 1023
3174static
3175   struct { VtsID vi1; VtsID vi2; VtsID res; }
3176   join2_cache[N_JOIN2_CACHE];
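
/* Both caches are small direct-mapped memo tables: a query on the
   pair (vi1,vi2) is hashed to a single slot, and a hit requires both
   stored IDs in that slot to match exactly.  For example (purely
   illustrative), a query on (5,9) indexes slot
   (ROL32(5,19) ^ ROL32(9,13)) % 1023, and a collision simply
   overwrites whatever that slot held before.  VtsID__invalidate_caches
   below presumably has to be called whenever VtsIDs are renumbered
   (as in the pruning pass above), since the cached IDs would
   otherwise be stale. */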
3177
3178static void VtsID__invalidate_caches ( void ) {
3179   Int i;
3180   for (i = 0; i < N_CMPLEQ_CACHE; i++) {
3181      cmpLEQ_cache[i].vi1 = VtsID_INVALID;
3182      cmpLEQ_cache[i].vi2 = VtsID_INVALID;
3183      cmpLEQ_cache[i].leq = False;
3184   }
3185   for (i = 0; i < N_JOIN2_CACHE; i++) {
      join2_cache[i].vi1 = VtsID_INVALID;
      join2_cache[i].vi2 = VtsID_INVALID;
      join2_cache[i].res = VtsID_INVALID;
3189   }
3190}
3191//////////////////////////
3192
3193//static Bool VtsID__is_valid ( VtsID vi ) {
3194//   VtsTE* ve;
3195//   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
3196//      return False;
3197//   ve = VG_(indexXA)( vts_tab, vi );
3198//   if (!ve->vts)
3199//      return False;
3200//   tl_assert(ve->vts->id == vi);
3201//   return True;
3202//}
3203
3204static VTS* VtsID__to_VTS ( VtsID vi ) {
3205   VtsTE* te = VG_(indexXA)( vts_tab, vi );
3206   tl_assert(te->vts);
3207   return te->vts;
3208}
3209
3210static void VtsID__pp ( VtsID vi ) {
3211   HChar buf[100];
3212   VTS* vts = VtsID__to_VTS(vi);
3213   VTS__show( buf, sizeof(buf)-1, vts );
3214   buf[sizeof(buf)-1] = 0;
3215   VG_(printf)("%s", buf);
3216}
3217
3218/* compute partial ordering relation of vi1 and vi2. */
3219__attribute__((noinline))
3220static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
3221   UInt hash;
3222   Bool leq;
3223   VTS  *v1, *v2;
3224   //if (vi1 == vi2) return True;
3225   tl_assert(vi1 != vi2);
3226   ////++
3227   stats__cmpLEQ_queries++;
3228   hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
3229   if (cmpLEQ_cache[hash].vi1 == vi1
3230       && cmpLEQ_cache[hash].vi2 == vi2)
3231      return cmpLEQ_cache[hash].leq;
3232   stats__cmpLEQ_misses++;
3233   ////--
3234   v1  = VtsID__to_VTS(vi1);
3235   v2  = VtsID__to_VTS(vi2);
3236   leq = VTS__cmpLEQ( v1, v2 ) == 0;
3237   ////++
3238   cmpLEQ_cache[hash].vi1 = vi1;
3239   cmpLEQ_cache[hash].vi2 = vi2;
3240   cmpLEQ_cache[hash].leq = leq;
3241   ////--
3242   return leq;
3243}
3244static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
3245   return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
3246}
3247
3248/* compute binary join */
3249__attribute__((noinline))
3250static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
3251   UInt  hash;
3252   VtsID res;
3253   VTS   *vts1, *vts2;
3254   //if (vi1 == vi2) return vi1;
3255   tl_assert(vi1 != vi2);
3256   ////++
3257   stats__join2_queries++;
3258   hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
3259   if (join2_cache[hash].vi1 == vi1
3260       && join2_cache[hash].vi2 == vi2)
3261      return join2_cache[hash].res;
3262   stats__join2_misses++;
3263   ////--
3264   vts1 = VtsID__to_VTS(vi1);
3265   vts2 = VtsID__to_VTS(vi2);
3266   temp_max_sized_VTS->usedTS = 0;
3267   VTS__join(temp_max_sized_VTS, vts1,vts2);
3268   res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3269   ////++
3270   join2_cache[hash].vi1 = vi1;
3271   join2_cache[hash].vi2 = vi2;
3272   join2_cache[hash].res = res;
3273   ////--
3274   return res;
3275}
3276static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
3277   return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
3278}
3279
3280/* create a singleton VTS, namely [thr:1] */
3281static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
3282   temp_max_sized_VTS->usedTS = 0;
3283   VTS__singleton(temp_max_sized_VTS, thr,tym);
3284   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3285}
3286
3287/* tick operation, creates value 1 if specified index is absent */
3288static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
3289   VTS* vts = VtsID__to_VTS(vi);
3290   temp_max_sized_VTS->usedTS = 0;
3291   VTS__tick(temp_max_sized_VTS, idx,vts);
3292   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3293}
3294
3295/* index into a VTS (only for assertions) */
3296static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
3297   VTS* vts = VtsID__to_VTS(vi);
3298   return VTS__indexAt_SLOW( vts, idx );
3299}
3300
/* Assuming that !cmpLEQ(vi1, vi2), find the thread whose element in
   vi1 is pointwise greater-than the corresponding element in vi2 (the
   first such thread, or any, really).  Given that assumption, such an
   element must exist, so we assert we found one rather than returning
   NULL.  This needs to be fairly quick since it is called every time
   a race is detected. */
3306static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
3307{
3308   VTS  *vts1, *vts2;
3309   Thr*  diffthr;
3310   ThrID diffthrid;
3311   tl_assert(vi1 != vi2);
3312   vts1 = VtsID__to_VTS(vi1);
3313   vts2 = VtsID__to_VTS(vi2);
3314   tl_assert(vts1 != vts2);
3315   diffthrid = VTS__cmpLEQ(vts1, vts2);
3316   diffthr = Thr__from_ThrID(diffthrid);
3317   tl_assert(diffthr); /* else they are LEQ ! */
3318   return diffthr;
3319}
3320
3321
3322/////////////////////////////////////////////////////////
3323//                                                     //
3324// Filters                                             //
3325//                                                     //
3326/////////////////////////////////////////////////////////
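
/* A note on the filter representation, as far as it can be inferred
   from the handlers below: each filter line covers a FI_LINE_SZB-byte
   chunk of address space and carries one UShort per aligned 8-byte
   word, i.e. 2 state bits per byte.  For byte k (0..7) of a word, bit
   2k+1 appears to mean "a read of this byte is already covered" and
   bit 2k "a write is already covered".  Hence a 1-byte read probes
   and sets mask 0x2 << 2k, a 1-byte write probes and sets 0x3 << 2k,
   and the whole-word variants use 0xAAAA (all R bits) and 0xFFFF (all
   R and W bits) respectively. */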
3327
3328/* Forget everything we know -- clear the filter and let everything
3329   through.  This needs to be as fast as possible, since it is called
3330   every time the running thread changes, and every time a thread's
3331   vector clocks change, which can be quite frequent.  The obvious
3332   fast way to do this is simply to stuff in tags which we know are
3333   not going to match anything, since they're not aligned to the start
3334   of a line. */
3335static void Filter__clear ( Filter* fi, HChar* who )
3336{
3337   UWord i;
3338   if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
3339   for (i = 0; i < FI_NUM_LINES; i += 8) {
3340      fi->tags[i+0] = 1; /* impossible value -- cannot match */
3341      fi->tags[i+1] = 1;
3342      fi->tags[i+2] = 1;
3343      fi->tags[i+3] = 1;
3344      fi->tags[i+4] = 1;
3345      fi->tags[i+5] = 1;
3346      fi->tags[i+6] = 1;
3347      fi->tags[i+7] = 1;
3348   }
3349   tl_assert(i == FI_NUM_LINES);
3350}
3351
3352/* Clearing an arbitrary range in the filter.  Unfortunately
3353   we have to do this due to core-supplied new/die-mem events. */
3354
3355static void Filter__clear_1byte ( Filter* fi, Addr a )
3356{
3357   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3358   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3359   FiLine* line   = &fi->lines[lineno];
3360   UWord   loff   = (a - atag) / 8;
3361   UShort  mask   = 0x3 << (2 * (a & 7));
3362   /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3363   if (LIKELY( fi->tags[lineno] == atag )) {
3364      /* hit.  clear the bits. */
3365      UShort  u16  = line->u16s[loff];
3366      line->u16s[loff] = u16 & ~mask; /* clear them */
3367   } else {
3368      /* miss.  The filter doesn't hold this address, so ignore. */
3369   }
3370}
3371
3372static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
3373{
3374   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3375   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3376   FiLine* line   = &fi->lines[lineno];
3377   UWord   loff   = (a - atag) / 8;
3378   if (LIKELY( fi->tags[lineno] == atag )) {
3379      line->u16s[loff] = 0;
3380   } else {
      /* miss.  The filter doesn't hold this address, so ignore. */
3382   }
3383}
3384
3385static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
3386{
3387  //VG_(printf)("%lu ", len);
3388   /* slowly do part preceding 8-alignment */
3389   while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
3390      Filter__clear_1byte( fi, a );
3391      a++;
3392      len--;
3393   }
3394   /* vector loop */
3395   while (len >= 8) {
3396      Filter__clear_8bytes_aligned( fi, a );
3397      a += 8;
3398      len -= 8;
3399   }
3400   /* slowly do tail */
3401   while (UNLIKELY(len > 0)) {
3402      Filter__clear_1byte( fi, a );
3403      a++;
3404      len--;
3405   }
3406}
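
/* Worked example (illustrative only): Filter__clear_range(fi, 0x1003,
   20) clears the five bytes 0x1003..0x1007 one at a time, then clears
   the aligned 8-byte word at 0x1008 in one go, and finally clears the
   remaining seven bytes 0x1010..0x1016 one at a time. */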
3407
3408
3409/* ------ Read handlers for the filter. ------ */
3410
3411static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
3412{
3413   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3414      return False;
3415   {
3416     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3417     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3418     FiLine* line   = &fi->lines[lineno];
3419     UWord   loff   = (a - atag) / 8;
3420     UShort  mask   = 0xAAAA;
3421     if (LIKELY( fi->tags[lineno] == atag )) {
3422        /* hit.  check line and update. */
3423        UShort u16  = line->u16s[loff];
3424        Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
3425        line->u16s[loff] = u16 | mask; /* set them */
3426        return ok;
3427     } else {
3428        /* miss.  nuke existing line and re-use it. */
3429        UWord i;
3430        fi->tags[lineno] = atag;
3431        for (i = 0; i < FI_LINE_SZB / 8; i++)
3432           line->u16s[i] = 0;
3433        line->u16s[loff] = mask;
3434        return False;
3435     }
3436   }
3437}
3438
3439static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
3440{
3441   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3442      return False;
3443   {
3444     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3445     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3446     FiLine* line   = &fi->lines[lineno];
3447     UWord   loff   = (a - atag) / 8;
3448     UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
3449     if (LIKELY( fi->tags[lineno] == atag )) {
3450        /* hit.  check line and update. */
3451        UShort  u16  = line->u16s[loff];
3452        Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
3453        line->u16s[loff] = u16 | mask; /* set them */
3454        return ok;
3455     } else {
3456        /* miss.  nuke existing line and re-use it. */
3457        UWord   i;
3458        fi->tags[lineno] = atag;
3459        for (i = 0; i < FI_LINE_SZB / 8; i++)
3460           line->u16s[i] = 0;
3461        line->u16s[loff] = mask;
3462        return False;
3463     }
3464   }
3465}
3466
3467static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
3468{
3469   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3470      return False;
3471   {
3472     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3473     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3474     FiLine* line   = &fi->lines[lineno];
3475     UWord   loff   = (a - atag) / 8;
3476     UShort  mask   = 0xA << (2 * (a & 6));
3477     /* mask is A000, 0A00, 00A0 or 000A */
3478     if (LIKELY( fi->tags[lineno] == atag )) {
3479        /* hit.  check line and update. */
3480        UShort  u16  = line->u16s[loff];
3481        Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
3482        line->u16s[loff] = u16 | mask; /* set them */
3483        return ok;
3484     } else {
3485        /* miss.  nuke existing line and re-use it. */
3486        UWord   i;
3487        fi->tags[lineno] = atag;
3488        for (i = 0; i < FI_LINE_SZB / 8; i++)
3489           line->u16s[i] = 0;
3490        line->u16s[loff] = mask;
3491        return False;
3492     }
3493   }
3494}
3495
3496static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
3497{
3498   {
3499     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3500     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3501     FiLine* line   = &fi->lines[lineno];
3502     UWord   loff   = (a - atag) / 8;
3503     UShort  mask   = 0x2 << (2 * (a & 7));
3504     /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
3505     if (LIKELY( fi->tags[lineno] == atag )) {
3506        /* hit.  check line and update. */
3507        UShort  u16  = line->u16s[loff];
3508        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
3509        line->u16s[loff] = u16 | mask; /* set them */
3510        return ok;
3511     } else {
3512        /* miss.  nuke existing line and re-use it. */
3513        UWord   i;
3514        fi->tags[lineno] = atag;
3515        for (i = 0; i < FI_LINE_SZB / 8; i++)
3516           line->u16s[i] = 0;
3517        line->u16s[loff] = mask;
3518        return False;
3519     }
3520   }
3521}
3522
3523
3524/* ------ Write handlers for the filter. ------ */
3525
3526static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
3527{
3528   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3529      return False;
3530   {
3531     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3532     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3533     FiLine* line   = &fi->lines[lineno];
3534     UWord   loff   = (a - atag) / 8;
3535     UShort  mask   = 0xFFFF;
3536     if (LIKELY( fi->tags[lineno] == atag )) {
3537        /* hit.  check line and update. */
3538        UShort u16  = line->u16s[loff];
3539        Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
3540        line->u16s[loff] = u16 | mask; /* set them */
3541        return ok;
3542     } else {
3543        /* miss.  nuke existing line and re-use it. */
3544        UWord i;
3545        fi->tags[lineno] = atag;
3546        for (i = 0; i < FI_LINE_SZB / 8; i++)
3547           line->u16s[i] = 0;
3548        line->u16s[loff] = mask;
3549        return False;
3550     }
3551   }
3552}
3553
3554static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
3555{
3556   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3557      return False;
3558   {
3559     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3560     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3561     FiLine* line   = &fi->lines[lineno];
3562     UWord   loff   = (a - atag) / 8;
3563     UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
3564     if (LIKELY( fi->tags[lineno] == atag )) {
3565        /* hit.  check line and update. */
3566        UShort  u16  = line->u16s[loff];
3567        Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
3568        line->u16s[loff] = u16 | mask; /* set them */
3569        return ok;
3570     } else {
3571        /* miss.  nuke existing line and re-use it. */
3572        UWord   i;
3573        fi->tags[lineno] = atag;
3574        for (i = 0; i < FI_LINE_SZB / 8; i++)
3575           line->u16s[i] = 0;
3576        line->u16s[loff] = mask;
3577        return False;
3578     }
3579   }
3580}
3581
3582static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
3583{
3584   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3585      return False;
3586   {
3587     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3588     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3589     FiLine* line   = &fi->lines[lineno];
3590     UWord   loff   = (a - atag) / 8;
3591     UShort  mask   = 0xF << (2 * (a & 6));
3592     /* mask is F000, 0F00, 00F0 or 000F */
3593     if (LIKELY( fi->tags[lineno] == atag )) {
3594        /* hit.  check line and update. */
3595        UShort  u16  = line->u16s[loff];
3596        Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
3597        line->u16s[loff] = u16 | mask; /* set them */
3598        return ok;
3599     } else {
3600        /* miss.  nuke existing line and re-use it. */
3601        UWord   i;
3602        fi->tags[lineno] = atag;
3603        for (i = 0; i < FI_LINE_SZB / 8; i++)
3604           line->u16s[i] = 0;
3605        line->u16s[loff] = mask;
3606        return False;
3607     }
3608   }
3609}
3610
3611static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
3612{
3613   {
3614     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3615     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3616     FiLine* line   = &fi->lines[lineno];
3617     UWord   loff   = (a - atag) / 8;
3618     UShort  mask   = 0x3 << (2 * (a & 7));
3619     /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3620     if (LIKELY( fi->tags[lineno] == atag )) {
3621        /* hit.  check line and update. */
3622        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 1 x R & W bits set? */
3624        line->u16s[loff] = u16 | mask; /* set them */
3625        return ok;
3626     } else {
3627        /* miss.  nuke existing line and re-use it. */
3628        UWord   i;
3629        fi->tags[lineno] = atag;
3630        for (i = 0; i < FI_LINE_SZB / 8; i++)
3631           line->u16s[i] = 0;
3632        line->u16s[loff] = mask;
3633        return False;
3634     }
3635   }
3636}
3637
3638
3639/////////////////////////////////////////////////////////
3640//                                                     //
3641// Threads                                             //
3642//                                                     //
3643/////////////////////////////////////////////////////////
3644
/* Maps ThrID values to their Thr*s (which contain ThrID values that
   should point back to the relevant slot in the array).  Lowest
   numbered slot (0) is for thrid = 1024, slot (1) is for 1025, etc. */
3648static XArray* /* of Thr* */ thrid_to_thr_map = NULL;
3649
3650/* And a counter to dole out ThrID values.  For rationale/background,
3651   see comments on definition of ScalarTS (far) above. */
3652static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
3653
3654static ThrID Thr__to_ThrID ( Thr* thr ) {
3655   return thr->thrid;
3656}
3657static Thr* Thr__from_ThrID ( UInt thrid ) {
3658   Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
3659   tl_assert(thr->thrid == thrid);
3660   return thr;
3661}
3662
3663static Thr* Thr__new ( void )
3664{
3665   Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
3666   thr->viR = VtsID_INVALID;
3667   thr->viW = VtsID_INVALID;
3668   thr->llexit_done = False;
3669   thr->joinedwith_done = False;
3670   thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
3671   /* We only really need this at history level 1, but unfortunately
3672      this routine is called before the command line processing is
3673      done (sigh), so we can't rely on HG_(clo_history_level) at this
3674      point.  Hence always allocate it.  Bah. */
3675   thr->local_Kws_n_stacks
3676      = VG_(newXA)( HG_(zalloc),
3677                    "libhb.Thr__new.3 (local_Kws_and_stacks)",
3678                    HG_(free), sizeof(ULong_n_EC) );
3679
3680   /* Add this Thr* <-> ThrID binding to the mapping, and
3681      cross-check */
3682   if (!thrid_to_thr_map) {
3683      thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
3684                                     HG_(free), sizeof(Thr*) );
3685      tl_assert(thrid_to_thr_map);
3686   }
3687
3688   if (thrid_counter >= ThrID_MAX_VALID) {
3689      /* We're hosed.  We have to stop. */
3690      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
3691   }
3692
3693   thr->thrid = thrid_counter++;
3694   Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
3695   tl_assert(ix + 1024 == thr->thrid);
3696
3697   return thr;
3698}
3699
3700static void note_local_Kw_n_stack_for ( Thr* thr )
3701{
3702   Word       nPresent;
3703   ULong_n_EC pair;
3704   tl_assert(thr);
3705
3706   // We only collect this info at history level 1 (approx)
3707   if (HG_(clo_history_level) != 1)
3708      return;
3709
3710   /* This is the scalar Kw for thr. */
3711   pair.ull = VtsID__indexAt( thr->viW, thr );
3712   pair.ec  = main_get_EC( thr );
3713   tl_assert(pair.ec);
3714   tl_assert(thr->local_Kws_n_stacks);
3715
3716   /* check that we're not adding duplicates */
3717   nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
3718
3719   /* Throw away old stacks, if necessary.  We can't accumulate stuff
3720      indefinitely. */
3721   if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
3722      VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
3723      nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
3724      if (0)
3725         VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
3726                     thr, pair.ull, pair.ec );
3727   }
3728
3729   if (nPresent > 0) {
3730      ULong_n_EC* prevPair
3731         = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
3732      tl_assert( prevPair->ull <= pair.ull );
3733   }
3734
3735   if (nPresent == 0)
3736      pair.ec = NULL;
3737
3738   VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
3739
3740   if (0)
3741      VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
3742                  thr, pair.ull, pair.ec );
3743   if (0)
3744      VG_(pp_ExeContext)(pair.ec);
3745}
3746
3747static Int cmp__ULong_n_EC__by_ULong ( ULong_n_EC* pair1, ULong_n_EC* pair2 )
3748{
3749   if (pair1->ull < pair2->ull) return -1;
3750   if (pair1->ull > pair2->ull) return 1;
3751   return 0;
3752}
3753
3754
3755/////////////////////////////////////////////////////////
3756//                                                     //
3757// Shadow Values                                       //
3758//                                                     //
3759/////////////////////////////////////////////////////////
3760
3761// type SVal, SVal_INVALID and SVal_NOACCESS are defined by
3762// hb_zsm.h.  We have to do everything else here.
3763
3764/* SVal is 64 bit unsigned int.
3765
3766      <---------30--------->    <---------30--------->
3767   00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
3768   10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
3769   11 0--------------------0 00 0--------------------0   A: SVal_INVALID
3770
3771*/
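
/* Illustrative example: SVal__mkC(5, 9) packs to ((ULong)5 << 32) | 9,
   whose top two bits are 00, so SVal__isC holds, and SVal__unC_Rmin /
   SVal__unC_Wmin recover 5 and 9 respectively.  This only works
   because valid VtsIDs are presumed to fit in 30 bits, leaving the
   tag bits of each half clear, as the diagram above indicates. */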
3772#define SVAL_TAGMASK (3ULL << 62)
3773
3774static inline Bool SVal__isC ( SVal s ) {
3775   return (0ULL << 62) == (s & SVAL_TAGMASK);
3776}
3777static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
3778   //tl_assert(VtsID__is_valid(rmini));
3779   //tl_assert(VtsID__is_valid(wmini));
3780   return (((ULong)rmini) << 32) | ((ULong)wmini);
3781}
3782static inline VtsID SVal__unC_Rmin ( SVal s ) {
3783   tl_assert(SVal__isC(s));
3784   return (VtsID)(s >> 32);
3785}
3786static inline VtsID SVal__unC_Wmin ( SVal s ) {
3787   tl_assert(SVal__isC(s));
3788   return (VtsID)(s & 0xFFFFFFFFULL);
3789}
3790
3791static inline Bool SVal__isA ( SVal s ) {
3792   return (2ULL << 62) == (s & SVAL_TAGMASK);
3793}
3794static inline SVal SVal__mkA ( void ) {
3795   return 2ULL << 62;
3796}
3797
3798/* Direct callback from lib_zsm. */
3799static void SVal__rcinc ( SVal s ) {
3800   if (SVal__isC(s)) {
3801      VtsID__rcinc( SVal__unC_Rmin(s) );
3802      VtsID__rcinc( SVal__unC_Wmin(s) );
3803   }
3804}
3805
3806/* Direct callback from lib_zsm. */
3807static void SVal__rcdec ( SVal s ) {
3808   if (SVal__isC(s)) {
3809      VtsID__rcdec( SVal__unC_Rmin(s) );
3810      VtsID__rcdec( SVal__unC_Wmin(s) );
3811   }
3812}
3813
3814
3815/////////////////////////////////////////////////////////
3816//                                                     //
3817// A simple group (memory) allocator                   //
3818//                                                     //
3819/////////////////////////////////////////////////////////
3820
3821//////////////// BEGIN general group allocator
3822typedef
3823   struct {
3824      UWord   elemSzB;        /* element size */
3825      UWord   nPerGroup;      /* # elems per group */
3826      void*   (*alloc)(HChar*, SizeT); /* group allocator */
3827      HChar*  cc; /* group allocator's cc */
3828      void    (*free)(void*); /* group allocator's free-er (unused) */
3829      /* XArray of void* (pointers to groups).  The groups themselves.
3830         Each element is a pointer to a block of size (elemSzB *
3831         nPerGroup) bytes. */
3832      XArray* groups;
3833      /* next free element.  Is a pointer to an element in one of the
3834         groups pointed to by .groups. */
3835      void* nextFree;
3836   }
3837   GroupAlloc;
3838
3839static void init_GroupAlloc ( /*MOD*/GroupAlloc* ga,
3840                              UWord  elemSzB,
3841                              UWord  nPerGroup,
3842                              void*  (*alloc)(HChar*, SizeT),
3843                              HChar* cc,
3844                              void   (*free)(void*) )
3845{
3846   tl_assert(0 == (elemSzB % sizeof(UWord)));
3847   tl_assert(elemSzB >= sizeof(UWord));
3848   tl_assert(nPerGroup >= 100); /* let's say */
3849   tl_assert(alloc);
3850   tl_assert(cc);
3851   tl_assert(free);
3852   tl_assert(ga);
3853   VG_(memset)(ga, 0, sizeof(*ga));
3854   ga->elemSzB   = elemSzB;
3855   ga->nPerGroup = nPerGroup;
3856   ga->groups    = NULL;
3857   ga->alloc     = alloc;
3858   ga->cc        = cc;
3859   ga->free      = free;
3860   ga->groups    = VG_(newXA)( alloc, cc, free, sizeof(void*) );
3861   ga->nextFree  = NULL;
3862   tl_assert(ga->groups);
3863}
3864
3865/* The freelist is empty.  Allocate a new group and put all the new
3866   elements in it onto the freelist. */
3867__attribute__((noinline))
3868static void gal_add_new_group ( GroupAlloc* ga )
3869{
3870   Word   i;
3871   UWord* group;
3872   tl_assert(ga);
3873   tl_assert(ga->nextFree == NULL);
3874   group = ga->alloc( ga->cc, ga->elemSzB * ga->nPerGroup );
3875   tl_assert(group);
3876   /* extend the freelist through the new group.  Place the freelist
3877      pointer in the first word of each element.  That's why the
3878      element size must be at least one word. */
3879   for (i = ga->nPerGroup-1; i >= 0; i--) {
3880      UChar* elemC = ((UChar*)group) + i * ga->elemSzB;
3881      UWord* elem  = (UWord*)elemC;
3882      tl_assert(0 == (((UWord)elem) % sizeof(UWord)));
3883      *elem = (UWord)ga->nextFree;
3884      ga->nextFree = elem;
3885   }
3886   /* and add to our collection of groups */
3887   VG_(addToXA)( ga->groups, &group );
3888}
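
/* Illustration of the freelist threading above (ignoring the
   nPerGroup >= 100 minimum enforced by init_GroupAlloc): with
   elemSzB == 16 and nPerGroup == 3, the new group holds elements at
   byte offsets 0, 16 and 32.  The loop runs i = 2, 1, 0, so afterwards
   ga->nextFree points at element 0, whose first word points at
   element 16, whose first word points at element 32, whose first word
   holds whatever nextFree was beforehand (NULL here). */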
3889
3890inline static void* gal_Alloc ( GroupAlloc* ga )
3891{
3892   UWord* elem;
3893   if (UNLIKELY(ga->nextFree == NULL)) {
3894      gal_add_new_group(ga);
3895   }
3896   elem = ga->nextFree;
3897   ga->nextFree = (void*)*elem;
3898   *elem = 0; /* unnecessary, but just to be on the safe side */
3899   return elem;
3900}
3901
3902inline static void* gal_Alloc_w_size_check ( GroupAlloc* ga, SizeT n )
3903{
3904   tl_assert(n == ga->elemSzB);
3905   return gal_Alloc( ga );
3906}
3907
3908inline static void gal_Free ( GroupAlloc* ga, void* p )
3909{
3910   UWord* elem = (UWord*)p;
3911   *elem = (UWord)ga->nextFree;
3912   ga->nextFree = elem;
3913}
3914//////////////// END general group allocator
3915
3916
3917/////////////////////////////////////////////////////////
3918//                                                     //
3919// Change-event map2                                   //
3920//                                                     //
3921/////////////////////////////////////////////////////////
3922
3923#define EVENT_MAP_GC_DISCARD_FRACTION  0.5
3924
3925/* This is in two parts:
3926
3927   1. A hash table of RCECs.  This is a set of reference-counted stack
3928      traces.  When the reference count of a stack trace becomes zero,
3929      it is removed from the set and freed up.  The intent is to have
3930      a set of stack traces which can be referred to from (2), but to
3931      only represent each one once.  The set is indexed/searched by
3932      ordering on the stack trace vectors.
3933
3934   2. A SparseWA of OldRefs.  These store information about each old
3935      ref that we need to record.  It is indexed by address of the
3936      location for which the information is recorded.  For LRU
3937      purposes, each OldRef also contains a generation number,
3938      indicating when it was most recently accessed.
3939
3940      The important part of an OldRef is, however, its accs[] array.
3941      This is an array of N_OLDREF_ACCS which binds (thread, R/W,
3942      size) triples to RCECs.  This allows us to collect the last
3943      access-traceback by up to N_OLDREF_ACCS different triples for
3944      this location.  The accs[] array is a MTF-array.  If a binding
3945      falls off the end, that's too bad -- we will lose info about
3946      that triple's access to this location.
3947
3948      When the SparseWA becomes too big, we can throw away the OldRefs
3949      whose generation numbers are below some threshold; hence doing
3950      approximate LRU discarding.  For each discarded OldRef we must
3951      of course decrement the reference count on the all RCECs it
3952      refers to, in order that entries from (1) eventually get
3953      discarded too.
3954
3955   A major improvement in reliability of this mechanism would be to
3956   have a dynamically sized OldRef.accs[] array, so no entries ever
3957   fall off the end.  In investigations (Dec 08) it appears that a
3958   major cause for the non-availability of conflicting-access traces
3959   in race reports is caused by the fixed size of this array.  I
3960   suspect for most OldRefs, only a few entries are used, but for a
3961   minority of cases there is an overflow, leading to info lossage.
3962   Investigations also suggest this is very workload and scheduling
3963   sensitive.  Therefore a dynamic sizing would be better.
3964
3965   However, dynamic sizing would defeat the use of a GroupAllocator
3966   for OldRef structures.  And that's important for performance.  So
3967   it's not straightforward to do.
3968*/
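
/* In outline, the structures defined below relate as follows (the
   code is authoritative):

      oldrefTree : SparseWA, guest Addr -> OldRef*
      OldRef.accs[0 .. N_OLDREF_ACCS-1] : one slot per (thrid, R/W,
         size) triple, kept in rough most-recently-used order, each
         holding an RCEC* and the write-held lockset at access time
      contextTab[hash] : chain of RCECs, each a refcounted stack trace

   So every in-use accs[] slot contributes exactly one reference to
   some RCEC, and the rc fields must account for all such references
   (see event_map__check_reference_counts). */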
3969
3970
3971static UWord stats__ctxt_rcdec1 = 0;
3972static UWord stats__ctxt_rcdec2 = 0;
3973static UWord stats__ctxt_rcdec3 = 0;
3974static UWord stats__ctxt_rcdec_calls = 0;
3975static UWord stats__ctxt_rcdec_discards = 0;
3976static UWord stats__ctxt_rcdec1_eq = 0;
3977
3978static UWord stats__ctxt_tab_curr = 0;
3979static UWord stats__ctxt_tab_max  = 0;
3980
3981static UWord stats__ctxt_tab_qs   = 0;
3982static UWord stats__ctxt_tab_cmps = 0;
3983
3984
3985///////////////////////////////////////////////////////
3986//// Part (1): A hash table of RCECs
3987///
3988
3989#define N_FRAMES 8
3990
3991// (UInt) `echo "Reference Counted Execution Context" | md5sum`
3992#define RCEC_MAGIC 0xab88abb2UL
3993
3994//#define N_RCEC_TAB 98317 /* prime */
3995#define N_RCEC_TAB 196613 /* prime */
3996
3997typedef
3998   struct _RCEC {
3999      UWord magic;  /* sanity check only */
4000      struct _RCEC* next;
4001      UWord rc;
4002      UWord rcX; /* used for crosschecking */
4003      UWord frames_hash;          /* hash of all the frames */
4004      UWord frames[N_FRAMES];
4005   }
4006   RCEC;
4007
4008static RCEC** contextTab = NULL; /* hash table of RCEC*s */
4009
4010
4011/* Gives an arbitrary total order on RCEC .frames fields */
4012static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
4013   Word i;
4014   tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
4015   tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
4016   if (ec1->frames_hash < ec2->frames_hash) return -1;
4017   if (ec1->frames_hash > ec2->frames_hash) return  1;
4018   for (i = 0; i < N_FRAMES; i++) {
4019      if (ec1->frames[i] < ec2->frames[i]) return -1;
4020      if (ec1->frames[i] > ec2->frames[i]) return  1;
4021   }
4022   return 0;
4023}
4024
4025
4026/* Dec the ref of this RCEC. */
4027static void ctxt__rcdec ( RCEC* ec )
4028{
4029   stats__ctxt_rcdec_calls++;
4030   tl_assert(ec && ec->magic == RCEC_MAGIC);
4031   tl_assert(ec->rc > 0);
4032   ec->rc--;
4033}
4034
4035static void ctxt__rcinc ( RCEC* ec )
4036{
4037   tl_assert(ec && ec->magic == RCEC_MAGIC);
4038   ec->rc++;
4039}
4040
4041
4042//////////// BEGIN RCEC group allocator
4043static GroupAlloc rcec_group_allocator;
4044
4045static RCEC* alloc_RCEC ( void ) {
4046   return gal_Alloc ( &rcec_group_allocator );
4047}
4048
4049static void free_RCEC ( RCEC* rcec ) {
4050   tl_assert(rcec->magic == RCEC_MAGIC);
4051   gal_Free( &rcec_group_allocator, rcec );
4052}
4053//////////// END RCEC group allocator
4054
4055
4056/* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
   move it one step closer to the front of the list, so as to make
4058   subsequent searches for it cheaper. */
4059static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
4060{
4061   RCEC *ec0, *ec1, *ec2;
4062   if (ec == *headp)
4063      tl_assert(0); /* already at head of list */
4064   tl_assert(ec != NULL);
4065   ec0 = *headp;
4066   ec1 = NULL;
4067   ec2 = NULL;
4068   while (True) {
4069      if (ec0 == NULL || ec0 == ec) break;
4070      ec2 = ec1;
4071      ec1 = ec0;
4072      ec0 = ec0->next;
4073   }
4074   tl_assert(ec0 == ec);
4075   if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
4076      RCEC* tmp;
4077      /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
4078         predecessor.  Swap ec0 and ec1, that is, move ec0 one step
4079         closer to the start of the list. */
4080      tl_assert(ec2->next == ec1);
4081      tl_assert(ec1->next == ec0);
4082      tmp = ec0->next;
4083      ec2->next = ec0;
4084      ec0->next = ec1;
4085      ec1->next = tmp;
4086   }
4087   else
4088   if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
4089      /* it's second in the list. */
4090      tl_assert(*headp == ec1);
4091      tl_assert(ec1->next == ec0);
4092      ec1->next = ec0->next;
4093      ec0->next = ec1;
4094      *headp = ec0;
4095   }
4096}
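
/* Example (illustrative): for a bucket chain A -> B -> C -> D, calling
   this with ec == C yields A -> C -> B -> D; repeated lookups of the
   same RCEC therefore gradually migrate it towards the head of its
   chain. */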
4097
4098
4099/* Find the given RCEC in the tree, and return a pointer to it.  Or,
4100   if not present, add the given one to the tree (by making a copy of
4101   it, so the caller can immediately deallocate the original) and
4102   return a pointer to the copy.  The caller can safely have 'example'
4103   on its stack, since we will always return a pointer to a copy of
4104   it, not to the original.  Note that the inserted node will have .rc
   of zero and so the caller must immediately increment it. */
4106__attribute__((noinline))
4107static RCEC* ctxt__find_or_add ( RCEC* example )
4108{
4109   UWord hent;
4110   RCEC* copy;
4111   tl_assert(example && example->magic == RCEC_MAGIC);
4112   tl_assert(example->rc == 0);
4113
4114   /* Search the hash table to see if we already have it. */
4115   stats__ctxt_tab_qs++;
4116   hent = example->frames_hash % N_RCEC_TAB;
4117   copy = contextTab[hent];
4118   while (1) {
4119      if (!copy) break;
4120      tl_assert(copy->magic == RCEC_MAGIC);
4121      stats__ctxt_tab_cmps++;
4122      if (0 == RCEC__cmp_by_frames(copy, example)) break;
4123      copy = copy->next;
4124   }
4125
4126   if (copy) {
4127      tl_assert(copy != example);
4128      /* optimisation: if it's not at the head of its list, move 1
4129         step fwds, to make future searches cheaper */
4130      if (copy != contextTab[hent]) {
4131         move_RCEC_one_step_forward( &contextTab[hent], copy );
4132      }
4133   } else {
4134      copy = alloc_RCEC();
4135      tl_assert(copy != example);
4136      *copy = *example;
4137      copy->next = contextTab[hent];
4138      contextTab[hent] = copy;
4139      stats__ctxt_tab_curr++;
4140      if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
4141         stats__ctxt_tab_max = stats__ctxt_tab_curr;
4142   }
4143   return copy;
4144}
4145
4146static inline UWord ROLW ( UWord w, Int n )
4147{
4148   Int bpw = 8 * sizeof(UWord);
4149   w = (w << n) | (w >> (bpw-n));
4150   return w;
4151}
4152
4153__attribute__((noinline))
4154static RCEC* get_RCEC ( Thr* thr )
4155{
4156   UWord hash, i;
4157   RCEC  example;
4158   example.magic = RCEC_MAGIC;
4159   example.rc = 0;
4160   example.rcX = 0;
4161   main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
4162   hash = 0;
4163   for (i = 0; i < N_FRAMES; i++) {
4164      hash ^= example.frames[i];
4165      hash = ROLW(hash, 19);
4166   }
4167   example.frames_hash = hash;
4168   return ctxt__find_or_add( &example );
4169}
4170
4171///////////////////////////////////////////////////////
4172//// Part (2):
4173///  A SparseWA guest-addr -> OldRef, that refers to (1)
4174///
4175
4176// (UInt) `echo "Old Reference Information" | md5sum`
4177#define OldRef_MAGIC 0x30b1f075UL
4178
/* Records an access: a thread (thrid), a stack-trace context (rcec),
   the access size and R/W-ness, and the set of write-held locks at
   the time of the access.  The size (1,2,4,8) is encoded in szLg2B as
   00 = 1, 01 = 2, 10 = 4, 11 = 8.
*/
4183typedef
4184   struct {
4185      RCEC*     rcec;
4186      WordSetID locksHeldW;
4187      UInt      thrid  : SCALARTS_N_THRBITS;
4188      UInt      szLg2B : 2;
4189      UInt      isW    : 1;
4190   }
4191   Thr_n_RCEC;
4192
4193#define N_OLDREF_ACCS 5
4194
4195typedef
4196   struct {
4197      UWord magic;  /* sanity check only */
4198      UWord gen;    /* when most recently accessed */
4199                    /* or free list when not in use */
4200      /* unused slots in this array have .thrid == 0, which is invalid */
4201      Thr_n_RCEC accs[N_OLDREF_ACCS];
4202   }
4203   OldRef;
4204
4205
4206//////////// BEGIN OldRef group allocator
4207static GroupAlloc oldref_group_allocator;
4208
4209static OldRef* alloc_OldRef ( void ) {
4210   return gal_Alloc ( &oldref_group_allocator );
4211}
4212
4213static void free_OldRef ( OldRef* r ) {
4214   tl_assert(r->magic == OldRef_MAGIC);
4215   gal_Free( &oldref_group_allocator, r );
4216}
4217//////////// END OldRef group allocator
4218
4219
4220static SparseWA* oldrefTree     = NULL; /* SparseWA* OldRef* */
4221static UWord     oldrefGen      = 0;    /* current LRU generation # */
4222static UWord     oldrefTreeN    = 0;    /* # elems in oldrefTree */
4223static UWord     oldrefGenIncAt = 0;    /* inc gen # when size hits this */
4224
4225inline static UInt min_UInt ( UInt a, UInt b ) {
4226   return a < b ? a : b;
4227}
4228
4229/* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
4230   first interval is lower, 1 if the first interval is higher, and 0
   if there is any overlap.  The redundant casting is paranoia,
   prompted by what looked distinctly like a bug in gcc-4.1.2, in which
4233   some of the comparisons were done signedly instead of
4234   unsignedly. */
4235/* Copied from exp-ptrcheck/sg_main.c */
4236static Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
4237                                     Addr a2, SizeT n2 ) {
4238   UWord a1w = (UWord)a1;
4239   UWord n1w = (UWord)n1;
4240   UWord a2w = (UWord)a2;
4241   UWord n2w = (UWord)n2;
4242   tl_assert(n1w > 0 && n2w > 0);
4243   if (a1w + n1w <= a2w) return -1L;
4244   if (a2w + n2w <= a1w) return 1L;
4245   return 0;
4246}
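
/* For example: [0x100,0x104) vs [0x104,0x108) gives -1 (disjoint, the
   first interval is lower); swapping the arguments gives 1; and
   [0x100,0x104) vs [0x102,0x110) gives 0, since the byte ranges
   overlap. */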
4247
4248static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
4249{
4250   OldRef* ref;
4251   RCEC*   rcec;
4252   Word    i, j;
4253   UWord   keyW, valW;
4254   Bool    b;
4255
4256   tl_assert(thr);
4257   ThrID thrid = thr->thrid;
4258   tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
4259
4260   WordSetID locksHeldW = thr->hgthread->locksetW;
4261
4262   rcec = get_RCEC( thr );
4263   ctxt__rcinc(rcec);
4264
4265   UInt szLg2B = 0;
4266   switch (szB) {
4267      /* This doesn't look particularly branch-predictor friendly. */
4268      case 1:  szLg2B = 0; break;
4269      case 2:  szLg2B = 1; break;
4270      case 4:  szLg2B = 2; break;
4271      case 8:  szLg2B = 3; break;
4272      default: tl_assert(0);
4273   }
4274
4275   /* Look in the map to see if we already have a record for this
4276      address. */
4277   b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );
4278
4279   if (b) {
4280
4281      /* We already have a record for this address.  We now need to
4282         see if we have a stack trace pertaining to this (thrid, R/W,
4283         size) triple. */
4284      tl_assert(keyW == a);
4285      ref = (OldRef*)valW;
4286      tl_assert(ref->magic == OldRef_MAGIC);
4287
4288      for (i = 0; i < N_OLDREF_ACCS; i++) {
4289         if (ref->accs[i].thrid != thrid)
4290            continue;
4291         if (ref->accs[i].szLg2B != szLg2B)
4292            continue;
4293         if (ref->accs[i].isW != (UInt)(isW & 1))
4294            continue;
4295         /* else we have a match, so stop looking. */
4296         break;
4297      }
4298
4299      if (i < N_OLDREF_ACCS) {
4300         /* thread 'thr' has an entry at index 'i'.  Update its RCEC. */
4301         if (i > 0) {
4302            Thr_n_RCEC tmp = ref->accs[i-1];
4303            ref->accs[i-1] = ref->accs[i];
4304            ref->accs[i] = tmp;
4305            i--;
4306         }
4307         if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
4308         stats__ctxt_rcdec1++;
4309         ctxt__rcdec( ref->accs[i].rcec );
4310         tl_assert(ref->accs[i].thrid == thrid);
4311         /* Update the RCEC and the W-held lockset. */
4312         ref->accs[i].rcec       = rcec;
4313         ref->accs[i].locksHeldW = locksHeldW;
4314      } else {
4315         /* No entry for this (thread, R/W, size, nWHeld) quad.
4316            Shuffle all of them down one slot, and put the new entry
4317            at the start of the array. */
4318         if (ref->accs[N_OLDREF_ACCS-1].thrid != 0) {
4319            /* the last slot is in use.  We must dec the rc on the
4320               associated rcec. */
4321            tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
4322            stats__ctxt_rcdec2++;
4323            if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
4324               VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
4325            ctxt__rcdec( ref->accs[N_OLDREF_ACCS-1].rcec );
4326         } else {
4327            tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
4328         }
4329         for (j = N_OLDREF_ACCS-1; j >= 1; j--)
4330            ref->accs[j] = ref->accs[j-1];
4331         ref->accs[0].thrid      = thrid;
4332         ref->accs[0].szLg2B     = szLg2B;
4333         ref->accs[0].isW        = (UInt)(isW & 1);
4334         ref->accs[0].locksHeldW = locksHeldW;
4335         ref->accs[0].rcec       = rcec;
4336         /* thrid==0 is used to signify an empty slot, so we can't
4337            add zero thrid (such a ThrID is invalid anyway). */
4338         /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
4339      }
4340
4341      ref->gen = oldrefGen;
4342
4343   } else {
4344
4345      /* We don't have a record for this address.  Create a new one. */
4346      if (oldrefTreeN >= oldrefGenIncAt) {
4347         oldrefGen++;
4348         oldrefGenIncAt = oldrefTreeN + 50000;
4349         if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
4350                            oldrefGen, oldrefTreeN );
4351      }
4352
4353      ref = alloc_OldRef();
4354      ref->magic = OldRef_MAGIC;
4355      ref->gen   = oldrefGen;
4356      ref->accs[0].thrid      = thrid;
4357      ref->accs[0].szLg2B     = szLg2B;
4358      ref->accs[0].isW        = (UInt)(isW & 1);
4359      ref->accs[0].locksHeldW = locksHeldW;
4360      ref->accs[0].rcec       = rcec;
4361
4362      /* thrid==0 is used to signify an empty slot, so we can't
4363         add zero thrid (such a ThrID is invalid anyway). */
4364      /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
4365
4366      /* Clear out the rest of the entries */
4367      for (j = 1; j < N_OLDREF_ACCS; j++) {
4368         ref->accs[j].rcec       = NULL;
4369         ref->accs[j].thrid      = 0;
4370         ref->accs[j].szLg2B     = 0;
4371         ref->accs[j].isW        = 0;
4372         ref->accs[j].locksHeldW = 0;
4373      }
4374      VG_(addToSWA)( oldrefTree, a, (UWord)ref );
4375      oldrefTreeN++;
4376
4377   }
4378}
4379
4380
4381/* Extract info from the conflicting-access machinery. */
4382Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
4383                              /*OUT*/Thr**        resThr,
4384                              /*OUT*/SizeT*       resSzB,
4385                              /*OUT*/Bool*        resIsW,
4386                              /*OUT*/WordSetID*   locksHeldW,
4387                              Thr* thr, Addr a, SizeT szB, Bool isW )
4388{
4389   Word    i, j;
4390   OldRef* ref;
4391   UWord   keyW, valW;
4392   Bool    b;
4393
4394   ThrID     cand_thrid;
4395   RCEC*     cand_rcec;
4396   Bool      cand_isW;
4397   SizeT     cand_szB;
4398   WordSetID cand_locksHeldW;
4399   Addr      cand_a;
4400
4401   Addr toCheck[15];
4402   Int  nToCheck = 0;
4403
4404   tl_assert(thr);
4405   tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
4406
4407   ThrID thrid = thr->thrid;
4408
4409   toCheck[nToCheck++] = a;
4410   for (i = -7; i < (Word)szB; i++) {
4411      if (i != 0)
4412         toCheck[nToCheck++] = a + i;
4413   }
4414   tl_assert(nToCheck <= 15);
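
   /* Rationale for the window (as far as one can tell): a previously
      recorded access is filed under its own start address, which can
      be anywhere in [a-7 .. a+szB-1] and still overlap [a, a+szB),
      since recorded accesses are at most 8 bytes long.  So we probe
      'a' itself, the 7 addresses below it and the szB-1 addresses
      above it: at most 7 + 1 + 7 == 15 candidates. */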
4415
4416   /* Now see if we can find a suitable matching event for
4417      any of the addresses in toCheck[0 .. nToCheck-1]. */
4418   for (j = 0; j < nToCheck; j++) {
4419
4420      cand_a = toCheck[j];
4421      //      VG_(printf)("test %ld %p\n", j, cand_a);
4422
4423      b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a );
4424      if (!b)
4425         continue;
4426
4427      ref = (OldRef*)valW;
4428      tl_assert(keyW == cand_a);
4429      tl_assert(ref->magic == OldRef_MAGIC);
4430      tl_assert(ref->accs[0].thrid != 0); /* first slot must always be used */
4431
4432      cand_thrid      = 0; /* invalid; see comments in event_map_bind */
4433      cand_rcec       = NULL;
4434      cand_isW        = False;
4435      cand_szB        = 0;
4436      cand_locksHeldW = 0; /* always valid; see initialise_data_structures() */
4437
4438      for (i = 0; i < N_OLDREF_ACCS; i++) {
4439         Thr_n_RCEC* cand = &ref->accs[i];
4440         cand_rcec       = cand->rcec;
4441         cand_thrid      = cand->thrid;
4442         cand_isW        = (Bool)cand->isW;
4443         cand_szB        = 1 << cand->szLg2B;
4444         cand_locksHeldW = cand->locksHeldW;
4445
4446         if (cand_thrid == 0)
4447            /* This slot isn't in use.  Ignore it. */
4448            continue;
4449
4450         if (cand_thrid == thrid)
4451            /* This is an access by the same thread, but we're only
4452               interested in accesses from other threads.  Ignore. */
4453            continue;
4454
4455         if ((!cand_isW) && (!isW))
4456            /* We don't want to report a read racing against another
4457               read; that's stupid.  So in this case move on. */
4458            continue;
4459
4460         if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0)
4461            /* No overlap with the access we're asking about.  Ignore. */
4462            continue;
4463
4464         /* We have a match.  Stop searching. */
4465         break;
4466      }
4467
4468      tl_assert(i >= 0 && i <= N_OLDREF_ACCS);
4469
4470      if (i < N_OLDREF_ACCS) {
4471         Int n, maxNFrames;
4472         /* return with success */
4473         tl_assert(cand_thrid);
4474         tl_assert(cand_rcec);
4475         tl_assert(cand_rcec->magic == RCEC_MAGIC);
4476         tl_assert(cand_szB >= 1);
4477         /* Count how many non-zero frames we have. */
4478         maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
4479         for (n = 0; n < maxNFrames; n++) {
4480            if (0 == cand_rcec->frames[n]) break;
4481         }
4482         *resEC      = VG_(make_ExeContext_from_StackTrace)
4483                          (cand_rcec->frames, n);
4484         *resThr     = Thr__from_ThrID(cand_thrid);
4485         *resSzB     = cand_szB;
4486         *resIsW     = cand_isW;
4487         *locksHeldW = cand_locksHeldW;
4488         return True;
4489      }
4490
4491      /* consider next address in toCheck[] */
4492   } /* for (j = 0; j < nToCheck; j++) */
4493
4494   /* really didn't find anything. */
4495   return False;
4496}
4497
4498static void event_map_init ( void )
4499{
4500   Word i;
4501
4502   /* Context (RCEC) group allocator */
4503   init_GroupAlloc ( &rcec_group_allocator,
4504                     sizeof(RCEC),
4505                     1000 /* RCECs per group */,
4506                     HG_(zalloc),
4507                     "libhb.event_map_init.1 (RCEC groups)",
4508                     HG_(free) );
4509
4510   /* Context table */
4511   tl_assert(!contextTab);
4512   contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
4513                             N_RCEC_TAB * sizeof(RCEC*) );
4514   tl_assert(contextTab);
4515   for (i = 0; i < N_RCEC_TAB; i++)
4516      contextTab[i] = NULL;
4517
4518   /* Oldref group allocator */
4519   init_GroupAlloc ( &oldref_group_allocator,
4520                     sizeof(OldRef),
4521                     1000 /* OldRefs per group */,
4522                     HG_(zalloc),
4523                     "libhb.event_map_init.3 (OldRef groups)",
4524                     HG_(free) );
4525
4526   /* Oldref tree */
4527   tl_assert(!oldrefTree);
4528   oldrefTree = VG_(newSWA)(
4529                   HG_(zalloc),
4530                   "libhb.event_map_init.4 (oldref tree)",
4531                   HG_(free)
4532                );
4533   tl_assert(oldrefTree);
4534
4535   oldrefGen = 0;
4536   oldrefGenIncAt = 0;
4537   oldrefTreeN = 0;
4538}
4539
4540static void event_map__check_reference_counts ( Bool before )
4541{
4542   RCEC*   rcec;
4543   OldRef* oldref;
4544   Word    i;
4545   UWord   nEnts = 0;
4546   UWord   keyW, valW;
4547
4548   /* Set the 'check' reference counts to zero.  Also, optionally
4549      check that the real reference counts are non-zero.  We allow
4550      these to fall to zero before a GC, but the GC must get rid of
4551      all those that are zero, hence none should be zero after a
4552      GC. */
4553   for (i = 0; i < N_RCEC_TAB; i++) {
4554      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4555         nEnts++;
4556         tl_assert(rcec);
4557         tl_assert(rcec->magic == RCEC_MAGIC);
4558         if (!before)
4559            tl_assert(rcec->rc > 0);
4560         rcec->rcX = 0;
4561      }
4562   }
4563
4564   /* check that the stats are sane */
4565   tl_assert(nEnts == stats__ctxt_tab_curr);
4566   tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
4567
4568   /* visit all the referencing points, inc check ref counts */
4569   VG_(initIterSWA)( oldrefTree );
4570   while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4571      oldref = (OldRef*)valW;
4572      tl_assert(oldref->magic == OldRef_MAGIC);
4573      for (i = 0; i < N_OLDREF_ACCS; i++) {
4574         ThrID aThrID = oldref->accs[i].thrid;
4575         RCEC* aRef   = oldref->accs[i].rcec;
4576         if (aThrID != 0) {
4577            tl_assert(aRef);
4578            tl_assert(aRef->magic == RCEC_MAGIC);
4579            aRef->rcX++;
4580         } else {
4581            tl_assert(!aRef);
4582         }
4583      }
4584   }
4585
4586   /* compare check ref counts with actual */
4587   for (i = 0; i < N_RCEC_TAB; i++) {
4588      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4589         tl_assert(rcec->rc == rcec->rcX);
4590      }
4591   }
4592}
4593
4594__attribute__((noinline))
4595static void event_map_maybe_GC ( void )
4596{
4597   OldRef* oldref;
4598   UWord   keyW, valW, retained, maxGen;
4599   XArray* refs2del;
4600   Word    i, j, n2del;
4601
4602   UWord* genMap      = NULL;
4603   UWord  genMap_min  = 0;
4604   UWord  genMap_size = 0;
4605
4606   if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size)))
4607      return;
4608
4609   if (0)
4610      VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);
4611
4612   /* Check for sane command line params.  Limit values must match
4613      those in hg_process_cmd_line_option. */
4614   tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 );
4615   tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 );
4616
4617   /* Check our counting is sane (expensive) */
4618   if (CHECK_CEM)
4619      tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree ));
4620
4621   /* Check the reference counts (expensive) */
4622   if (CHECK_CEM)
4623      event_map__check_reference_counts( True/*before*/ );
4624
4625   /* Compute the distribution of generation values in the ref tree.
4626      There are likely only to be a few different generation numbers
4627      in the whole tree, but we don't know what they are.  Hence use a
4628      dynamically resized array of counters.  The array is genMap[0
4629      .. genMap_size-1], where genMap[0] is the count for the
4630      generation number genMap_min, genMap[1] is the count for
4631      genMap_min+1, etc.  If a new number is seen outside the range
4632      [genMap_min .. genMap_min + genMap_size - 1] then the array is
4633      copied into a larger array, and genMap_min and genMap_size are
4634      adjusted accordingly. */
4635
4636   /* genMap :: generation-number -> count-of-nodes-with-that-number */
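
   /* A small worked illustration (not from the original sources): if
      the generations encountered are, in order, 5, 3 and 8, then key
      5 hits case 1 and creates the range [5 .. 5]; key 3 hits case 2
      and grows the array at the low end to [3 .. 5]; key 8 hits case
      3 and grows it at the high end to [3 .. 8].  In each step 'ea'
      ends up indexing the counter for the key just seen. */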
4637
4638   VG_(initIterSWA)( oldrefTree );
   while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {

      UWord ea, key;
      oldref = (OldRef*)valW;
      key = oldref->gen;
4644
4645      /* BEGIN find 'ea', which is the index in genMap holding the
4646         count for generation number 'key'. */
4647      if (UNLIKELY(genMap == NULL)) {
4648         /* deal with the first key to be seen, so that the following
4649            cases don't need to handle the complexity of a NULL count
4650            array. */
4651         genMap_min  = key;
4652         genMap_size = 1;
4653         genMap = HG_(zalloc)( "libhb.emmG.1a",
4654                                genMap_size * sizeof(UWord) );
4655         ea = 0;
4656         if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n",
                            key, genMap_min, genMap_min + genMap_size - 1 );
4658      }
4659      else
4660      if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) {
4661         /* this is the expected (almost-always-happens) case: 'key'
4662            is already mapped in the array. */
4663         ea = key - genMap_min;
4664      }
4665      else
4666      if (key < genMap_min) {
4667         /* 'key' appears before the start of the current array.
4668            Extend the current array by allocating a larger one and
4669            copying the current one to the upper end of it. */
4670         Word   more;
4671         UWord* map2;
4672         more = genMap_min - key;
4673         tl_assert(more > 0);
4674         map2 = HG_(zalloc)( "libhb.emmG.1b",
4675                             (genMap_size + more) * sizeof(UWord) );
4676         VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) );
4677         HG_(free)( genMap );
4678         genMap = map2;
4679         genMap_size += more;
4680         genMap_min -= more;
4681         ea = 0;
4682         tl_assert(genMap_min == key);
4683         if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n",
                            key, genMap_min, genMap_min + genMap_size - 1 );
4685      }
4686      else {
4687         /* 'key' appears after the end of the current array.  Extend
4688            the current array by allocating a larger one and copying
4689            the current one to the lower end of it. */
4690         Word   more;
4691         UWord* map2;
4692         tl_assert(key >= genMap_min + genMap_size);
4693         more = key - (genMap_min + genMap_size) + 1;
4694         tl_assert(more > 0);
4695         map2 = HG_(zalloc)( "libhb.emmG.1c",
4696                             (genMap_size + more) * sizeof(UWord) );
4697         VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) );
4698         HG_(free)( genMap );
4699         genMap = map2;
4700         genMap_size += more;
         ea = genMap_size - 1;
4702         tl_assert(genMap_min + genMap_size - 1 == key);
4703         if (0) VG_(printf)("(%lu) case 3 [%lu .. %lu]\n",
                            key, genMap_min, genMap_min + genMap_size - 1 );
4705      }
4706      /* END find 'ea' from 'key' */
4707
4708      tl_assert(ea >= 0 && ea < genMap_size);
4709      /* and the whole point of this elaborate computation of 'ea' is .. */
4710      genMap[ea]++;
4711   }
4712
4713   tl_assert(genMap);
4714   tl_assert(genMap_size > 0);
4715
4716   /* Sanity check what we just computed */
4717   { UWord sum = 0;
4718     for (i = 0; i < genMap_size; i++) {
4719        if (0) VG_(printf)("  xxx: gen %ld has %lu\n",
4720                           i + genMap_min, genMap[i] );
4721        sum += genMap[i];
4722     }
4723     tl_assert(sum == oldrefTreeN);
4724   }
4725
4726   /* Figure out how many generations to throw away */
4727   retained = oldrefTreeN;
4728   maxGen = 0;
4729
4730   for (i = 0; i < genMap_size; i++) {
4731      keyW = i + genMap_min;
4732      valW = genMap[i];
4733      tl_assert(keyW > 0); /* can't allow a generation # 0 */
4734      if (0) VG_(printf)("  XXX: gen %lu has %lu\n", keyW, valW );
4735      tl_assert(keyW >= maxGen);
4736      tl_assert(retained >= valW);
4737      if (retained - valW
4738          > (UWord)(HG_(clo_conflict_cache_size)
4739                    * EVENT_MAP_GC_DISCARD_FRACTION)) {
4740         retained -= valW;
4741         maxGen = keyW;
4742      } else {
4743         break;
4744      }
4745   }
4746
4747   HG_(free)(genMap);
4748
4749   tl_assert(retained >= 0 && retained <= oldrefTreeN);
4750
4751   /* Now make up a big list of the oldrefTree entries we want to
4752      delete.  We can't simultaneously traverse the tree and delete
4753      stuff from it, so first we need to copy them off somewhere
4754      else. (sigh) */
4755   refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2",
4756                          HG_(free), sizeof(Addr) );
4757
4758   if (retained < oldrefTreeN) {
4759
4760      /* This is the normal (expected) case.  We discard any ref whose
4761         generation number <= maxGen. */
4762      VG_(initIterSWA)( oldrefTree );
4763      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4764         oldref = (OldRef*)valW;
4765         tl_assert(oldref->magic == OldRef_MAGIC);
4766         if (oldref->gen <= maxGen) {
4767            VG_(addToXA)( refs2del, &keyW );
4768         }
4769      }
4770      if (VG_(clo_stats)) {
4771         VG_(message)(Vg_DebugMsg,
4772            "libhb: EvM GC: delete generations %lu and below, "
4773            "retaining %lu entries\n",
4774            maxGen, retained );
4775      }
4776
4777   } else {
4778
4779      static UInt rand_seed = 0; /* leave as static */
4780
4781      /* Degenerate case: there's only one generation in the entire
4782         tree, so we need to have some other way of deciding which
4783         refs to throw away.  Just throw out half of them randomly. */
4784      tl_assert(retained == oldrefTreeN);
4785      VG_(initIterSWA)( oldrefTree );
4786      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4787         UInt n;
4788         oldref = (OldRef*)valW;
4789         tl_assert(oldref->magic == OldRef_MAGIC);
4790         n = VG_(random)( &rand_seed );
4791         if ((n & 0xFFF) < 0x800) {
4792            VG_(addToXA)( refs2del, &keyW );
4793            retained--;
4794         }
4795      }
4796      if (VG_(clo_stats)) {
4797         VG_(message)(Vg_DebugMsg,
4798            "libhb: EvM GC: randomly delete half the entries, "
4799            "retaining %lu entries\n",
4800            retained );
4801      }
4802
4803   }
4804
4805   n2del = VG_(sizeXA)( refs2del );
4806   tl_assert(n2del == (Word)(oldrefTreeN - retained));
4807
4808   if (0) VG_(printf)("%s","deleting entries\n");
4809   for (i = 0; i < n2del; i++) {
4810      Bool  b;
4811      Addr  ga2del = *(Addr*)VG_(indexXA)( refs2del, i );
4812      b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del );
4813      tl_assert(b);
4814      tl_assert(keyW == ga2del);
4815      oldref = (OldRef*)valW;
4816      for (j = 0; j < N_OLDREF_ACCS; j++) {
4817         ThrID aThrID = oldref->accs[j].thrid;
4818         RCEC* aRef   = oldref->accs[j].rcec;
4819         if (aRef) {
4820            tl_assert(aThrID != 0);
4821            stats__ctxt_rcdec3++;
4822            ctxt__rcdec( aRef );
4823         } else {
4824            tl_assert(aThrID == 0);
4825         }
4826      }
4827
4828      free_OldRef( oldref );
4829   }
4830
4831   VG_(deleteXA)( refs2del );
4832
4833   tl_assert( VG_(sizeSWA)( oldrefTree ) == retained );
4834
4835   oldrefTreeN = retained;
4836   oldrefGenIncAt = oldrefTreeN; /* start new gen right away */
4837
4838   /* Throw away all RCECs with zero reference counts */
4839   for (i = 0; i < N_RCEC_TAB; i++) {
4840      RCEC** pp = &contextTab[i];
4841      RCEC*  p  = *pp;
4842      while (p) {
4843         if (p->rc == 0) {
4844            *pp = p->next;
4845            free_RCEC(p);
4846            p = *pp;
4847            tl_assert(stats__ctxt_tab_curr > 0);
4848            stats__ctxt_tab_curr--;
4849         } else {
4850            pp = &p->next;
4851            p = p->next;
4852         }
4853      }
4854   }
4855
4856   /* Check the reference counts (expensive) */
4857   if (CHECK_CEM)
4858      event_map__check_reference_counts( False/*after*/ );
4859
4860   //if (0)
4861   //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
4862   //            VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));
4863
4864}
4865
4866
4867/////////////////////////////////////////////////////////
4868//                                                     //
4869// Core MSM                                            //
4870//                                                     //
4871/////////////////////////////////////////////////////////
4872
4873/* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
4874   Nov 08, and again after [...],
4875   June 09. */
4876
4877static ULong stats__msmcread         = 0;
4878static ULong stats__msmcread_change  = 0;
4879static ULong stats__msmcwrite        = 0;
4880static ULong stats__msmcwrite_change = 0;
4881
4882/* Some notes on the H1 history mechanism:
4883
4884   Transition rules are:
4885
   read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cw `join` Kw)
   write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cw `join` Kw)
4888
4889   After any access by a thread T to a location L, L's constraint pair
4890   (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
4891
   After a race by thread T conflicting with some previous access by
   some other thread U, to a location whose constraint (before
   processing the later access) is (Cr,Cw), Cw[U] is the segment in
   which the previous access lies.
4896
4897   Hence in record_race_info, we pass in Cfailed and Kfailed, which
4898   are compared so as to find out which thread(s) this access
4899   conflicts with.  Once that is established, we also require the
4900   pre-update Cw for the location, so we can index into it for those
4901   threads, to get the scalar clock values for the point at which the
4902   former accesses were made.  (In fact we only bother to do any of
4903   this for an arbitrarily chosen one of the conflicting threads, as
4904   that's simpler, it avoids flooding the user with vast amounts of
4905   mostly useless information, and because the program is wrong if it
4906   contains any races at all -- so we don't really need to show all
4907   conflicting access pairs initially, so long as we only show none if
4908   none exist).
4909
4910   ---
4911
4912   That requires the auxiliary proof that
4913
      (Cw `join` Kw)[T] == Kw[T]
4915
4916   Why should that be true?  Because for any thread T, Kw[T] >= the
4917   scalar clock value for T known by any other thread.  In other
4918   words, because T's value for its own scalar clock is at least as up
4919   to date as the value for it known by any other thread (that is true
4920   for both the R- and W- scalar clocks).  Hence no other thread will
4921   be able to feed in a value for that element (indirectly via a
4922   constraint) which will exceed Kw[T], and hence the join cannot
4923   cause that particular element to advance.
4924*/
4925
4926__attribute__((noinline))
4927static void record_race_info ( Thr* acc_thr,
4928                               Addr acc_addr, SizeT szB, Bool isWrite,
4929                               VtsID Cfailed,
4930                               VtsID Kfailed,
4931                               VtsID Cw )
4932{
4933   /* Call here to report a race.  We just hand it onwards to
4934      HG_(record_error_Race).  If that in turn discovers that the
4935      error is going to be collected, then, at history_level 2, that
4936      queries the conflicting-event map.  The alternative would be to
4937      query it right here.  But that causes a lot of pointless queries
4938      for errors which will shortly be discarded as duplicates, and
4939      can become a performance overhead; so we defer the query until
4940      we know the error is not a duplicate. */
4941
4942   /* Stacks for the bounds of the (or one of the) conflicting
4943      segment(s).  These are only set at history_level 1. */
4944   ExeContext* hist1_seg_start = NULL;
4945   ExeContext* hist1_seg_end   = NULL;
4946   Thread*     hist1_conf_thr  = NULL;
4947
4948   tl_assert(acc_thr);
4949   tl_assert(acc_thr->hgthread);
4950   tl_assert(acc_thr->hgthread->hbthr == acc_thr);
4951   tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
4952
4953   if (HG_(clo_history_level) == 1) {
4954      Bool found;
4955      Word firstIx, lastIx;
4956      ULong_n_EC key;
4957
4958      /* At history_level 1, we must round up the relevant stack-pair
4959         for the conflicting segment right now.  This is because
4960         deferring it is complex; we can't (easily) put Kfailed and
4961         Cfailed into the XError and wait for later without
4962         getting tied up in difficulties with VtsID reference
4963         counting.  So just do it now. */
4964      Thr*  confThr;
4965      ULong confTym = 0;
4966      /* Which thread are we in conflict with?  There may be more than
4967         one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
4968         (in fact it's the one with the lowest Thr* value). */
4969      confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
      /* This must exist, since if it were NULL there would be no
         conflict (by the semantics of the return value of
         VtsID__findFirst_notLEQ), and msmc{read,write}, which has
         called us, just checked exactly this -- that there was in
         fact a race. */
4975      tl_assert(confThr);
4976
4977      /* Get the scalar clock value that the conflicting thread
4978         introduced into the constraint.  A careful examination of the
4979         base machine rules shows that this must be the same as the
4980         conflicting thread's scalar clock when it created this
4981         constraint.  Hence we know the scalar clock of the
4982         conflicting thread when the conflicting access was made. */
4983      confTym = VtsID__indexAt( Cfailed, confThr );
4984
4985      /* Using this scalar clock, index into the conflicting thread's
4986         collection of stack traces made each time its vector clock
4987         (hence its scalar clock) changed.  This gives the stack
4988         traces at the start and end of the conflicting segment (well,
4989         as per comment just above, of one of the conflicting
4990         segments, if there are more than one). */
4991      key.ull = confTym;
4992      key.ec  = NULL;
4993      /* tl_assert(confThr); -- asserted just above */
4994      tl_assert(confThr->local_Kws_n_stacks);
4995      firstIx = lastIx = 0;
4996      found = VG_(lookupXA_UNSAFE)(
4997                 confThr->local_Kws_n_stacks,
4998                 &key, &firstIx, &lastIx,
4999                 (Int(*)(void*,void*))cmp__ULong_n_EC__by_ULong
5000              );
5001      if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
5002                         "confTym %llu found %d (%lu,%lu)\n",
5003                         Cfailed, Kfailed, Cw,
5004                         confThr, confTym, found, firstIx, lastIx);
5005      /* We can't indefinitely collect stack traces at VTS
5006         transitions, since we'd eventually run out of memory.  Hence
5007         note_local_Kw_n_stack_for will eventually throw away old
5008         ones, which in turn means we might fail to find index value
5009         confTym in the array. */
5010      if (found) {
5011         ULong_n_EC *pair_start, *pair_end;
5012         pair_start
5013            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
5014         hist1_seg_start = pair_start->ec;
5015         if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
5016            pair_end
5017               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
5018                                            lastIx+1 );
5019            /* from properties of VG_(lookupXA) and the comparison fn used: */
5020            tl_assert(pair_start->ull < pair_end->ull);
5021            hist1_seg_end = pair_end->ec;
5022            /* Could do a bit better here.  It may be that pair_end
5023               doesn't have a stack, but the following entries in the
               array have the same scalar Kw and do have a stack.  So
5025               we should search a bit further along the array than
5026               lastIx+1 if hist1_seg_end is NULL. */
5027         } else {
5028            if (!confThr->llexit_done)
5029               hist1_seg_end = main_get_EC( confThr );
5030         }
5031         // seg_start could be NULL iff this is the first stack in the thread
5032         //if (seg_start) VG_(pp_ExeContext)(seg_start);
5033         //if (seg_end)   VG_(pp_ExeContext)(seg_end);
5034         hist1_conf_thr = confThr->hgthread;
5035      }
5036   }
5037
5038   HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
5039                           szB, isWrite,
5040                           hist1_conf_thr, hist1_seg_start, hist1_seg_end );
5041}
5042
5043static Bool is_sane_SVal_C ( SVal sv ) {
5044   Bool leq;
5045   if (!SVal__isC(sv)) return True;
5046   leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
5047   return leq;
5048}
5049
5050
5051/* Compute new state following a read */
5052static inline SVal msmcread ( SVal svOld,
5053                              /* The following are only needed for
5054                                 creating error reports. */
5055                              Thr* acc_thr,
5056                              Addr acc_addr, SizeT szB )
5057{
5058   SVal svNew = SVal_INVALID;
5059   stats__msmcread++;
5060
5061   /* Redundant sanity check on the constraints */
5062   if (CHECK_MSM) {
5063      tl_assert(is_sane_SVal_C(svOld));
5064   }
5065
5066   if (LIKELY(SVal__isC(svOld))) {
5067      VtsID tviR  = acc_thr->viR;
5068      VtsID tviW  = acc_thr->viW;
5069      VtsID rmini = SVal__unC_Rmin(svOld);
5070      VtsID wmini = SVal__unC_Wmin(svOld);
5071      Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
5072      if (LIKELY(leq)) {
5073         /* no race */
5074         /* Note: RWLOCK subtlety: use tviW, not tviR */
5075         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5076         goto out;
5077      } else {
5078         /* assert on sanity of constraints. */
5079         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5080         tl_assert(leqxx);
5081         // same as in non-race case
5082         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5083         record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
5084                           rmini, /* Cfailed */
5085                           tviR,  /* Kfailed */
5086                           wmini  /* Cw */ );
5087         goto out;
5088      }
5089   }
5090   if (SVal__isA(svOld)) {
5091      /* reading no-access memory (sigh); leave unchanged */
5092      /* check for no pollution */
5093      tl_assert(svOld == SVal_NOACCESS);
5094      svNew = SVal_NOACCESS;
5095      goto out;
5096   }
5097   if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
5098   tl_assert(0);
5099
5100  out:
5101   if (CHECK_MSM) {
5102      tl_assert(is_sane_SVal_C(svNew));
5103   }
5104   if (UNLIKELY(svNew != svOld)) {
5105      tl_assert(svNew != SVal_INVALID);
5106      if (HG_(clo_history_level) >= 2
5107          && SVal__isC(svOld) && SVal__isC(svNew)) {
5108         event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
5109         stats__msmcread_change++;
5110      }
5111   }
5112   return svNew;
5113}
5114
5115
5116/* Compute new state following a write */
5117static inline SVal msmcwrite ( SVal svOld,
5118                              /* The following are only needed for
5119                                 creating error reports. */
5120                              Thr* acc_thr,
5121                              Addr acc_addr, SizeT szB )
5122{
5123   SVal svNew = SVal_INVALID;
5124   stats__msmcwrite++;
5125
5126   /* Redundant sanity check on the constraints */
5127   if (CHECK_MSM) {
5128      tl_assert(is_sane_SVal_C(svOld));
5129   }
5130
5131   if (LIKELY(SVal__isC(svOld))) {
5132      VtsID tviW  = acc_thr->viW;
5133      VtsID wmini = SVal__unC_Wmin(svOld);
5134      Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
5135      if (LIKELY(leq)) {
5136         /* no race */
5137         svNew = SVal__mkC( tviW, tviW );
5138         goto out;
5139      } else {
5140         VtsID rmini = SVal__unC_Rmin(svOld);
5141         /* assert on sanity of constraints. */
5142         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5143         tl_assert(leqxx);
5144         // same as in non-race case
5145         // proof: in the non-race case, we have
5146         //    rmini <= wmini (invar on constraints)
5147         //    tviW <= tviR (invar on thread clocks)
5148         //    wmini <= tviW (from run-time check)
5149         // hence from transitivity of <= we have
5150         //    rmini <= wmini <= tviW
5151         // and so join(rmini,tviW) == tviW
5152         // and    join(wmini,tviW) == tviW
5153         // qed.
5154         svNew = SVal__mkC( VtsID__join2(rmini, tviW),
5155                            VtsID__join2(wmini, tviW) );
5156         record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
5157                           wmini, /* Cfailed */
5158                           tviW,  /* Kfailed */
5159                           wmini  /* Cw */ );
5160         goto out;
5161      }
5162   }
5163   if (SVal__isA(svOld)) {
5164      /* writing no-access memory (sigh); leave unchanged */
5165      /* check for no pollution */
5166      tl_assert(svOld == SVal_NOACCESS);
5167      svNew = SVal_NOACCESS;
5168      goto out;
5169   }
5170   if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
5171   tl_assert(0);
5172
5173  out:
5174   if (CHECK_MSM) {
5175      tl_assert(is_sane_SVal_C(svNew));
5176   }
5177   if (UNLIKELY(svNew != svOld)) {
5178      tl_assert(svNew != SVal_INVALID);
5179      if (HG_(clo_history_level) >= 2
5180          && SVal__isC(svOld) && SVal__isC(svNew)) {
5181         event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
5182         stats__msmcwrite_change++;
5183      }
5184   }
5185   return svNew;
5186}
5187
5188
5189/////////////////////////////////////////////////////////
5190//                                                     //
5191// Apply core MSM to specific memory locations         //
5192//                                                     //
5193/////////////////////////////////////////////////////////
5194
5195/*------------- ZSM accesses: 8 bit sapply ------------- */
5196
5197static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
5198   CacheLine* cl;
5199   UWord      cloff, tno, toff;
5200   SVal       svOld, svNew;
5201   UShort     descr;
5202   stats__cline_cread08s++;
5203   cl    = get_cacheline(a);
5204   cloff = get_cacheline_offset(a);
5205   tno   = get_treeno(a);
5206   toff  = get_tree_offset(a); /* == 0 .. 7 */
5207   descr = cl->descrs[tno];
5208   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5209      SVal* tree = &cl->svals[tno << 3];
5210      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5211      if (CHECK_ZSM)
5212         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5213   }
5214   svOld = cl->svals[cloff];
5215   svNew = msmcread( svOld, thr,a,1 );
5216   if (CHECK_ZSM)
5217      tl_assert(svNew != SVal_INVALID);
5218   cl->svals[cloff] = svNew;
5219}
5220
5221static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
5222   CacheLine* cl;
5223   UWord      cloff, tno, toff;
5224   SVal       svOld, svNew;
5225   UShort     descr;
5226   stats__cline_cwrite08s++;
5227   cl    = get_cacheline(a);
5228   cloff = get_cacheline_offset(a);
5229   tno   = get_treeno(a);
5230   toff  = get_tree_offset(a); /* == 0 .. 7 */
5231   descr = cl->descrs[tno];
5232   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5233      SVal* tree = &cl->svals[tno << 3];
5234      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5235      if (CHECK_ZSM)
5236         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5237   }
5238   svOld = cl->svals[cloff];
5239   svNew = msmcwrite( svOld, thr,a,1 );
5240   if (CHECK_ZSM)
5241      tl_assert(svNew != SVal_INVALID);
5242   cl->svals[cloff] = svNew;
5243}
5244
5245/*------------- ZSM accesses: 16 bit sapply ------------- */
5246
5247static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
5248   CacheLine* cl;
5249   UWord      cloff, tno, toff;
5250   SVal       svOld, svNew;
5251   UShort     descr;
5252   stats__cline_cread16s++;
5253   if (UNLIKELY(!aligned16(a))) goto slowcase;
5254   cl    = get_cacheline(a);
5255   cloff = get_cacheline_offset(a);
5256   tno   = get_treeno(a);
5257   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5258   descr = cl->descrs[tno];
5259   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5260      if (valid_value_is_below_me_16(descr, toff)) {
5261         goto slowcase;
5262      } else {
5263         SVal* tree = &cl->svals[tno << 3];
5264         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5265      }
5266      if (CHECK_ZSM)
5267         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5268   }
5269   svOld = cl->svals[cloff];
5270   svNew = msmcread( svOld, thr,a,2 );
5271   if (CHECK_ZSM)
5272      tl_assert(svNew != SVal_INVALID);
5273   cl->svals[cloff] = svNew;
5274   return;
5275  slowcase: /* misaligned, or must go further down the tree */
5276   stats__cline_16to8splits++;
5277   zsm_sapply08__msmcread( thr, a + 0 );
5278   zsm_sapply08__msmcread( thr, a + 1 );
5279}
5280
5281static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
5282   CacheLine* cl;
5283   UWord      cloff, tno, toff;
5284   SVal       svOld, svNew;
5285   UShort     descr;
5286   stats__cline_cwrite16s++;
5287   if (UNLIKELY(!aligned16(a))) goto slowcase;
5288   cl    = get_cacheline(a);
5289   cloff = get_cacheline_offset(a);
5290   tno   = get_treeno(a);
5291   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5292   descr = cl->descrs[tno];
5293   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5294      if (valid_value_is_below_me_16(descr, toff)) {
5295         goto slowcase;
5296      } else {
5297         SVal* tree = &cl->svals[tno << 3];
5298         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5299      }
5300      if (CHECK_ZSM)
5301         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5302   }
5303   svOld = cl->svals[cloff];
5304   svNew = msmcwrite( svOld, thr,a,2 );
5305   if (CHECK_ZSM)
5306      tl_assert(svNew != SVal_INVALID);
5307   cl->svals[cloff] = svNew;
5308   return;
5309  slowcase: /* misaligned, or must go further down the tree */
5310   stats__cline_16to8splits++;
5311   zsm_sapply08__msmcwrite( thr, a + 0 );
5312   zsm_sapply08__msmcwrite( thr, a + 1 );
5313}
5314
5315/*------------- ZSM accesses: 32 bit sapply ------------- */
5316
5317static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
5318   CacheLine* cl;
5319   UWord      cloff, tno, toff;
5320   SVal       svOld, svNew;
5321   UShort     descr;
5322   stats__cline_cread32s++;
5323   if (UNLIKELY(!aligned32(a))) goto slowcase;
5324   cl    = get_cacheline(a);
5325   cloff = get_cacheline_offset(a);
5326   tno   = get_treeno(a);
5327   toff  = get_tree_offset(a); /* == 0 or 4 */
5328   descr = cl->descrs[tno];
5329   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5330      if (valid_value_is_above_me_32(descr, toff)) {
5331         SVal* tree = &cl->svals[tno << 3];
5332         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5333      } else {
5334         goto slowcase;
5335      }
5336      if (CHECK_ZSM)
5337         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5338   }
5339   svOld = cl->svals[cloff];
5340   svNew = msmcread( svOld, thr,a,4 );
5341   if (CHECK_ZSM)
5342      tl_assert(svNew != SVal_INVALID);
5343   cl->svals[cloff] = svNew;
5344   return;
5345  slowcase: /* misaligned, or must go further down the tree */
5346   stats__cline_32to16splits++;
5347   zsm_sapply16__msmcread( thr, a + 0 );
5348   zsm_sapply16__msmcread( thr, a + 2 );
5349}
5350
5351static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
5352   CacheLine* cl;
5353   UWord      cloff, tno, toff;
5354   SVal       svOld, svNew;
5355   UShort     descr;
5356   stats__cline_cwrite32s++;
5357   if (UNLIKELY(!aligned32(a))) goto slowcase;
5358   cl    = get_cacheline(a);
5359   cloff = get_cacheline_offset(a);
5360   tno   = get_treeno(a);
5361   toff  = get_tree_offset(a); /* == 0 or 4 */
5362   descr = cl->descrs[tno];
5363   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5364      if (valid_value_is_above_me_32(descr, toff)) {
5365         SVal* tree = &cl->svals[tno << 3];
5366         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5367      } else {
5368         goto slowcase;
5369      }
5370      if (CHECK_ZSM)
5371         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5372   }
5373   svOld = cl->svals[cloff];
5374   svNew = msmcwrite( svOld, thr,a,4 );
5375   if (CHECK_ZSM)
5376      tl_assert(svNew != SVal_INVALID);
5377   cl->svals[cloff] = svNew;
5378   return;
5379  slowcase: /* misaligned, or must go further down the tree */
5380   stats__cline_32to16splits++;
5381   zsm_sapply16__msmcwrite( thr, a + 0 );
5382   zsm_sapply16__msmcwrite( thr, a + 2 );
5383}
5384
5385/*------------- ZSM accesses: 64 bit sapply ------------- */
5386
5387static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
5388   CacheLine* cl;
5389   UWord      cloff, tno;
5390   //UWord      toff;
5391   SVal       svOld, svNew;
5392   UShort     descr;
5393   stats__cline_cread64s++;
5394   if (UNLIKELY(!aligned64(a))) goto slowcase;
5395   cl    = get_cacheline(a);
5396   cloff = get_cacheline_offset(a);
5397   tno   = get_treeno(a);
5398   //toff  = get_tree_offset(a); /* == 0, unused */
5399   descr = cl->descrs[tno];
5400   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5401      goto slowcase;
5402   }
5403   svOld = cl->svals[cloff];
5404   svNew = msmcread( svOld, thr,a,8 );
5405   if (CHECK_ZSM)
5406      tl_assert(svNew != SVal_INVALID);
5407   cl->svals[cloff] = svNew;
5408   return;
5409  slowcase: /* misaligned, or must go further down the tree */
5410   stats__cline_64to32splits++;
5411   zsm_sapply32__msmcread( thr, a + 0 );
5412   zsm_sapply32__msmcread( thr, a + 4 );
5413}
5414
5415static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
5416   CacheLine* cl;
5417   UWord      cloff, tno;
5418   //UWord      toff;
5419   SVal       svOld, svNew;
5420   UShort     descr;
5421   stats__cline_cwrite64s++;
5422   if (UNLIKELY(!aligned64(a))) goto slowcase;
5423   cl    = get_cacheline(a);
5424   cloff = get_cacheline_offset(a);
5425   tno   = get_treeno(a);
5426   //toff  = get_tree_offset(a); /* == 0, unused */
5427   descr = cl->descrs[tno];
5428   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5429      goto slowcase;
5430   }
5431   svOld = cl->svals[cloff];
5432   svNew = msmcwrite( svOld, thr,a,8 );
5433   if (CHECK_ZSM)
5434      tl_assert(svNew != SVal_INVALID);
5435   cl->svals[cloff] = svNew;
5436   return;
5437  slowcase: /* misaligned, or must go further down the tree */
5438   stats__cline_64to32splits++;
5439   zsm_sapply32__msmcwrite( thr, a + 0 );
5440   zsm_sapply32__msmcwrite( thr, a + 4 );
5441}
5442
5443/*--------------- ZSM accesses: 8 bit swrite --------------- */
5444
5445static
5446void zsm_swrite08 ( Addr a, SVal svNew ) {
5447   CacheLine* cl;
5448   UWord      cloff, tno, toff;
5449   UShort     descr;
5450   stats__cline_swrite08s++;
5451   cl    = get_cacheline(a);
5452   cloff = get_cacheline_offset(a);
5453   tno   = get_treeno(a);
5454   toff  = get_tree_offset(a); /* == 0 .. 7 */
5455   descr = cl->descrs[tno];
5456   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5457      SVal* tree = &cl->svals[tno << 3];
5458      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5459      if (CHECK_ZSM)
5460         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5461   }
5462   tl_assert(svNew != SVal_INVALID);
5463   cl->svals[cloff] = svNew;
5464}
5465
5466/*--------------- ZSM accesses: 16 bit swrite --------------- */
5467
5468static
5469void zsm_swrite16 ( Addr a, SVal svNew ) {
5470   CacheLine* cl;
5471   UWord      cloff, tno, toff;
5472   UShort     descr;
5473   stats__cline_swrite16s++;
5474   if (UNLIKELY(!aligned16(a))) goto slowcase;
5475   cl    = get_cacheline(a);
5476   cloff = get_cacheline_offset(a);
5477   tno   = get_treeno(a);
5478   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5479   descr = cl->descrs[tno];
5480   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5481      if (valid_value_is_below_me_16(descr, toff)) {
5482         /* Writing at this level.  Need to fix up 'descr'. */
5483         cl->descrs[tno] = pullup_descr_to_16(descr, toff);
         /* At this point, the tree does not match cl->descrs[tno] any
5485            more.  The assignments below will fix it up. */
5486      } else {
5487         /* We can't indiscriminately write on the w16 node as in the
5488            w64 case, as that might make the node inconsistent with
5489            its parent.  So first, pull down to this level. */
5490         SVal* tree = &cl->svals[tno << 3];
5491         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
         if (CHECK_ZSM)
            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5494      }
5495   }
5496   tl_assert(svNew != SVal_INVALID);
5497   cl->svals[cloff + 0] = svNew;
5498   cl->svals[cloff + 1] = SVal_INVALID;
5499   return;
5500  slowcase: /* misaligned */
5501   stats__cline_16to8splits++;
5502   zsm_swrite08( a + 0, svNew );
5503   zsm_swrite08( a + 1, svNew );
5504}
5505
5506/*--------------- ZSM accesses: 32 bit swrite --------------- */
5507
5508static
5509void zsm_swrite32 ( Addr a, SVal svNew ) {
5510   CacheLine* cl;
5511   UWord      cloff, tno, toff;
5512   UShort     descr;
5513   stats__cline_swrite32s++;
5514   if (UNLIKELY(!aligned32(a))) goto slowcase;
5515   cl    = get_cacheline(a);
5516   cloff = get_cacheline_offset(a);
5517   tno   = get_treeno(a);
5518   toff  = get_tree_offset(a); /* == 0 or 4 */
5519   descr = cl->descrs[tno];
5520   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5521      if (valid_value_is_above_me_32(descr, toff)) {
5522         /* We can't indiscriminately write on the w32 node as in the
5523            w64 case, as that might make the node inconsistent with
5524            its parent.  So first, pull down to this level. */
5525         SVal* tree = &cl->svals[tno << 3];
5526         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5527         if (CHECK_ZSM)
5528            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5529      } else {
5530         /* Writing at this level.  Need to fix up 'descr'. */
5531         cl->descrs[tno] = pullup_descr_to_32(descr, toff);
         /* At this point, the tree does not match cl->descrs[tno] any
5533            more.  The assignments below will fix it up. */
5534      }
5535   }
5536   tl_assert(svNew != SVal_INVALID);
5537   cl->svals[cloff + 0] = svNew;
5538   cl->svals[cloff + 1] = SVal_INVALID;
5539   cl->svals[cloff + 2] = SVal_INVALID;
5540   cl->svals[cloff + 3] = SVal_INVALID;
5541   return;
5542  slowcase: /* misaligned */
5543   stats__cline_32to16splits++;
5544   zsm_swrite16( a + 0, svNew );
5545   zsm_swrite16( a + 2, svNew );
5546}
5547
5548/*--------------- ZSM accesses: 64 bit swrite --------------- */
5549
5550static
5551void zsm_swrite64 ( Addr a, SVal svNew ) {
5552   CacheLine* cl;
5553   UWord      cloff, tno;
5554   //UWord    toff;
5555   stats__cline_swrite64s++;
5556   if (UNLIKELY(!aligned64(a))) goto slowcase;
5557   cl    = get_cacheline(a);
5558   cloff = get_cacheline_offset(a);
5559   tno   = get_treeno(a);
5560   //toff  = get_tree_offset(a); /* == 0, unused */
5561   cl->descrs[tno] = TREE_DESCR_64;
5562   tl_assert(svNew != SVal_INVALID);
5563   cl->svals[cloff + 0] = svNew;
5564   cl->svals[cloff + 1] = SVal_INVALID;
5565   cl->svals[cloff + 2] = SVal_INVALID;
5566   cl->svals[cloff + 3] = SVal_INVALID;
5567   cl->svals[cloff + 4] = SVal_INVALID;
5568   cl->svals[cloff + 5] = SVal_INVALID;
5569   cl->svals[cloff + 6] = SVal_INVALID;
5570   cl->svals[cloff + 7] = SVal_INVALID;
5571   return;
5572  slowcase: /* misaligned */
5573   stats__cline_64to32splits++;
5574   zsm_swrite32( a + 0, svNew );
5575   zsm_swrite32( a + 4, svNew );
5576}
5577
5578/*------------- ZSM accesses: 8 bit sread/scopy ------------- */
5579
5580static
5581SVal zsm_sread08 ( Addr a ) {
5582   CacheLine* cl;
5583   UWord      cloff, tno, toff;
5584   UShort     descr;
5585   stats__cline_sread08s++;
5586   cl    = get_cacheline(a);
5587   cloff = get_cacheline_offset(a);
5588   tno   = get_treeno(a);
5589   toff  = get_tree_offset(a); /* == 0 .. 7 */
5590   descr = cl->descrs[tno];
5591   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5592      SVal* tree = &cl->svals[tno << 3];
5593      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5594   }
5595   return cl->svals[cloff];
5596}
5597
5598static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
5599   SVal       sv;
5600   stats__cline_scopy08s++;
5601   sv = zsm_sread08( src );
5602   zsm_swrite08( dst, sv );
5603}
5604
5605
5606/* Block-copy states (needed for implementing realloc()).  Note this
5607   doesn't change the filtering arrangements.  The caller of
5608   zsm_scopy_range needs to attend to that. */
5609
5610static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
5611{
5612   SizeT i;
5613   if (len == 0)
5614      return;
5615
5616   /* assert for non-overlappingness */
5617   tl_assert(src+len <= dst || dst+len <= src);
5618
5619   /* To be simple, just copy byte by byte.  But so as not to wreck
5620      performance for later accesses to dst[0 .. len-1], normalise
      destination lines as we finish with them, and also normalise the
      lines containing the first and last addresses. */
5623   for (i = 0; i < len; i++) {
5624      Bool normalise
5625         = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
5626           || i == 0       /* first in range */
5627           || i == len-1;  /* last in range */
5628      zsm_scopy08( src+i, dst+i, normalise );
5629   }
5630}
5631
5632
5633/* For setting address ranges to a given value.  Has considerable
5634   sophistication so as to avoid generating large numbers of pointless
5635   cache loads/writebacks for large ranges. */
5636
5637/* Do small ranges in-cache, in the obvious way. */
5638static
5639void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
5640{
5641   /* fast track a couple of common cases */
5642   if (len == 4 && aligned32(a)) {
5643      zsm_swrite32( a, svNew );
5644      return;
5645   }
5646   if (len == 8 && aligned64(a)) {
5647      zsm_swrite64( a, svNew );
5648      return;
5649   }
5650
5651   /* be completely general (but as efficient as possible) */
5652   if (len == 0) return;
5653
5654   if (!aligned16(a) && len >= 1) {
5655      zsm_swrite08( a, svNew );
5656      a += 1;
5657      len -= 1;
5658      tl_assert(aligned16(a));
5659   }
5660   if (len == 0) return;
5661
5662   if (!aligned32(a) && len >= 2) {
5663      zsm_swrite16( a, svNew );
5664      a += 2;
5665      len -= 2;
5666      tl_assert(aligned32(a));
5667   }
5668   if (len == 0) return;
5669
5670   if (!aligned64(a) && len >= 4) {
5671      zsm_swrite32( a, svNew );
5672      a += 4;
5673      len -= 4;
5674      tl_assert(aligned64(a));
5675   }
5676   if (len == 0) return;
5677
5678   if (len >= 8) {
5679      tl_assert(aligned64(a));
5680      while (len >= 8) {
5681         zsm_swrite64( a, svNew );
5682         a += 8;
5683         len -= 8;
5684      }
5685      tl_assert(aligned64(a));
5686   }
5687   if (len == 0) return;
5688
5689   if (len >= 4)
5690      tl_assert(aligned32(a));
5691   if (len >= 4) {
5692      zsm_swrite32( a, svNew );
5693      a += 4;
5694      len -= 4;
5695   }
5696   if (len == 0) return;
5697
5698   if (len >= 2)
5699      tl_assert(aligned16(a));
5700   if (len >= 2) {
5701      zsm_swrite16( a, svNew );
5702      a += 2;
5703      len -= 2;
5704   }
5705   if (len == 0) return;
5706
5707   if (len >= 1) {
5708      zsm_swrite08( a, svNew );
5709      //a += 1;
5710      len -= 1;
5711   }
5712   tl_assert(len == 0);
5713}
5714
5715
5716/* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
5717   for larger ranges, try to operate directly on the out-of-cache
5718   representation, rather than dragging lines into the cache,
5719   overwriting them, and forcing them out.  This turns out to be an
5720   important performance optimisation.
5721
5722   Note that this doesn't change the filtering arrangements.  The
5723   caller of zsm_sset_range needs to attend to that. */
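
/* Illustration only, assuming for the sake of the example that a
   cache line covers 64 bytes (the actual value of N_LINE_ARANGE is
   defined elsewhere): a call with a == 0x1003 and len == 0x100 is
   split into a leading fragment [0x1003 .. 0x103F] handled by
   zsm_sset_range_SMALL, three whole lines [0x1040 .. 0x10FF] handled
   a line at a time (in-place on the Z representation when not
   cached), and a trailing fragment [0x1100 .. 0x1102] handled by
   zsm_sset_range_SMALL again. */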
5724
5725static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
5726{
5727   tl_assert(svNew != SVal_INVALID);
5728   stats__cache_make_New_arange += (ULong)len;
5729
5730   if (0 && len > 500)
5731      VG_(printf)("make New      ( %#lx, %ld )\n", a, len );
5732
5733   if (0) {
5734      static UWord n_New_in_cache = 0;
5735      static UWord n_New_not_in_cache = 0;
5736      /* tag is 'a' with the in-line offset masked out,
5737         eg a[31]..a[4] 0000 */
5738      Addr       tag = a & ~(N_LINE_ARANGE - 1);
5739      UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
5740      if (LIKELY(tag == cache_shmem.tags0[wix])) {
5741         n_New_in_cache++;
5742      } else {
5743         n_New_not_in_cache++;
5744      }
5745      if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
5746         VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
5747                     n_New_in_cache, n_New_not_in_cache );
5748   }
5749
5750   if (LIKELY(len < 2 * N_LINE_ARANGE)) {
5751      zsm_sset_range_SMALL( a, len, svNew );
5752   } else {
5753      Addr  before_start  = a;
5754      Addr  aligned_start = cacheline_ROUNDUP(a);
5755      Addr  after_start   = cacheline_ROUNDDN(a + len);
5756      UWord before_len    = aligned_start - before_start;
5757      UWord aligned_len   = after_start - aligned_start;
5758      UWord after_len     = a + len - after_start;
5759      tl_assert(before_start <= aligned_start);
5760      tl_assert(aligned_start <= after_start);
5761      tl_assert(before_len < N_LINE_ARANGE);
5762      tl_assert(after_len < N_LINE_ARANGE);
5763      tl_assert(get_cacheline_offset(aligned_start) == 0);
5764      if (get_cacheline_offset(a) == 0) {
5765         tl_assert(before_len == 0);
5766         tl_assert(a == aligned_start);
5767      }
5768      if (get_cacheline_offset(a+len) == 0) {
5769         tl_assert(after_len == 0);
5770         tl_assert(after_start == a+len);
5771      }
5772      if (before_len > 0) {
5773         zsm_sset_range_SMALL( before_start, before_len, svNew );
5774      }
5775      if (after_len > 0) {
5776         zsm_sset_range_SMALL( after_start, after_len, svNew );
5777      }
5778      stats__cache_make_New_inZrep += (ULong)aligned_len;
5779
5780      while (1) {
5781         Addr tag;
5782         UWord wix;
5783         if (aligned_start >= after_start)
5784            break;
5785         tl_assert(get_cacheline_offset(aligned_start) == 0);
5786         tag = aligned_start & ~(N_LINE_ARANGE - 1);
5787         wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
5788         if (tag == cache_shmem.tags0[wix]) {
5789            UWord i;
5790            for (i = 0; i < N_LINE_ARANGE / 8; i++)
5791               zsm_swrite64( aligned_start + i * 8, svNew );
5792         } else {
5793            UWord i;
5794            Word zix;
5795            SecMap* sm;
5796            LineZ* lineZ;
5797            /* This line is not in the cache.  Do not force it in; instead
5798               modify it in-place. */
5799            /* find the Z line to write in and rcdec it or the
5800               associated F line. */
5801            find_Z_for_writing( &sm, &zix, tag );
5802            tl_assert(sm);
5803            tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
5804            lineZ = &sm->linesZ[zix];
5805            lineZ->dict[0] = svNew;
5806            lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
5807            for (i = 0; i < N_LINE_ARANGE/4; i++)
5808               lineZ->ix2s[i] = 0; /* all refer to dict[0] */
5809            rcinc_LineZ(lineZ);
5810         }
5811         aligned_start += N_LINE_ARANGE;
5812         aligned_len -= N_LINE_ARANGE;
5813      }
5814      tl_assert(aligned_start == after_start);
5815      tl_assert(aligned_len == 0);
5816   }
5817}
5818
5819
5820/////////////////////////////////////////////////////////
5821//                                                     //
5822// Front-filtering accesses                            //
5823//                                                     //
5824/////////////////////////////////////////////////////////
5825
5826static UWord stats__f_ac = 0;
5827static UWord stats__f_sk = 0;
5828
5829#if 0
5830#  define STATS__F_SHOW \
5831     do { \
5832        if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
5833           VG_(printf)("filters: ac %lu sk %lu\n",   \
5834           stats__f_ac, stats__f_sk); \
5835     } while (0)
5836#else
5837#  define STATS__F_SHOW /* */
5838#endif
5839
5840void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
5841   stats__f_ac++;
5842   STATS__F_SHOW;
5843   if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
5844      stats__f_sk++;
5845      return;
5846   }
5847   zsm_sapply08__msmcwrite(thr, a);
5848}
5849
5850void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
5851   stats__f_ac++;
5852   STATS__F_SHOW;
5853   if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
5854      stats__f_sk++;
5855      return;
5856   }
5857   zsm_sapply16__msmcwrite(thr, a);
5858}
5859
5860void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
5861   stats__f_ac++;
5862   STATS__F_SHOW;
5863   if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
5864      stats__f_sk++;
5865      return;
5866   }
5867   zsm_sapply32__msmcwrite(thr, a);
5868}
5869
5870void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
5871   stats__f_ac++;
5872   STATS__F_SHOW;
5873   if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
5874      stats__f_sk++;
5875      return;
5876   }
5877   zsm_sapply64__msmcwrite(thr, a);
5878}
5879
5880void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
5881{
5882   /* fast track a couple of common cases */
5883   if (len == 4 && aligned32(a)) {
5884      zsm_sapply32_f__msmcwrite( thr, a );
5885      return;
5886   }
5887   if (len == 8 && aligned64(a)) {
5888      zsm_sapply64_f__msmcwrite( thr, a );
5889      return;
5890   }
5891
5892   /* be completely general (but as efficient as possible) */
5893   if (len == 0) return;
5894
5895   if (!aligned16(a) && len >= 1) {
5896      zsm_sapply08_f__msmcwrite( thr, a );
5897      a += 1;
5898      len -= 1;
5899      tl_assert(aligned16(a));
5900   }
5901   if (len == 0) return;
5902
5903   if (!aligned32(a) && len >= 2) {
5904      zsm_sapply16_f__msmcwrite( thr, a );
5905      a += 2;
5906      len -= 2;
5907      tl_assert(aligned32(a));
5908   }
5909   if (len == 0) return;
5910
5911   if (!aligned64(a) && len >= 4) {
5912      zsm_sapply32_f__msmcwrite( thr, a );
5913      a += 4;
5914      len -= 4;
5915      tl_assert(aligned64(a));
5916   }
5917   if (len == 0) return;
5918
5919   if (len >= 8) {
5920      tl_assert(aligned64(a));
5921      while (len >= 8) {
5922         zsm_sapply64_f__msmcwrite( thr, a );
5923         a += 8;
5924         len -= 8;
5925      }
5926      tl_assert(aligned64(a));
5927   }
5928   if (len == 0) return;
5929
5930   if (len >= 4)
5931      tl_assert(aligned32(a));
5932   if (len >= 4) {
5933      zsm_sapply32_f__msmcwrite( thr, a );
5934      a += 4;
5935      len -= 4;
5936   }
5937   if (len == 0) return;
5938
5939   if (len >= 2)
5940      tl_assert(aligned16(a));
5941   if (len >= 2) {
5942      zsm_sapply16_f__msmcwrite( thr, a );
5943      a += 2;
5944      len -= 2;
5945   }
5946   if (len == 0) return;
5947
5948   if (len >= 1) {
5949      zsm_sapply08_f__msmcwrite( thr, a );
5950      //a += 1;
5951      len -= 1;
5952   }
5953   tl_assert(len == 0);
5954}
5955
5956void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
5957   stats__f_ac++;
5958   STATS__F_SHOW;
5959   if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
5960      stats__f_sk++;
5961      return;
5962   }
5963   zsm_sapply08__msmcread(thr, a);
5964}
5965
5966void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
5967   stats__f_ac++;
5968   STATS__F_SHOW;
5969   if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
5970      stats__f_sk++;
5971      return;
5972   }
5973   zsm_sapply16__msmcread(thr, a);
5974}
5975
5976void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
5977   stats__f_ac++;
5978   STATS__F_SHOW;
5979   if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
5980      stats__f_sk++;
5981      return;
5982   }
5983   zsm_sapply32__msmcread(thr, a);
5984}
5985
5986void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
5987   stats__f_ac++;
5988   STATS__F_SHOW;
5989   if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
5990      stats__f_sk++;
5991      return;
5992   }
5993   zsm_sapply64__msmcread(thr, a);
5994}
5995
5996void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
5997{
5998   /* fast track a couple of common cases */
5999   if (len == 4 && aligned32(a)) {
6000      zsm_sapply32_f__msmcread( thr, a );
6001      return;
6002   }
6003   if (len == 8 && aligned64(a)) {
6004      zsm_sapply64_f__msmcread( thr, a );
6005      return;
6006   }
6007
6008   /* be completely general (but as efficient as possible) */
6009   if (len == 0) return;
6010
6011   if (!aligned16(a) && len >= 1) {
6012      zsm_sapply08_f__msmcread( thr, a );
6013      a += 1;
6014      len -= 1;
6015      tl_assert(aligned16(a));
6016   }
6017   if (len == 0) return;
6018
6019   if (!aligned32(a) && len >= 2) {
6020      zsm_sapply16_f__msmcread( thr, a );
6021      a += 2;
6022      len -= 2;
6023      tl_assert(aligned32(a));
6024   }
6025   if (len == 0) return;
6026
6027   if (!aligned64(a) && len >= 4) {
6028      zsm_sapply32_f__msmcread( thr, a );
6029      a += 4;
6030      len -= 4;
6031      tl_assert(aligned64(a));
6032   }
6033   if (len == 0) return;
6034
6035   if (len >= 8) {
6036      tl_assert(aligned64(a));
6037      while (len >= 8) {
6038         zsm_sapply64_f__msmcread( thr, a );
6039         a += 8;
6040         len -= 8;
6041      }
6042      tl_assert(aligned64(a));
6043   }
6044   if (len == 0) return;
6045
6046   if (len >= 4)
6047      tl_assert(aligned32(a));
6048   if (len >= 4) {
6049      zsm_sapply32_f__msmcread( thr, a );
6050      a += 4;
6051      len -= 4;
6052   }
6053   if (len == 0) return;
6054
6055   if (len >= 2)
6056      tl_assert(aligned16(a));
6057   if (len >= 2) {
6058      zsm_sapply16_f__msmcread( thr, a );
6059      a += 2;
6060      len -= 2;
6061   }
6062   if (len == 0) return;
6063
6064   if (len >= 1) {
6065      zsm_sapply08_f__msmcread( thr, a );
6066      //a += 1;
6067      len -= 1;
6068   }
6069   tl_assert(len == 0);
6070}
6071
6072void libhb_Thr_resumes ( Thr* thr )
6073{
6074   if (0) VG_(printf)("resume %p\n", thr);
6075   tl_assert(thr);
6076   tl_assert(!thr->llexit_done);
6077   Filter__clear(thr->filter, "libhb_Thr_resumes");
6078   /* A kludge, but .. if this thread doesn't have any marker stacks
6079      at all, get one right now.  This is easier than figuring out
6080      exactly when at thread startup we can and can't take a stack
6081      snapshot. */
6082   if (HG_(clo_history_level) == 1) {
6083      tl_assert(thr->local_Kws_n_stacks);
6084      if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
6085         note_local_Kw_n_stack_for(thr);
6086   }
6087}
6088
6089
6090/////////////////////////////////////////////////////////
6091//                                                     //
6092// Synchronisation objects                             //
6093//                                                     //
6094/////////////////////////////////////////////////////////
6095
/* A doubly linked list of all the SO's. */
6097SO* admin_SO = NULL;
6098
6099static SO* SO__Alloc ( void )
6100{
6101   SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
6102   so->viR   = VtsID_INVALID;
6103   so->viW   = VtsID_INVALID;
6104   so->magic = SO_MAGIC;
   /* Add to doubly linked list */
6106   if (admin_SO) {
6107      tl_assert(admin_SO->admin_prev == NULL);
6108      admin_SO->admin_prev = so;
6109      so->admin_next = admin_SO;
6110   } else {
6111      so->admin_next = NULL;
6112   }
6113   so->admin_prev = NULL;
6114   admin_SO = so;
6115   /* */
6116   return so;
6117}
6118
6119static void SO__Dealloc ( SO* so )
6120{
6121   tl_assert(so);
6122   tl_assert(so->magic == SO_MAGIC);
6123   if (so->viR == VtsID_INVALID) {
6124      tl_assert(so->viW == VtsID_INVALID);
6125   } else {
6126      tl_assert(so->viW != VtsID_INVALID);
6127      VtsID__rcdec(so->viR);
6128      VtsID__rcdec(so->viW);
6129   }
6130   so->magic = 0;
   /* Del from doubly linked list */
6132   if (so->admin_prev)
6133      so->admin_prev->admin_next = so->admin_next;
6134   if (so->admin_next)
6135      so->admin_next->admin_prev = so->admin_prev;
6136   if (so == admin_SO)
6137      admin_SO = so->admin_next;
6138   /* */
6139   HG_(free)( so );
6140}
6141
6142
6143/////////////////////////////////////////////////////////
6144//                                                     //
6145// Top Level API                                       //
6146//                                                     //
6147/////////////////////////////////////////////////////////
6148
6149static void show_thread_state ( HChar* str, Thr* t )
6150{
6151   if (1) return;
6152   if (t->viR == t->viW) {
6153      VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
6154      VtsID__pp( t->viR );
6155      VG_(printf)("%s","\n");
6156   } else {
6157      VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
6158      VtsID__pp( t->viR );
6159      VG_(printf)(" viW %u==", t->viW);
6160      VtsID__pp( t->viW );
6161      VG_(printf)("%s","\n");
6162   }
6163}
6164
6165
6166Thr* libhb_init (
6167        void        (*get_stacktrace)( Thr*, Addr*, UWord ),
6168        ExeContext* (*get_EC)( Thr* )
6169     )
6170{
6171   Thr*  thr;
6172   VtsID vi;
6173
6174   // We will have to store a large number of these,
6175   // so make sure they're the size we expect them to be.
6176   tl_assert(sizeof(ScalarTS) == 8);
6177
6178   /* because the first 1024 thread IDs are unusable */
6179   tl_assert(SCALARTS_N_THRBITS >= 11);
6180   /* so as to fit in a UInt w/ 3 bits to spare (see defn of
6181      Thr_n_RCEC). */
6182   tl_assert(SCALARTS_N_THRBITS <= 29);
6183
6184   /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
6185      (32-bit).  It's not correctness-critical, but there are a lot of
6186      them, so it's important from a space viewpoint.  Unfortunately
6187      we simply can't pack it into 2 words on a 32-bit target. */
6188   if (sizeof(UWord) == 8) {
6189      tl_assert(sizeof(Thr_n_RCEC) == 16);
6190   } else {
6191      tl_assert(sizeof(Thr_n_RCEC) == 12);
6192   }
6193
6194   /* Word sets really are 32 bits.  Even on a 64-bit target. */
6195   tl_assert(sizeof(WordSetID) == 4);
6196   tl_assert(sizeof(WordSet) == sizeof(WordSetID));
6197
6198   tl_assert(get_stacktrace);
6199   tl_assert(get_EC);
6200   main_get_stacktrace   = get_stacktrace;
6201   main_get_EC           = get_EC;
6202
6203   // No need to initialise hg_wordfm.
6204   // No need to initialise hg_wordset.
6205
6206   /* Allocated once and never deallocated.  Used as a temporary in
6207      VTS singleton, tick and join operations. */
6208   temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
6209   temp_max_sized_VTS->id = VtsID_INVALID;
6210   verydead_thread_table_init();
6211   vts_set_init();
6212   vts_tab_init();
6213   event_map_init();
6214   VtsID__invalidate_caches();
6215
6216   // initialise shadow memory
6217   zsm_init( SVal__rcinc, SVal__rcdec );
6218
6219   thr = Thr__new();
6220   vi  = VtsID__mk_Singleton( thr, 1 );
6221   thr->viR = vi;
6222   thr->viW = vi;
6223   VtsID__rcinc(thr->viR);
6224   VtsID__rcinc(thr->viW);
6225
6226   show_thread_state("  root", thr);
6227   return thr;
6228}
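
/* Usage sketch (hypothetical, for illustration only; guarded out): a
   client supplies the two callbacks and keeps hold of the returned
   root Thr*.  The callback bodies below are placeholders. */
#if 0
static void example_get_stacktrace ( Thr* thr, Addr* frames, UWord nRequest )
{
   /* fill frames[0 .. nRequest-1] with the current stacktrace of the
      guest thread associated with 'thr' */
}
static ExeContext* example_get_EC ( Thr* thr )
{
   /* return an ExeContext describing where 'thr' currently is */
   return NULL;
}
static Thr* example_start_libhb ( void )
{
   Thr* root = libhb_init( example_get_stacktrace, example_get_EC );
   return root;
}
#endif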
6229
6230
6231Thr* libhb_create ( Thr* parent )
6232{
6233   /* The child's VTSs are copies of the parent's VTSs, but ticked at
6234      the child's index.  Since the child's index is guaranteed
6235      unique, it has never been seen before, so the implicit value
6236      before the tick is zero and after that is one. */
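   /* Illustrative numbers only: if the parent's clocks are { P:3 },
      the child below ends up with { P:3, C:1 }, and after its own tick
      further down the parent moves on to { P:4 }.  Hence everything
      the parent did before the create happens-before everything the
      child does. */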
6237   Thr* child = Thr__new();
6238
6239   child->viR = VtsID__tick( parent->viR, child );
6240   child->viW = VtsID__tick( parent->viW, child );
6241   Filter__clear(child->filter, "libhb_create(child)");
6242   VtsID__rcinc(child->viR);
6243   VtsID__rcinc(child->viW);
6244   /* We need to do note_local_Kw_n_stack_for( child ), but it's too
6245      early for that - it may not have a valid TId yet.  So, let
6246      libhb_Thr_resumes pick it up the first time the thread runs. */
6247
6248   tl_assert(VtsID__indexAt( child->viR, child ) == 1);
6249   tl_assert(VtsID__indexAt( child->viW, child ) == 1);
6250
6251   /* and the parent has to move along too */
6252   VtsID__rcdec(parent->viR);
6253   VtsID__rcdec(parent->viW);
6254   parent->viR = VtsID__tick( parent->viR, parent );
6255   parent->viW = VtsID__tick( parent->viW, parent );
6256   Filter__clear(parent->filter, "libhb_create(parent)");
6257   VtsID__rcinc(parent->viR);
6258   VtsID__rcinc(parent->viW);
6259   note_local_Kw_n_stack_for( parent );
6260
6261   show_thread_state(" child", child);
6262   show_thread_state("parent", parent);
6263
6264   return child;
6265}
6266
6267/* Shut down the library, and print stats (in fact that's _all_
6268   this is for). */
6269void libhb_shutdown ( Bool show_stats )
6270{
6271   if (show_stats) {
6272      VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
6273      VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
6274                  stats__secmaps_allocd,
6275                  stats__secmap_ga_space_covered);
6276      VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
6277                  stats__secmap_linesZ_allocd,
6278                  stats__secmap_linesZ_bytes);
6279      VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)\n",
6280                  stats__secmap_linesF_allocd,
6281                  stats__secmap_linesF_bytes);
6282      VG_(printf)(" secmaps: %'10lu iterator steppings\n",
6283                  stats__secmap_iterator_steppings);
6284      VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
6285                  stats__secmaps_search, stats__secmaps_search_slow);
6286
6287      VG_(printf)("%s","\n");
6288      VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
6289                  stats__cache_totrefs, stats__cache_totmisses );
6290      VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
6291                  stats__cache_Z_fetches, stats__cache_F_fetches );
6292      VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
6293                  stats__cache_Z_wbacks, stats__cache_F_wbacks );
6294      VG_(printf)("   cache: %'14lu invals,     %'14lu flushes\n",
6295                  stats__cache_invals, stats__cache_flushes );
6296      VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
6297                  stats__cache_make_New_arange,
6298                  stats__cache_make_New_inZrep);
6299
6300      VG_(printf)("%s","\n");
6301      VG_(printf)("   cline: %'10lu normalises\n",
6302                  stats__cline_normalises );
6303      VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6304                  stats__cline_cread64s,
6305                  stats__cline_cread32s,
6306                  stats__cline_cread16s,
6307                  stats__cline_cread08s );
6308      VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6309                  stats__cline_cwrite64s,
6310                  stats__cline_cwrite32s,
6311                  stats__cline_cwrite16s,
6312                  stats__cline_cwrite08s );
6313      VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6314                  stats__cline_swrite64s,
6315                  stats__cline_swrite32s,
6316                  stats__cline_swrite16s,
6317                  stats__cline_swrite08s );
6318      VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
6319                  stats__cline_sread08s, stats__cline_scopy08s );
6320      VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
6321                 stats__cline_64to32splits,
6322                 stats__cline_32to16splits,
6323                 stats__cline_16to8splits );
6324      VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
6325                 stats__cline_64to32pulldown,
6326                 stats__cline_32to16pulldown,
6327                 stats__cline_16to8pulldown );
6328      if (0)
6329      VG_(printf)("   cline: sizeof(CacheLineZ) %ld, covers %ld bytes of arange\n",
6330                  (Word)sizeof(LineZ), (Word)N_LINE_ARANGE);
6331
6332      VG_(printf)("%s","\n");
6333
6334      VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
6335                  stats__msmcread, stats__msmcread_change);
6336      VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
6337                  stats__msmcwrite, stats__msmcwrite_change);
6338      VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
6339                  stats__cmpLEQ_queries, stats__cmpLEQ_misses);
6340      VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
6341                  stats__join2_queries, stats__join2_misses);
6342
6343      VG_(printf)("%s","\n");
6344      VG_(printf)( "   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
6345                   stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
6346      VG_(printf)( "   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
6347                   stats__vts__cmp_structural, stats__vts__cmp_structural_slow );
6348      VG_(printf)( "   libhb: VTSset: find__or__clone_and_add %'lu (%'lu allocd)\n",
6349                   stats__vts_set__focaa, stats__vts_set__focaa_a );
6350      VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
6351                   stats__vts__indexat_slow );
6352
6353      VG_(printf)("%s","\n");
6354      VG_(printf)(
6355         "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
6356         VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
6357      );
6358      VG_(printf)( "   libhb: %lu entries in vts_set\n",
6359                   VG_(sizeFM)( vts_set ) );
6360
6361      VG_(printf)("%s","\n");
6362      VG_(printf)( "   libhb: ctxt__rcdec: 1=%lu(%lu eq), 2=%lu, 3=%lu\n",
6363                   stats__ctxt_rcdec1, stats__ctxt_rcdec1_eq,
6364                   stats__ctxt_rcdec2,
6365                   stats__ctxt_rcdec3 );
6366      VG_(printf)( "   libhb: ctxt__rcdec: calls %lu, discards %lu\n",
6367                   stats__ctxt_rcdec_calls, stats__ctxt_rcdec_discards);
6368      VG_(printf)( "   libhb: contextTab: %lu slots, %lu max ents\n",
6369                   (UWord)N_RCEC_TAB,
6370                   stats__ctxt_tab_curr );
6371      VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
6372                   stats__ctxt_tab_qs,
6373                   stats__ctxt_tab_cmps );
6374#if 0
6375      VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
6376      VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
6377      VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
6378      VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
6379      VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
6380      VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
6381      VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
6382      VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
6383      VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
6384      VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
6385      VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
6386      VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
6387      VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
6388      VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));
6389
6390      VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
6391      VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
6392      VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
6393      VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
6394#endif
6395
6396      VG_(printf)("%s","<<< END libhb stats >>>\n");
6397      VG_(printf)("%s","\n");
6398
6399   }
6400}
6401
6402/* Receive notification that a thread has low-level exited.  The
6403   significance here is that we do not expect to see any more memory
6404   references from it. */
6405void libhb_async_exit ( Thr* thr )
6406{
6407   tl_assert(thr);
6408   tl_assert(!thr->llexit_done);
6409   thr->llexit_done = True;
6410
6411   /* free up Filter and local_Kws_n_stacks (well, actually not the
6412      latter ..) */
6413   tl_assert(thr->filter);
6414   HG_(free)(thr->filter);
6415   thr->filter = NULL;
6416
6417   /* Tell the VTS mechanism this thread has exited, so it can
6418      participate in VTS pruning.  Note this can only happen if the
6419      thread has both ll_exited and been joined with. */
6420   if (thr->joinedwith_done)
6421      VTS__declare_thread_very_dead(thr);
6422
6423   /* Another space-accuracy tradeoff.  Do we want to be able to show
6424      H1 history for conflicts in threads which have since exited?  If
6425      yes, then we had better not free up thr->local_Kws_n_stacks.  The
6426      downside is a potential per-thread leak of up to
6427      N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
6428      XArray average overcommit factor is (1.5 I'd guess). */
6429   // hence:
6430   // VG_(deleteXA)(thr->local_Kws_n_stacks);
6431   // thr->local_Kws_n_stacks = NULL;
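   /* For a rough sense of scale (illustrative figures only): with
      16-byte entries, a few tens of thousands of them per thread, and
      a 1.5x overcommit, that is on the order of a megabyte retained
      per exited thread. */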
6432}
6433
6434/* Receive notification that a thread has been joined with.  The
6435   significance here is that we do not expect to see any further
6436   references to its vector clocks (Thr::viR and Thr::viW). */
6437void libhb_joinedwith_done ( Thr* thr )
6438{
6439   tl_assert(thr);
6440   /* Caller must ensure that this is only ever called once per Thr. */
6441   tl_assert(!thr->joinedwith_done);
6442   thr->joinedwith_done = True;
6443   if (thr->llexit_done)
6444      VTS__declare_thread_very_dead(thr);
6445}
6446
6447
6448/* Both Segs and SOs point to VTSs.  However, there is no sharing, so
6449   a Seg that points at a VTS is its one-and-only owner, and ditto for
6450   an SO that points at a VTS. */
6451
6452SO* libhb_so_alloc ( void )
6453{
6454   return SO__Alloc();
6455}
6456
6457void libhb_so_dealloc ( SO* so )
6458{
6459   tl_assert(so);
6460   tl_assert(so->magic == SO_MAGIC);
6461   SO__Dealloc(so);
6462}
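
/* Usage sketch (hypothetical, for illustration only; guarded out):
   one common pattern is to attach an SO to a user-level
   synchronisation object, send on the releasing side, recv on the
   acquiring side, and dealloc when the object is destroyed. */
#if 0
static void example_on_unlock ( Thr* thr, SO* so )
{
   /* publish the unlocking thread's clocks into the SO */
   libhb_so_send( thr, so, True/*strong_send*/ );
}
static void example_on_lock ( Thr* thr, SO* so )
{
   /* import the SO's clocks into the locking thread */
   libhb_so_recv( thr, so, True/*strong_recv*/ );
}
#endif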
6463
6464/* See comments in libhb.h for details on the meaning of
6465   strong vs weak sends and strong vs weak receives. */
6466void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
6467{
6468   /* Copy the VTSs from 'thr' into the sync object, and then move
6469      the thread along one step. */
6470
6471   tl_assert(so);
6472   tl_assert(so->magic == SO_MAGIC);
6473
6474   /* stay sane .. a thread's read-clock must always lead or be the
6475      same as its write-clock */
6476   { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
6477     tl_assert(leq);
6478   }
6479
6480   /* since we're overwriting the VtsIDs in the SO, we need to drop
6481      any references made by the previous contents thereof */
6482   if (so->viR == VtsID_INVALID) {
6483      tl_assert(so->viW == VtsID_INVALID);
6484      so->viR = thr->viR;
6485      so->viW = thr->viW;
6486      VtsID__rcinc(so->viR);
6487      VtsID__rcinc(so->viW);
6488   } else {
6489      /* In a strong send, we dump any previous VC in the SO and
6490         install the sending thread's VC instead.  For a weak send we
6491         must join2 with what's already there. */
6492      tl_assert(so->viW != VtsID_INVALID);
6493      VtsID__rcdec(so->viR);
6494      VtsID__rcdec(so->viW);
6495      so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
6496      so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
6497      VtsID__rcinc(so->viR);
6498      VtsID__rcinc(so->viW);
6499   }
6500
6501   /* move both parent clocks along */
6502   VtsID__rcdec(thr->viR);
6503   VtsID__rcdec(thr->viW);
6504   thr->viR = VtsID__tick( thr->viR, thr );
6505   thr->viW = VtsID__tick( thr->viW, thr );
6506   if (!thr->llexit_done) {
6507      Filter__clear(thr->filter, "libhb_so_send");
6508      note_local_Kw_n_stack_for(thr);
6509   }
6510   VtsID__rcinc(thr->viR);
6511   VtsID__rcinc(thr->viW);
6512
6513   if (strong_send)
6514      show_thread_state("s-send", thr);
6515   else
6516      show_thread_state("w-send", thr);
6517}
6518
6519void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
6520{
6521   tl_assert(so);
6522   tl_assert(so->magic == SO_MAGIC);
6523
6524   if (so->viR != VtsID_INVALID) {
6525      tl_assert(so->viW != VtsID_INVALID);
6526
6527      /* Weak receive (basically, an R-acquisition of an R-W lock).
6528         This advances the read-clock of the receiver, but not the
6529         write-clock. */
6530      VtsID__rcdec(thr->viR);
6531      thr->viR = VtsID__join2( thr->viR, so->viR );
6532      VtsID__rcinc(thr->viR);
6533
6534      /* At one point (r10589) it seemed safest to tick the clocks for
6535         the receiving thread after the join.  But on reflection, I
6536         wonder if that might cause it to 'overtake' constraints,
6537         which could lead to missing races.  So, back out that part of
6538         r10589. */
6539      //VtsID__rcdec(thr->viR);
6540      //thr->viR = VtsID__tick( thr->viR, thr );
6541      //VtsID__rcinc(thr->viR);
6542
6543      /* For a strong receive, we also advance the receiver's write
6544         clock, which means the receive as a whole is essentially
6545         equivalent to a W-acquisition of an R-W lock. */
6546      if (strong_recv) {
6547         VtsID__rcdec(thr->viW);
6548         thr->viW = VtsID__join2( thr->viW, so->viW );
6549         VtsID__rcinc(thr->viW);
6550
6551         /* See comment just above, re r10589. */
6552         //VtsID__rcdec(thr->viW);
6553         //thr->viW = VtsID__tick( thr->viW, thr );
6554         //VtsID__rcinc(thr->viW);
6555      }
6556
6557      if (thr->filter)
6558         Filter__clear(thr->filter, "libhb_so_recv");
6559      note_local_Kw_n_stack_for(thr);
6560
6561      if (strong_recv)
6562         show_thread_state("s-recv", thr);
6563      else
6564         show_thread_state("w-recv", thr);
6565
6566   } else {
6567      tl_assert(so->viW == VtsID_INVALID);
6568      /* Deal with degenerate case: 'so' has no vts, so there has been
6569         no message posted to it.  Just ignore this case. */
6570      show_thread_state("d-recv", thr);
6571   }
6572}
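
/* Worked example for the recv path (illustrative numbers only): if
   the receiver's read clock is { T1:5, T2:2 } and the SO carries
   { T1:3, T2:7 }, the join leaves the receiver at { T1:5, T2:7 },
   i.e. the per-thread maximum, so the receiver now happens-after the
   corresponding send. */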
6573
6574Bool libhb_so_everSent ( SO* so )
6575{
6576   if (so->viR == VtsID_INVALID) {
6577      tl_assert(so->viW == VtsID_INVALID);
6578      return False;
6579   } else {
6580      tl_assert(so->viW != VtsID_INVALID);
6581      return True;
6582   }
6583}
6584
6585#define XXX1 0 // 0x67a106c
6586#define XXX2 0
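
/* Debug aid: set XXX1 and/or XXX2 to an address of interest so that
   TRACEME fires for ranges covering it; also change the '0 &&' guards
   in the libhb_srange_* functions below to '1 &&' to get the
   before/after traces printed. */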
6587
6588static inline Bool TRACEME(Addr a, SizeT szB) {
6589   if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
6590   if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
6591   return False;
6592}
6593static void trace ( Thr* thr, Addr a, SizeT szB, HChar* s ) {
6594  SVal sv = zsm_sread08(a);
6595  VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
6596  show_thread_state("", thr);
6597  VG_(printf)("%s","\n");
6598}
6599
6600void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
6601{
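   /* Mark the range as freshly allocated by 'thr': both halves of the
      new shadow value are the thread's current write clock, and any
      stale filter entries covering the range are discarded. */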
6602   SVal sv = SVal__mkC(thr->viW, thr->viW);
6603   tl_assert(is_sane_SVal_C(sv));
6604   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
6605   zsm_sset_range( a, szB, sv );
6606   Filter__clear_range( thr->filter, a, szB );
6607   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
6608}
6609
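/* "NoFX" variant: deliberately leaves the shadow state untouched
   (cheap), in contrast to the "AHAE" variant below, which really does
   mark the range NoAccess (expensive). */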
6610void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
6611{
6612   /* do nothing */
6613}
6614
6615void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
6616{
6617   /* This really does put the requested range in NoAccess.  It's
6618      expensive though. */
6619   SVal sv = SVal_NOACCESS;
6620   tl_assert(is_sane_SVal_C(sv));
6621   zsm_sset_range( a, szB, sv );
6622   Filter__clear_range( thr->filter, a, szB );
6623}
6624
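/* Stop tracking the range.  As currently implemented this is the same
   operation as libhb_srange_noaccess_AHAE: the whole range is set to
   SVal_NOACCESS and the thread's filter is flushed for that range. */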
6625void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
6626{
6627   SVal sv = SVal_NOACCESS;
6628   tl_assert(is_sane_SVal_C(sv));
6629   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
6630   zsm_sset_range( a, szB, sv );
6631   Filter__clear_range( thr->filter, a, szB );
6632   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
6633}
6634
6635Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
6636   tl_assert(thr);
6637   return thr->hgthread;
6638}
6639
6640void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
6641   tl_assert(thr);
6642   thr->hgthread = hgthread;
6643}
6644
6645void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
6646{
6647   zsm_scopy_range(src, dst, len);
6648   Filter__clear_range( thr->filter, dst, len );
6649}
6650
6651void libhb_maybe_GC ( void )
6652{
6653   event_map_maybe_GC();
6654   /* If there are still freelist entries available, no need for a
6655      GC. */
6656   if (vts_tab_freelist != VtsID_INVALID)
6657      return;
6658   /* So all the table entries are full, and we're having to expand
6659      the table.  But did we hit the threshold point yet? */
6660   if (VG_(sizeXA)( vts_tab ) < vts_next_GC_at)
6661      return;
6662   vts_tab__do_GC( False/*don't show stats*/ );
6663}
6664
6665
6666/////////////////////////////////////////////////////////////////
6667/////////////////////////////////////////////////////////////////
6668//                                                             //
6669// SECTION END main library                                    //
6670//                                                             //
6671/////////////////////////////////////////////////////////////////
6672/////////////////////////////////////////////////////////////////
6673
6674/*--------------------------------------------------------------------*/
6675/*--- end                                             libhb_main.c ---*/
6676/*--------------------------------------------------------------------*/
6677