1//===-- working_set.cpp ---------------------------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file is a part of EfficiencySanitizer, a family of performance tuners.
11//
12// This file contains working-set-specific code.
13//===----------------------------------------------------------------------===//
14
15#include "working_set.h"
16#include "esan.h"
17#include "esan_circular_buffer.h"
18#include "esan_flags.h"
19#include "esan_shadow.h"
20#include "esan_sideline.h"
21#include "sanitizer_common/sanitizer_procmaps.h"
22
// We shadow every cache line of app memory with one shadow byte.
// - The highest bit of each shadow byte indicates whether the corresponding
//   cache line has ever been accessed.
// - The lowest bit of each shadow byte indicates whether the corresponding
//   cache line was accessed since the last sample.
// - The other bits are used for working set snapshots at successively
//   lower frequencies, each bit to the left from the lowest bit stepping
//   down the frequency by 2 to the power of getFlags()->snapshot_step.
// Thus, for a snapshot_step of 2, we have something like this:
//   Bit 0: Since last sample
//   Bit 1: Since last 2^2 samples
//   Bit 2: Since last 2^4 samples
//   Bit 3: ...
//   Bit 7: Ever accessed.
// We live with races in accessing each shadow byte.
//
// The type of a single shadow byte, laid out as described above.
typedef unsigned char byte;
39
40namespace __esan {
41
// Our shadow memory assumes that the line size is 64.
// initializeShadowWorkingSet() CHECKs the cache_line_size flag against this
// value.
static const u32 CacheLineSize = 64;

// See the shadow byte layout description above.
// Index of the top bit of a shadow byte ("ever accessed").
static const u32 TotalWorkingSetBitIdx = 7;
// We accumulate to the left until we hit this bit.
// We don't need to accumulate to the final bit as it's set on each ref
// by the compiler instrumentation.
static const u32 MaxAccumBitIdx = 6;
// Index of the lowest bit of a shadow byte ("accessed since the last sample").
static const u32 CurWorkingSetBitIdx = 0;
// The value or-ed into a shadow byte when its cache line is referenced: the
// "ever accessed" bit together with the "since last sample" bit.
static const byte ShadowAccessedVal =
  (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

// The sideline thread on which takeSample() is run (see
// initializeWorkingSet()).
static SidelineThread Thread;
// Number of samples taken so far; incremented at the start of each
// takeSample() call.
// If we use real-time-based timer samples this won't overflow in any realistic
// scenario, but if we switch to some other unit (such as memory accesses) we
// may want to consider a 64-bit int.
static u32 SnapshotNum;

// We store the wset size for each of 8 different sampling frequencies.
static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
// We cannot use static objects as the global destructor is called
// prior to our finalize routine.
// These are each circular buffers, sized up front.
// NOTE(review): takeSample() only pushes to indices 0..MaxAccumBitIdx, so the
// buffer at index TotalWorkingSetBitIdx appears to stay empty -- confirm that
// this is intended.
CircularBuffer<u32> SizePerFreq[NumFreq];
// We cannot rely on static initializers (they may run too late) but
// we record the size here for clarity:
// Entry i is the size passed to SizePerFreq[i].initialize() by
// initializeWorkingSet().
u32 CircularBufferSizes[NumFreq] = {
  // These are each mmap-ed so our minimum is one page.
  32*1024,
  16*1024,
  8*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
};
80
81void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
82                                  bool IsWrite) {
83  if (Size == 0)
84    return;
85  SIZE_T I = 0;
86  uptr LineSize = getFlags()->cache_line_size;
87  // As Addr+Size could overflow at the top of a 32-bit address space,
88  // we avoid the simpler formula that rounds the start and end.
89  SIZE_T NumLines = Size / LineSize +
90    // Add any extra at the start or end adding on an extra line:
91    (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
92  byte *Shadow = (byte *)appToShadow(Addr);
93  // Write shadow bytes until we're word-aligned.
94  while (I < NumLines && (uptr)Shadow % 4 != 0) {
95    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
96      *Shadow |= ShadowAccessedVal;
97    ++Shadow;
98    ++I;
99  }
100  // Write whole shadow words at a time.
101  // Using a word-stride loop improves the runtime of a microbenchmark of
102  // memset calls by 10%.
103  u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
104    ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
105  while (I + 4 <= NumLines) {
106    if ((*(u32*)Shadow & WordValue) != WordValue)
107      *(u32*)Shadow |= WordValue;
108    Shadow += 4;
109    I += 4;
110  }
111  // Write any trailing shadow bytes.
112  while (I < NumLines) {
113    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
114      *Shadow |= ShadowAccessedVal;
115    ++Shadow;
116    ++I;
117  }
118}
119
120// This routine will word-align ShadowStart and ShadowEnd prior to scanning.
121// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
122// measures the access during the entire execution and should never be cleared.
123static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
124                                     uptr ShadowEnd) {
125  u32 WorkingSetSize = 0;
126  u32 ByteValue = 0x1 << BitIdx;
127  u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
128    ByteValue << 24;
129  // Get word aligned start.
130  ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
131  bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
132  // Do not clear the bit that measures access during the entire execution.
133  bool Clear = BitIdx < TotalWorkingSetBitIdx;
134  for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
135    if ((*Ptr & WordValue) != 0) {
136      byte *BytePtr = (byte *)Ptr;
137      for (u32 j = 0; j < sizeof(u32); ++j) {
138        if (BytePtr[j] & ByteValue) {
139          ++WorkingSetSize;
140          if (Accum) {
141            // Accumulate to the lower-frequency bit to the left.
142            BytePtr[j] |= (ByteValue << 1);
143          }
144        }
145      }
146      if (Clear) {
147        // Clear this bit from every shadow byte.
148        *Ptr &= ~WordValue;
149      }
150    }
151  }
152  return WorkingSetSize;
153}
154
155// Scan shadow memory to calculate the number of cache lines being accessed,
156// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
157// We also clear the lowest bits (most recent working set snapshot).
158// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
159// measures the access during the entire execution and should never be cleared.
160static u32 computeWorkingSizeAndReset(u32 BitIdx) {
161  u32 WorkingSetSize = 0;
162  MemoryMappingLayout MemIter(true/*cache*/);
163  uptr Start, End, Prot;
164  while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/,
165                      0/*file size*/, &Prot)) {
166    VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n",
167            __FUNCTION__, Start, End, Prot, isAppMem(Start),
168            isShadowMem(Start));
169    if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) {
170      VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End);
171      WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End);
172    }
173  }
174  return WorkingSetSize;
175}
176
177// This is invoked from a signal handler but in a sideline thread doing nothing
178// else so it is a little less fragile than a typical signal handler.
179static void takeSample(void *Arg) {
180  u32 BitIdx = CurWorkingSetBitIdx;
181  u32 Freq = 1;
182  ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
183  while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
184    u32 NumLines = computeWorkingSizeAndReset(BitIdx);
185    VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
186            SnapshotNum, BitIdx, Freq, NumLines);
187    SizePerFreq[BitIdx].push_back(NumLines);
188    Freq = Freq << getFlags()->snapshot_step;
189    BitIdx++;
190  }
191}
192
// Initialization that must be done before any instrumented code is executed.
// Checks that the cache_line_size flag equals the CacheLineSize that this
// file's shadow layout assumes, then calls registerMemoryFaultHandler().
void initializeShadowWorkingSet() {
  // Any other line size is unsupported by the shadow layout used here.
  CHECK(getFlags()->cache_line_size == CacheLineSize);
  registerMemoryFaultHandler();
}
198
199void initializeWorkingSet() {
200  if (getFlags()->record_snapshots) {
201    for (u32 i = 0; i < NumFreq; ++i)
202      SizePerFreq[i].initialize(CircularBufferSizes[i]);
203    Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
204  }
205}
206
207static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
208  if (MilliSec > 600000) {
209    Unit = "min";
210    return MilliSec / 60000;
211  } else if (MilliSec > 10000) {
212    Unit = "sec";
213    return MilliSec / 1000;
214  } else {
215    Unit = "ms";
216    return MilliSec;
217  }
218}
219
220static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
221  // We need a constant to avoid software divide support:
222  static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
223  static const u32 MegabyteCachelines = KilobyteCachelines << 10;
224
225  if (NumOfCachelines > 10 * MegabyteCachelines) {
226    Unit = "MB";
227    return NumOfCachelines / MegabyteCachelines;
228  } else if (NumOfCachelines > 10 * KilobyteCachelines) {
229    Unit = "KB";
230    return NumOfCachelines / KilobyteCachelines;
231  } else {
232    Unit = "Bytes";
233    return NumOfCachelines * CacheLineSize;
234  }
235}
236
237void reportWorkingSet() {
238  const char *Unit;
239  if (getFlags()->record_snapshots) {
240    u32 Freq = 1;
241    Report(" Total number of samples: %u\n", SnapshotNum);
242    for (u32 i = 0; i < NumFreq; ++i) {
243      u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
244      Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
245      // FIXME: report whether we wrapped around and thus whether we
246      // have data on the whole run or just the last N samples.
247      for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
248        u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
249        Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
250               SizePerFreq[i][j]);
251      }
252      Freq = Freq << getFlags()->snapshot_step;
253    }
254  }
255
256  // Get the working set size for the entire execution.
257  u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
258  u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
259  Report(" %s: the total working set size: %u %s (%u cache lines)\n",
260         SanitizerToolName, Size, Unit, NumOfCachelines);
261}
262
263int finalizeWorkingSet() {
264  if (getFlags()->record_snapshots)
265    Thread.joinThread();
266  reportWorkingSet();
267  if (getFlags()->record_snapshots) {
268    for (u32 i = 0; i < NumFreq; ++i)
269      SizePerFreq[i].free();
270  }
271  return 0;
272}
273
274} // namespace __esan
275