// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Sanjay Ghemawat <opensource@google.com>

#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>                    // for pthread_t, pthread_key_t
#endif
#include <stddef.h>                     // for size_t, NULL
#ifdef HAVE_STDINT_H
#include <stdint.h>                     // for uint32_t, uint64_t
#endif
#include <sys/types.h>                  // for ssize_t
45#include "common.h"
46#include "linked_list.h"
47#include "maybe_threads.h"
48#include "page_heap_allocator.h"
49#include "sampler.h"
50#include "static_vars.h"
51
52#include "common.h"            // for SizeMap, kMaxSize, etc
53#include "internal_logging.h"  // for ASSERT, etc
54#include "linked_list.h"       // for SLL_Pop, SLL_PopRange, etc
55#include "page_heap_allocator.h"  // for PageHeapAllocator
56#include "sampler.h"           // for Sampler
57#include "static_vars.h"       // for Static

namespace tcmalloc {

// Even if we have support for thread-local storage in the compiler
// and linker, the OS may not support it.  We need to check that at
// runtime.  Right now, we have to keep a manual set of "bad" OSes.
#if defined(HAVE_TLS)
extern bool kernel_supports_tls;   // defined in thread_cache.cc
void CheckIfKernelSupportsTLS();
inline bool KernelSupportsTLS() {
  return kernel_supports_tls;
}
#endif    // HAVE_TLS

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size and class.  The size given
  // must be the same as the size of the class in the size map.
  void* Allocate(size_t size, size_t cl);
  void Deallocate(void* ptr, size_t size_class);

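  // Release to the central cache objects that this cache has not used
  // recently (based on each free list's low-water mark).  Called when
  // size_ exceeds max_size_.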
  void Scavenge();

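  // Return the sampling period of the underlying Sampler.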
  int GetSamplePeriod();

  // Record allocation of "k" bytes.  Return true iff allocation
  // should be sampled.
  bool SampleAllocation(size_t k);

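  // One-time module and TSD initialization plus per-thread cache lookup.
  // GetCache() creates this thread's cache on first use; GetCacheIfPresent()
  // never creates one (see the inline definitions below).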
  static void         InitModule();
  static void         InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void         BecomeIdle();

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Writes to total_bytes the total number of bytes used by all thread
  // heaps.  If class_count is non-NULL, it must be an array of size
  // kNumClasses, and the number of items on each size class's freelist
  // is accumulated into the corresponding entry.  The storage for both
  // parameters must be zero initialized.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
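  // A singly-linked free list of objects for one size class, together
  // with the bookkeeping (length, low-water mark, dynamic max length)
  // used to decide when to return objects to the central cache.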
  class FreeList {
   private:
    void*    list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }

    // Set the maximum length of the list.  If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

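    // Push an object onto the front of the list.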
    void Push(void* ptr) {
      SLL_Push(&list_, ptr);
      length_++;
    }

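    // Pop an object off the front of the list, updating the low-water
    // mark.  REQUIRES: !empty().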
    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return SLL_Pop(&list_);
    }

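    // Return the first object on the list without removing it
    // (NULL if the list is empty).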
    void* Next() {
      return SLL_Next(&list_);
    }

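    // Push the chain of N objects [start, ..., end] onto the front of
    // the list.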
    void PushRange(int N, void *start, void *end) {
      SLL_PushRange(&list_, start, end);
      length_ += N;
    }

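    // Pop the first N objects off the list into the chain
    // [*start, ..., *end], updating the low-water mark.
    // REQUIRES: length() >= N.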
    void PopRange(int N, void **start, void **end) {
      SLL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
  };

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src.  Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread.  In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above, but requires that Static::pageheap_lock is held.
  void IncreaseCacheLimitLocked();

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific().  We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model.  This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library.  (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE_TLS
  static __thread ThreadCache* threadlocal_heap_
# ifdef HAVE___ATTRIBUTE__
   __attribute__ ((tls_model ("initial-exec")))
# endif
   ;
#endif

  // Thread-specific key.  Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects.  Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_.  Represents
  // the next ThreadCache from which a thread over its max_size_ should
  // steal cache space.  Round-robins through all of the objects in
  // thread_heaps_.  Protected by Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size.  Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size.  Writes are protected by
  // Static::pageheap_lock.  Reads are done without any locking, which should
  // be fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches.  Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t        size_;                  // Combined size of data
  size_t        max_size_;              // size_ > max_size_ --> Scavenge()

  // We sample allocations, biased by the size of the allocation
  Sampler       sampler_;               // A sampler

  FreeList      list_[kNumClasses];     // Array indexed by size-class

  pthread_t     tid_;                   // Which thread owns it
  bool          in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as a pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned. This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;

// Allocator for thread heaps.
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined.  So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
  return sampler_.SampleAllocation(k);
}

inline void* ThreadCache::Allocate(size_t size, size_t cl) {
  ASSERT(size <= kMaxSize);
  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

  FreeList* list = &list_[cl];
  if (list->empty()) {
    return FetchFromCentralCache(cl, size);
  }
  size_ -= size;
  return list->Pop();
}


inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  ssize_t size_headroom = max_size_ - size_ - 1;

  // This catches back-to-back frees of allocs in the same size
  // class. A more comprehensive (and expensive) test would be to walk
  // the entire freelist. But this might be enough to find some bugs.
  ASSERT(ptr != list->Next());

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // There are two relatively uncommon cases that require further work.
  // In the common case we're done; the bitwise-or trick below lets us
  // test for both cases with a single branch.
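  // (Both headrooms are non-negative on the common path, so their
  // bitwise OR has the sign bit clear; if either is negative, the OR
  // is negative and we take the slow path.)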
  if ((list_headroom | size_headroom) < 0) {
    if (list_headroom < 0) {
      ListTooLong(list, cl);
    }
    if (size_ >= max_size_) Scavenge();
  }
}

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  // __thread is faster, but only when the kernel supports it
  if (KernelSupportsTLS())
    return threadlocal_heap_;
#endif
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}

// In deletion paths, we do not try to create a thread-cache.  This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
  if (!tsd_inited_) return NULL;
  return GetThreadHeap();
}
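
// A minimal usage sketch (illustrative only; the real call sites live in
// the malloc entry points, e.g. tcmalloc.cc).  Assumes the fast path has
// already mapped the request size to a size class via SizeMap:
//
//   size_t cl = Static::sizemap()->SizeClass(size);
//   size_t alloc_size = Static::sizemap()->ByteSizeForClass(cl);
//   void* ptr = ThreadCache::GetCache()->Allocate(alloc_size, cl);
//   ...
//   ThreadCache* heap = ThreadCache::GetCacheIfPresent();
//   if (heap != NULL) heap->Deallocate(ptr, cl);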

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_