1// Copyright (c) 2008, Google Inc. 2// All rights reserved. 3// 4// Redistribution and use in source and binary forms, with or without 5// modification, are permitted provided that the following conditions are 6// met: 7// 8// * Redistributions of source code must retain the above copyright 9// notice, this list of conditions and the following disclaimer. 10// * Redistributions in binary form must reproduce the above 11// copyright notice, this list of conditions and the following disclaimer 12// in the documentation and/or other materials provided with the 13// distribution. 14// * Neither the name of Google Inc. nor the names of its 15// contributors may be used to endorse or promote products derived from 16// this software without specific prior written permission. 17// 18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30// --- 31// Author: Sanjay Ghemawat <opensource@google.com> 32 33#ifndef TCMALLOC_THREAD_CACHE_H_ 34#define TCMALLOC_THREAD_CACHE_H_ 35 36#include <config.h> 37#ifdef HAVE_PTHREAD 38#include <pthread.h> // for pthread_t, pthread_key_t 39#endif 40#include <stddef.h> // for size_t, NULL 41#ifdef HAVE_STDINT_H 42#include <stdint.h> // for uint32_t, uint64_t 43#endif 44#include <sys/types.h> // for ssize_t 45#include "common.h" 46#include "linked_list.h" 47#include "maybe_threads.h" 48#include "page_heap_allocator.h" 49#include "sampler.h" 50#include "static_vars.h" 51 52#include "common.h" // for SizeMap, kMaxSize, etc 53#include "internal_logging.h" // for ASSERT, etc 54#include "linked_list.h" // for SLL_Pop, SLL_PopRange, etc 55#include "page_heap_allocator.h" // for PageHeapAllocator 56#include "sampler.h" // for Sampler 57#include "static_vars.h" // for Static 58 59namespace tcmalloc { 60 61// Even if we have support for thread-local storage in the compiler 62// and linker, the OS may not support it. We need to check that at 63// runtime. Right now, we have to keep a manual set of "bad" OSes. 64#if defined(HAVE_TLS) 65extern bool kernel_supports_tls; // defined in thread_cache.cc 66void CheckIfKernelSupportsTLS(); 67inline bool KernelSupportsTLS() { 68 return kernel_supports_tls; 69} 70#endif // HAVE_TLS 71 72//------------------------------------------------------------------- 73// Data kept per thread 74//------------------------------------------------------------------- 75 76class ThreadCache { 77 public: 78 // All ThreadCache objects are kept in a linked list (for stats collection) 79 ThreadCache* next_; 80 ThreadCache* prev_; 81 82 void Init(pthread_t tid); 83 void Cleanup(); 84 85 // Accessors (mostly just for printing stats) 86 int freelist_length(size_t cl) const { return list_[cl].length(); } 87 88 // Total byte size in cache 89 size_t Size() const { return size_; } 90 91 // Allocate an object of the given size and class. The size given 92 // must be the same as the size of the class in the size map. 93 void* Allocate(size_t size, size_t cl); 94 void Deallocate(void* ptr, size_t size_class); 95 96 void Scavenge(); 97 98 int GetSamplePeriod(); 99 100 // Record allocation of "k" bytes. Return true iff allocation 101 // should be sampled 102 bool SampleAllocation(size_t k); 103 104 static void InitModule(); 105 static void InitTSD(); 106 static ThreadCache* GetThreadHeap(); 107 static ThreadCache* GetCache(); 108 static ThreadCache* GetCacheIfPresent(); 109 static ThreadCache* CreateCacheIfNecessary(); 110 static void BecomeIdle(); 111 112 // Return the number of thread heaps in use. 113 static inline int HeapsInUse(); 114 115 // Writes to total_bytes the total number of bytes used by all thread heaps. 116 // class_count must be an array of size kNumClasses. Writes the number of 117 // items on the corresponding freelist. class_count may be NULL. 118 // The storage of both parameters must be zero intialized. 119 // REQUIRES: Static::pageheap_lock is held. 120 static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count); 121 122 // Sets the total thread cache size to new_size, recomputing the 123 // individual thread cache sizes as necessary. 124 // REQUIRES: Static::pageheap lock is held. 125 static void set_overall_thread_cache_size(size_t new_size); 126 static size_t overall_thread_cache_size() { 127 return overall_thread_cache_size_; 128 } 129 130 private: 131 class FreeList { 132 private: 133 void* list_; // Linked list of nodes 134 135#ifdef _LP64 136 // On 64-bit hardware, manipulating 16-bit values may be slightly slow. 137 uint32_t length_; // Current length. 138 uint32_t lowater_; // Low water mark for list length. 139 uint32_t max_length_; // Dynamic max list length based on usage. 140 // Tracks the number of times a deallocation has caused 141 // length_ > max_length_. After the kMaxOverages'th time, max_length_ 142 // shrinks and length_overages_ is reset to zero. 143 uint32_t length_overages_; 144#else 145 // If we aren't using 64-bit pointers then pack these into less space. 146 uint16_t length_; 147 uint16_t lowater_; 148 uint16_t max_length_; 149 uint16_t length_overages_; 150#endif 151 152 public: 153 void Init() { 154 list_ = NULL; 155 length_ = 0; 156 lowater_ = 0; 157 max_length_ = 1; 158 length_overages_ = 0; 159 } 160 161 // Return current length of list 162 size_t length() const { 163 return length_; 164 } 165 166 // Return the maximum length of the list. 167 size_t max_length() const { 168 return max_length_; 169 } 170 171 // Set the maximum length of the list. If 'new_max' > length(), the 172 // client is responsible for removing objects from the list. 173 void set_max_length(size_t new_max) { 174 max_length_ = new_max; 175 } 176 177 // Return the number of times that length() has gone over max_length(). 178 size_t length_overages() const { 179 return length_overages_; 180 } 181 182 void set_length_overages(size_t new_count) { 183 length_overages_ = new_count; 184 } 185 186 // Is list empty? 187 bool empty() const { 188 return list_ == NULL; 189 } 190 191 // Low-water mark management 192 int lowwatermark() const { return lowater_; } 193 void clear_lowwatermark() { lowater_ = length_; } 194 195 void Push(void* ptr) { 196 SLL_Push(&list_, ptr); 197 length_++; 198 } 199 200 void* Pop() { 201 ASSERT(list_ != NULL); 202 length_--; 203 if (length_ < lowater_) lowater_ = length_; 204 return SLL_Pop(&list_); 205 } 206 207 void* Next() { 208 return SLL_Next(&list_); 209 } 210 211 void PushRange(int N, void *start, void *end) { 212 SLL_PushRange(&list_, start, end); 213 length_ += N; 214 } 215 216 void PopRange(int N, void **start, void **end) { 217 SLL_PopRange(&list_, N, start, end); 218 ASSERT(length_ >= N); 219 length_ -= N; 220 if (length_ < lowater_) lowater_ = length_; 221 } 222 }; 223 224 // Gets and returns an object from the central cache, and, if possible, 225 // also adds some objects of that size class to this thread cache. 226 void* FetchFromCentralCache(size_t cl, size_t byte_size); 227 228 // Releases some number of items from src. Adjusts the list's max_length 229 // to eventually converge on num_objects_to_move(cl). 230 void ListTooLong(FreeList* src, size_t cl); 231 232 // Releases N items from this thread cache. 233 void ReleaseToCentralCache(FreeList* src, size_t cl, int N); 234 235 // Increase max_size_ by reducing unclaimed_cache_space_ or by 236 // reducing the max_size_ of some other thread. In both cases, 237 // the delta is kStealAmount. 238 void IncreaseCacheLimit(); 239 // Same as above but requires Static::pageheap_lock() is held. 240 void IncreaseCacheLimitLocked(); 241 242 // If TLS is available, we also store a copy of the per-thread object 243 // in a __thread variable since __thread variables are faster to read 244 // than pthread_getspecific(). We still need pthread_setspecific() 245 // because __thread variables provide no way to run cleanup code when 246 // a thread is destroyed. 247 // We also give a hint to the compiler to use the "initial exec" TLS 248 // model. This is faster than the default TLS model, at the cost that 249 // you cannot dlopen this library. (To see the difference, look at 250 // the CPU use of __tls_get_addr with and without this attribute.) 251 // Since we don't really use dlopen in google code -- and using dlopen 252 // on a malloc replacement is asking for trouble in any case -- that's 253 // a good tradeoff for us. 254#ifdef HAVE_TLS 255 static __thread ThreadCache* threadlocal_heap_ 256# ifdef HAVE___ATTRIBUTE__ 257 __attribute__ ((tls_model ("initial-exec"))) 258# endif 259 ; 260#endif 261 262 // Thread-specific key. Initialization here is somewhat tricky 263 // because some Linux startup code invokes malloc() before it 264 // is in a good enough state to handle pthread_keycreate(). 265 // Therefore, we use TSD keys only after tsd_inited is set to true. 266 // Until then, we use a slow path to get the heap object. 267 static bool tsd_inited_; 268 static pthread_key_t heap_key_; 269 270 // Linked list of heap objects. Protected by Static::pageheap_lock. 271 static ThreadCache* thread_heaps_; 272 static int thread_heap_count_; 273 274 // A pointer to one of the objects in thread_heaps_. Represents 275 // the next ThreadCache from which a thread over its max_size_ should 276 // steal memory limit. Round-robin through all of the objects in 277 // thread_heaps_. Protected by Static::pageheap_lock. 278 static ThreadCache* next_memory_steal_; 279 280 // Overall thread cache size. Protected by Static::pageheap_lock. 281 static size_t overall_thread_cache_size_; 282 283 // Global per-thread cache size. Writes are protected by 284 // Static::pageheap_lock. Reads are done without any locking, which should be 285 // fine as long as size_t can be written atomically and we don't place 286 // invariants between this variable and other pieces of state. 287 static volatile size_t per_thread_cache_size_; 288 289 // Represents overall_thread_cache_size_ minus the sum of max_size_ 290 // across all ThreadCaches. Protected by Static::pageheap_lock. 291 static ssize_t unclaimed_cache_space_; 292 293 // This class is laid out with the most frequently used fields 294 // first so that hot elements are placed on the same cache line. 295 296 size_t size_; // Combined size of data 297 size_t max_size_; // size_ > max_size_ --> Scavenge() 298 299 // We sample allocations, biased by the size of the allocation 300 Sampler sampler_; // A sampler 301 302 FreeList list_[kNumClasses]; // Array indexed by size-class 303 304 pthread_t tid_; // Which thread owns it 305 bool in_setspecific_; // In call to pthread_setspecific? 306 307 // Allocate a new heap. REQUIRES: Static::pageheap_lock is held. 308 static ThreadCache* NewHeap(pthread_t tid); 309 310 // Use only as pthread thread-specific destructor function. 311 static void DestroyThreadCache(void* ptr); 312 313 static void DeleteCache(ThreadCache* heap); 314 static void RecomputePerThreadCacheSize(); 315 316 // Ensure that this class is cacheline-aligned. This is critical for 317 // performance, as false sharing would negate many of the benefits 318 // of a per-thread cache. 319} CACHELINE_ALIGNED; 320 321// Allocator for thread heaps 322// This is logically part of the ThreadCache class, but MSVC, at 323// least, does not like using ThreadCache as a template argument 324// before the class is fully defined. So we put it outside the class. 325extern PageHeapAllocator<ThreadCache> threadcache_allocator; 326 327inline int ThreadCache::HeapsInUse() { 328 return threadcache_allocator.inuse(); 329} 330 331inline bool ThreadCache::SampleAllocation(size_t k) { 332 return sampler_.SampleAllocation(k); 333} 334 335inline void* ThreadCache::Allocate(size_t size, size_t cl) { 336 ASSERT(size <= kMaxSize); 337 ASSERT(size == Static::sizemap()->ByteSizeForClass(cl)); 338 339 FreeList* list = &list_[cl]; 340 if (list->empty()) { 341 return FetchFromCentralCache(cl, size); 342 } 343 size_ -= size; 344 return list->Pop(); 345} 346 347inline void ThreadCache::Deallocate(void* ptr, size_t cl) { 348 FreeList* list = &list_[cl]; 349 size_ += Static::sizemap()->ByteSizeForClass(cl); 350 ssize_t size_headroom = max_size_ - size_ - 1; 351 352 // This catches back-to-back frees of allocs in the same size 353 // class. A more comprehensive (and expensive) test would be to walk 354 // the entire freelist. But this might be enough to find some bugs. 355 ASSERT(ptr != list->Next()); 356 357 list->Push(ptr); 358 ssize_t list_headroom = 359 static_cast<ssize_t>(list->max_length()) - list->length(); 360 361 // There are two relatively uncommon things that require further work. 362 // In the common case we're done, and in that case we need a single branch 363 // because of the bitwise-or trick that follows. 364 if ((list_headroom | size_headroom) < 0) { 365 if (list_headroom < 0) { 366 ListTooLong(list, cl); 367 } 368 if (size_ >= max_size_) Scavenge(); 369 } 370} 371 372inline ThreadCache* ThreadCache::GetThreadHeap() { 373#ifdef HAVE_TLS 374 // __thread is faster, but only when the kernel supports it 375 if (KernelSupportsTLS()) 376 return threadlocal_heap_; 377#endif 378 return reinterpret_cast<ThreadCache *>( 379 perftools_pthread_getspecific(heap_key_)); 380} 381 382inline ThreadCache* ThreadCache::GetCache() { 383 ThreadCache* ptr = NULL; 384 if (!tsd_inited_) { 385 InitModule(); 386 } else { 387 ptr = GetThreadHeap(); 388 } 389 if (ptr == NULL) ptr = CreateCacheIfNecessary(); 390 return ptr; 391} 392 393// In deletion paths, we do not try to create a thread-cache. This is 394// because we may be in the thread destruction code and may have 395// already cleaned up the cache for this thread. 396inline ThreadCache* ThreadCache::GetCacheIfPresent() { 397 if (!tsd_inited_) return NULL; 398 return GetThreadHeap(); 399} 400 401} // namespace tcmalloc 402 403#endif // TCMALLOC_THREAD_CACHE_H_ 404