// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"
#include <errno.h>
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for DEFINE_int64, EnvToInt64
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;

DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total number of bytes allocated to "
             "thread caches.  This bound is not strict, so it is possible "
             "for the cache to go over this bound in certain circumstances.  "
             "The value of this flag is capped to 1 GB.");

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache* ThreadCache::threadlocal_heap_
# ifdef HAVE___ATTRIBUTE__
  __attribute__ ((tls_model ("initial-exec")))
# endif
  ;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

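// Even when the build detects TLS support (HAVE_TLS), the running kernel
// may be too old (pre-2.6 Linux) or buggy (one cygwin build hangs) to use
// it safely, so CheckIfKernelSupportsTLS() probes at runtime and records
// the answer in kernel_supports_tls.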
#if defined(HAVE_TLS)
bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // windows has supported TLS since winnt, I think.
  void CheckIfKernelSupportsTLS() {
    kernel_supports_tls = true;
  }
# elif !HAVE_DECL_UNAME   // if too old for uname, probably too old for TLS
  void CheckIfKernelSupportsTLS() {
    kernel_supports_tls = false;
  }
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
  void CheckIfKernelSupportsTLS() {
    struct utsname buf;
    if (uname(&buf) < 0) {   // should be impossible
      Log(kLog, __FILE__, __LINE__,
          "uname failed; assuming no TLS support (errno)", errno);
      kernel_supports_tls = false;
    } else if (strcasecmp(buf.sysname, "linux") == 0) {
      // The linux case: the first kernel to support TLS was 2.6.0
      if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
        kernel_supports_tls = false;
      else if (buf.release[0] == '2' && buf.release[1] == '.' &&
               buf.release[2] >= '0' && buf.release[2] < '6' &&
               buf.release[3] == '.')                       // 2.0 - 2.5
        kernel_supports_tls = false;
      else
        kernel_supports_tls = true;
    } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
      // In my testing, this version of cygwin, at least, would hang
      // when using TLS.
      kernel_supports_tls = false;
    } else {        // some other kernel; we'll be optimistic
      kernel_supports_tls = true;
    }
    // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
  }
# endif  // HAVE_DECL_UNAME
#endif   // HAVE_TLS

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_ = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    list->PushRange(fetch_count, SLL_Next(start), end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
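    // Illustrative example (the batch_size value here is hypothetical):
    // with batch_size == 32, max_length grows 1, 2, 3, ..., 32 on
    // successive fetches, then jumps by whole batches (64, 96, 128, ...)
    // until clamped to kMaxDynamicFreeListLength rounded down to a
    // multiple of 32.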
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}

void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we
    // don't shrink it, some amount of memory will always stay in this
    // freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
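  // For example, a low-water mark of 10 means the thread left at least
  // 10 cached objects untouched since the last scavenge, so the loop
  // below returns 5 of them (lowmark/2) to the central cache.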
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

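// One-time initialization of the static allocator state, guarded by
// phinited so that it is safe to call this more than once.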
void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = true;
  }
}

void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* heap = thread_heaps_; heap != NULL; heap = heap->next_) {
    if (heap->tid_ == zero) {
      heap->tid_ = pthread_self();
    }
  }
#endif
}

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific().
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
      if (h->tid_ == me) {
        heap = h;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
#endif
    heap->in_setspecific_ = false;
  }
  return heap;
}

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;              // No thread cache to remove
  if (heap->in_setspecific_) return;     // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}

void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}

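// Accumulates the size of every live thread cache into *total_bytes and,
// when class_count is non-NULL, each cache's per-class free list lengths
// into class_count[].  Walks thread_heaps_, so the caller is expected to
// hold pageheap_lock.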
void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc