// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"
#include <errno.h>
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for DEFINE_int64, EnvToInt64
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;
DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total number of bytes allocated to "
             "thread caches. This bound is not strict, so it is possible "
             "for the cache to go over this bound in certain circumstances. "
             "The maximum value of this flag is capped at 1 GB.");
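
// Illustrative usage note (a hedged sketch, not a guarantee of how this build
// is wired): because the default above comes from EnvToInt64, the bound can
// be raised without a code change by setting the environment variable before
// the program starts, e.g.
//
//   TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=67108864 ./your_binary   # 64 MB
//
// ("your_binary" is a placeholder.)  Whatever value arrives this way is still
// clipped to the [kMinThreadCacheSize, 1 GB] range by
// set_overall_thread_cache_size() below before it takes effect.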

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache* ThreadCache::threadlocal_heap_
// See comments in thread_cache.h about this. Bug here:
// http://code.google.com/p/chromium/issues/detail?id=124489
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
   __attribute__ ((tls_model ("initial-exec")))
#endif
   ;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

#if defined(HAVE_TLS)
bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // Windows has supported TLS since Windows NT.
    void CheckIfKernelSupportsTLS() {
      kernel_supports_tls = true;
    }
# elif !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
    void CheckIfKernelSupportsTLS() {
      kernel_supports_tls = false;
    }
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
    void CheckIfKernelSupportsTLS() {
      struct utsname buf;
      if (uname(&buf) < 0) {   // should be impossible
        Log(kLog, __FILE__, __LINE__,
            "uname failed; assuming no TLS support (errno)", errno);
        kernel_supports_tls = false;
      } else if (strcasecmp(buf.sysname, "linux") == 0) {
        // The Linux case: the first kernel to support TLS was 2.6.0
        if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
          kernel_supports_tls = false;
        else if (buf.release[0] == '2' && buf.release[1] == '.' &&
                 buf.release[2] >= '0' && buf.release[2] < '6' &&
                 buf.release[3] == '.')                       // 2.0 - 2.5
          kernel_supports_tls = false;
        else
          kernel_supports_tls = true;
      } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
        // In my testing, this version of cygwin, at least, would hang
        // when using TLS.
        kernel_supports_tls = false;
      } else {        // some other kernel, we'll be optimistic
        kernel_supports_tls = true;
      }
      // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
    }
# endif  // HAVE_DECL_UNAME
#endif    // HAVE_TLS

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;
  total_bytes_allocated_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_  = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    // Pop the top of the list and add the rest to the freelist.
    void *second = start;
    start = FL_Pop(&second);
    list->PushRange(fetch_count, second, end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}
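
// Worked example of the max_length growth above (illustrative only; the
// concrete numbers assume num_objects_to_move(cl) == 32 and
// kMaxDynamicFreeListLength == 8192, which may differ in your build):
// max_length grows 1, 2, 3, ..., 32 one element per fetch (slow start), then
// jumps by whole batches to 64, 96, 128, ..., and stops at the largest
// multiple of 32 that does not exceed kMaxDynamicFreeListLength.  If that cap
// were 8190, for instance, the final value would be 8160.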

void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}
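
// Illustrative trace of the batching above (not normative): with N == 100 and
// batch_size == 32, the while loop hands the central cache three prepackaged
// chains of 32 objects (N drops 100 -> 68 -> 36 -> 4) and the final
// PopRange/InsertRange pair returns the remaining 4, so the central free list
// mostly receives chains of exactly num_objects_to_move objects.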

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}
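
// Numeric sketch of the scavenging policy above (illustrative only): a free
// list whose low-water mark since the last scavenge is 8 kept at least 8
// objects idle the whole time, so we return 8/2 == 4 of them; a mark of 1
// releases exactly 1 object; a mark of 0 releases nothing, because every
// object in that list was handed out at some point.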

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}
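
// Example of the stealing policy above (a sketch; the real constants are
// defined in thread_cache.h, assumed here to make kStealAmount 64KB): once
// unclaimed_cache_space_ is exhausted, a thread that keeps missing examines
// up to 10 peers per call, skips itself and any peer whose max_size_ is
// already at or below kMinThreadCacheSize, and moves one kStealAmount of
// budget from the first eligible victim to its own max_size_.  Repeated calls
// therefore shift cache budget toward the threads that allocate the most.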

int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

// static
unsigned int ThreadCache::GetBytesAllocatedOnCurrentThread() {
  return ThreadCache::GetThreadHeap()->GetTotalBytesAllocated();
}

void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = true;
  }
}

void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific().
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
      if (h->tid_ == me) {
        heap = h;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
#endif
    heap->in_setspecific_ = false;
  }
  return heap;
}

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;             // No thread cache to remove
  if (heap->in_setspecific_) return;    // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}

void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}
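
// Worked example (illustrative): with overall_thread_cache_size_ == 16MB and
// 8 live thread heaps, each thread's fair share ("space") is 2MB.  If that is
// smaller than the previous per_thread_cache_size_ (ratio < 1.0), every
// existing max_size_ is scaled down by the ratio right away; if it is larger,
// max_size_ is left untouched and the surplus lands in
// unclaimed_cache_space_, to be claimed gradually through
// IncreaseCacheLimit().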

void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}
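
// Hedged usage sketch: callers outside this file normally adjust the bound
// through the malloc extension rather than calling this directly.  The
// property name below is the one gperftools conventionally exposes; verify it
// against your build before relying on it.
//
//   MallocExtension::instance()->SetNumericProperty(
//       "tcmalloc.max_total_thread_cache_bytes", 64 << 20);  // 64 MB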

}  // namespace tcmalloc