// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"
#include <errno.h>
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for DEFINE_int64, EnvToInt64
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;
DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total number of bytes allocated to "
             "thread caches. This bound is not strict, so it is possible "
             "for the caches to go over it in certain circumstances. "
             "The maximum value of this flag is capped at 1 GB.");
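
// Usage sketch (not from this file): the cap is typically supplied through
// the environment before the program starts, e.g.
//
//   TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=67108864 ./my_server
//
// would budget roughly 64 MB across all thread caches; values above 1 GB
// are clipped by set_overall_thread_cache_size() below.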

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache* ThreadCache::threadlocal_heap_
# ifdef HAVE___ATTRIBUTE__
   __attribute__ ((tls_model ("initial-exec")))
# endif
   ;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

#if defined(HAVE_TLS)
bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // Windows has supported TLS since WinNT, I think.
    void CheckIfKernelSupportsTLS() {
      kernel_supports_tls = true;
    }
# elif !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
    void CheckIfKernelSupportsTLS() {
      kernel_supports_tls = false;
    }
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
    void CheckIfKernelSupportsTLS() {
      struct utsname buf;
      if (uname(&buf) < 0) {   // should be impossible
        Log(kLog, __FILE__, __LINE__,
            "uname failed; assuming no TLS support (errno)", errno);
        kernel_supports_tls = false;
      } else if (strcasecmp(buf.sysname, "linux") == 0) {
        // The Linux case: the first kernel to support TLS was 2.6.0.
        if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
          kernel_supports_tls = false;
        else if (buf.release[0] == '2' && buf.release[1] == '.' &&
                 buf.release[2] >= '0' && buf.release[2] < '6' &&
                 buf.release[3] == '.')                       // 2.0 - 2.5
          kernel_supports_tls = false;
        else
          kernel_supports_tls = true;
      } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
        // In my testing, this version of cygwin, at least, would hang
        // when using TLS.
        kernel_supports_tls = false;
      } else {        // some other kernel; we'll be optimistic
        kernel_supports_tls = true;
      }
      // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
    }
#  endif  // HAVE_DECL_UNAME
#endif    // HAVE_TLS
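
// Illustration of the release-string parsing above (sample strings are
// hypothetical, not exhaustive): "1.2.13" and "2.4.20" are rejected as
// pre-2.6 kernels without TLS, while "2.6.32" and "3.10.0" fall through
// to the optimistic branch and are accepted.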

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_  = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

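  // Seed the allocation sampler from the thread id: the low bytes of the
  // pthread_t are reinterpreted as an integer so that different threads
  // start with different sampling seeds.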
  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
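  // The first fetched object ("start") is handed straight back to the
  // caller, so only the remaining fetch_count - 1 objects are pushed onto
  // the free list below.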
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    list->PushRange(fetch_count, SLL_Next(start), end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}
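
// Worked illustration of the slow-start growth above, assuming a
// hypothetical batch_size of 32: max_length grows 1, 2, 3, ..., 32 on
// successive fetches, then jumps a whole batch at a time (64, 96, 128, ...)
// until it reaches kMaxDynamicFreeListLength rounded down to a multiple
// of 32.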

void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}
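
// Convergence sketch for the shrink path above, with hypothetical numbers:
// given batch_size 32 and max_length 96, each run of kMaxOverages + 1
// consecutive overages lowers max_length by one batch (96 -> 64 -> 32),
// after which max_length stabilizes at exactly one batch.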

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}
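
// Numeric sketch of the policy above: if a free list's length never fell
// below 10 since the last scavenge (lowmark == 10), 5 objects are returned
// to the central cache now and the rest stay cached for this thread.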

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}
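
// How the budget moves: each successful call shifts kStealAmount bytes of
// cache budget to this thread, first from the unclaimed pool and then,
// once that pool is exhausted, round-robin from other threads' max_size_
// (skipping this thread and caches already at or below kMinThreadCacheSize).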

int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = true;
  }
}

void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* tc = thread_heaps_; tc != NULL; tc = tc->next_) {
    if (tc->tid_ == zero) {
      tc->tid_ = pthread_self();
    }
  }
#endif
}

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibcs, and on FreeBSD's libc (as of FreeBSD 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific().
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* tc = thread_heaps_; tc != NULL; tc = tc->next_) {
      if (tc->tid_ == me) {
        heap = tc;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
#endif
    heap->in_setspecific_ = false;
  }
  return heap;
}

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;             // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;             // No thread cache to remove
  if (heap->in_setspecific_) return;    // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}

void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}
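
// Worked example with hypothetical numbers: if overall_thread_cache_size_
// is 16 MB and there are 4 thread heaps, space is 4 MB per thread (then
// clamped to [kMinThreadCacheSize, kMaxThreadCacheSize]).  When the
// per-thread budget shrinks (ratio < 1.0), every existing max_size_ is
// scaled down proportionally; growth instead flows into
// unclaimed_cache_space_, from which threads steal incrementally.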

void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc