1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/threading/thread_local_storage.h"
6
7#include <windows.h>
8
9#include "base/logging.h"
10
11
12namespace {
13// In order to make TLS destructors work, we need to keep function
14// pointers to the destructor for each TLS that we allocate.
15// We make this work by allocating a single OS-level TLS, which
16// contains an array of slots for the application to use.  In
17// parallel, we also allocate an array of destructors, which we
18// keep track of and call when threads terminate.
19
20// g_native_tls_key is the one native TLS that we use.  It stores our table.
21long g_native_tls_key = TLS_OUT_OF_INDEXES;
22
23// g_last_used_tls_key is the high-water-mark of allocated thread local storage.
24// Each allocation is an index into our g_tls_destructors[].  Each such index is
25// assigned to the instance variable slot_ in a ThreadLocalStorage::Slot
26// instance.  We reserve the value slot_ == 0 to indicate that the corresponding
27// instance of ThreadLocalStorage::Slot has been freed (i.e., destructor called,
28// etc.).  This reserved use of 0 is then stated as the initial value of
29// g_last_used_tls_key, so that the first issued index will be 1.
30long g_last_used_tls_key = 0;
31
32// The maximum number of 'slots' in our thread local storage stack.
33const int kThreadLocalStorageSize = 64;
34
35// The maximum number of times to try to clear slots by calling destructors.
36// Use pthread naming convention for clarity.
37const int kMaxDestructorIterations = kThreadLocalStorageSize;
38
39// An array of destructor function pointers for the slots.  If a slot has a
40// destructor, it will be stored in its corresponding entry in this array.
41// The elements are volatile to ensure that when the compiler reads the value
42// to potentially call the destructor, it does so once, and that value is tested
43// for null-ness and then used. Yes, that would be a weird de-optimization,
44// but I can imagine some register machines where it was just as easy to
45// re-fetch an array element, and I want to be sure a call to free the key
46// (i.e., null out the destructor entry) that happens on a separate thread can't
47// hurt the racy calls to the destructors on another thread.
48volatile base::ThreadLocalStorage::TLSDestructorFunc
49    g_tls_destructors[kThreadLocalStorageSize];
50
51void** ConstructTlsVector() {
52  if (g_native_tls_key == TLS_OUT_OF_INDEXES) {
53    long value = TlsAlloc();
54    DCHECK(value != TLS_OUT_OF_INDEXES);
55
56    // Atomically test-and-set the tls_key.  If the key is TLS_OUT_OF_INDEXES,
57    // go ahead and set it.  Otherwise, do nothing, as another
58    // thread already did our dirty work.
59    if (TLS_OUT_OF_INDEXES != InterlockedCompareExchange(
60            &g_native_tls_key, value, TLS_OUT_OF_INDEXES)) {
61      // We've been shortcut. Another thread replaced g_native_tls_key first so
62      // we need to destroy our index and use the one the other thread got
63      // first.
64      TlsFree(value);
65    }
66  }
67  DCHECK(!TlsGetValue(g_native_tls_key));
68
69  // Some allocators, such as TCMalloc, make use of thread local storage.
70  // As a result, any attempt to call new (or malloc) will lazily cause such a
71  // system to initialize, which will include registering for a TLS key.  If we
72  // are not careful here, then that request to create a key will call new back,
73  // and we'll have an infinite loop.  We avoid that as follows:
74  // Use a stack allocated vector, so that we don't have dependence on our
75  // allocator until our service is in place.  (i.e., don't even call new until
76  // after we're setup)
77  void* stack_allocated_tls_data[kThreadLocalStorageSize];
78  memset(stack_allocated_tls_data, 0, sizeof(stack_allocated_tls_data));
79  // Ensure that any rentrant calls change the temp version.
80  TlsSetValue(g_native_tls_key, stack_allocated_tls_data);
81
82  // Allocate an array to store our data.
83  void** tls_data = new void*[kThreadLocalStorageSize];
84  memcpy(tls_data, stack_allocated_tls_data, sizeof(stack_allocated_tls_data));
85  TlsSetValue(g_native_tls_key, tls_data);
86  return tls_data;
87}
88
89// Called when we terminate a thread, this function calls any TLS destructors
90// that are pending for this thread.
91void WinThreadExit() {
92  if (g_native_tls_key == TLS_OUT_OF_INDEXES)
93    return;
94
95  void** tls_data = static_cast<void**>(TlsGetValue(g_native_tls_key));
96  // Maybe we have never initialized TLS for this thread.
97  if (!tls_data)
98    return;
99
100  // Some allocators, such as TCMalloc, use TLS.  As a result, when a thread
101  // terminates, one of the destructor calls we make may be to shut down an
102  // allocator.  We have to be careful that after we've shutdown all of the
103  // known destructors (perchance including an allocator), that we don't call
104  // the allocator and cause it to resurrect itself (with no possibly destructor
105  // call to follow).  We handle this problem as follows:
106  // Switch to using a stack allocated vector, so that we don't have dependence
107  // on our allocator after we have called all g_tls_destructors.  (i.e., don't
108  // even call delete[] after we're done with destructors.)
109  void* stack_allocated_tls_data[kThreadLocalStorageSize];
110  memcpy(stack_allocated_tls_data, tls_data, sizeof(stack_allocated_tls_data));
111  // Ensure that any re-entrant calls change the temp version.
112  TlsSetValue(g_native_tls_key, stack_allocated_tls_data);
113  delete[] tls_data;  // Our last dependence on an allocator.
114
115  int remaining_attempts = kMaxDestructorIterations;
116  bool need_to_scan_destructors = true;
117  while (need_to_scan_destructors) {
118    need_to_scan_destructors = false;
119    // Try to destroy the first-created-slot (which is slot 1) in our last
120    // destructor call.  That user was able to function, and define a slot with
121    // no other services running, so perhaps it is a basic service (like an
122    // allocator) and should also be destroyed last.  If we get the order wrong,
123    // then we'll itterate several more times, so it is really not that
124    // critical (but it might help).
125    for (int slot = g_last_used_tls_key; slot > 0; --slot) {
126      void* value = stack_allocated_tls_data[slot];
127      if (value == NULL)
128        continue;
129      base::ThreadLocalStorage::TLSDestructorFunc destructor =
130          g_tls_destructors[slot];
131      if (destructor == NULL)
132        continue;
133      stack_allocated_tls_data[slot] = NULL;  // pre-clear the slot.
134      destructor(value);
135      // Any destructor might have called a different service, which then set
136      // a different slot to a non-NULL value.  Hence we need to check
137      // the whole vector again.  This is a pthread standard.
138      need_to_scan_destructors = true;
139    }
140    if (--remaining_attempts <= 0) {
141      NOTREACHED();  // Destructors might not have been called.
142      break;
143    }
144  }
145
146  // Remove our stack allocated vector.
147  TlsSetValue(g_native_tls_key, NULL);
148}
149
150}  // namespace
151
152namespace base {
153
154ThreadLocalStorage::Slot::Slot(TLSDestructorFunc destructor) {
155  initialized_ = false;
156  slot_ = 0;
157  Initialize(destructor);
158}
159
160bool ThreadLocalStorage::StaticSlot::Initialize(TLSDestructorFunc destructor) {
161  if (g_native_tls_key == TLS_OUT_OF_INDEXES || !TlsGetValue(g_native_tls_key))
162    ConstructTlsVector();
163
164  // Grab a new slot.
165  slot_ = InterlockedIncrement(&g_last_used_tls_key);
166  DCHECK_GT(slot_, 0);
167  if (slot_ >= kThreadLocalStorageSize) {
168    NOTREACHED();
169    return false;
170  }
171
172  // Setup our destructor.
173  g_tls_destructors[slot_] = destructor;
174  initialized_ = true;
175  return true;
176}
177
178void ThreadLocalStorage::StaticSlot::Free() {
179  // At this time, we don't reclaim old indices for TLS slots.
180  // So all we need to do is wipe the destructor.
181  DCHECK_GT(slot_, 0);
182  DCHECK_LT(slot_, kThreadLocalStorageSize);
183  g_tls_destructors[slot_] = NULL;
184  slot_ = 0;
185  initialized_ = false;
186}
187
188void* ThreadLocalStorage::StaticSlot::Get() const {
189  void** tls_data = static_cast<void**>(TlsGetValue(g_native_tls_key));
190  if (!tls_data)
191    tls_data = ConstructTlsVector();
192  DCHECK_GT(slot_, 0);
193  DCHECK_LT(slot_, kThreadLocalStorageSize);
194  return tls_data[slot_];
195}
196
197void ThreadLocalStorage::StaticSlot::Set(void* value) {
198  void** tls_data = static_cast<void**>(TlsGetValue(g_native_tls_key));
199  if (!tls_data)
200    tls_data = ConstructTlsVector();
201  DCHECK_GT(slot_, 0);
202  DCHECK_LT(slot_, kThreadLocalStorageSize);
203  tls_data[slot_] = value;
204}
205
206}  // namespace base
207
208// Thread Termination Callbacks.
209// Windows doesn't support a per-thread destructor with its
210// TLS primitives.  So, we build it manually by inserting a
211// function to be called on each thread's exit.
212// This magic is from http://www.codeproject.com/threads/tls.asp
213// and it works for VC++ 7.0 and later.
214
// Two symbols are force-referenced through the linker:
//  * _tls_used, so that the linker creates the TLS directory even when
//    nothing in the module uses __declspec(thread).
//  * p_thread_callback_base, so that whole program optimization cannot
//    discard the variable.
// The two branches name the same symbols; the 32-bit spellings carry one
// extra leading underscore.
#if defined(_WIN64)

#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:p_thread_callback_base")

#else  // defined(_WIN64)

#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_p_thread_callback_base")

#endif  // defined(_WIN64)
230
231// Static callback function to call with each thread termination.
232void NTAPI OnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
233  // On XP SP0 & SP1, the DLL_PROCESS_ATTACH is never seen. It is sent on SP2+
234  // and on W2K and W2K3. So don't assume it is sent.
235  if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason)
236    WinThreadExit();
237}
238
239// .CRT$XLA to .CRT$XLZ is an array of PIMAGE_TLS_CALLBACK pointers that are
240// called automatically by the OS loader code (not the CRT) when the module is
241// loaded and on thread creation. They are NOT called if the module has been
242// loaded by a LoadLibrary() call. It must have implicitly been loaded at
243// process startup.
// "Implicitly loaded" means that the module is directly referenced by the
// main EXE or by one of its dependent DLLs. A delay-loaded DLL does not count
// as implicitly loaded.
247//
248// See VC\crt\src\tlssup.c for reference.
249
250// extern "C" suppresses C++ name mangling so we know the symbol name for the
251// linker /INCLUDE:symbol pragma above.
252extern "C" {
253// The linker must not discard p_thread_callback_base.  (We force a reference
254// to this variable with a linker /INCLUDE:symbol pragma to ensure that.) If
255// this variable is discarded, the OnThreadExit function will never be called.
256#ifdef _WIN64
257
258// .CRT section is merged with .rdata on x64 so it must be constant data.
259#pragma const_seg(".CRT$XLB")
260// When defining a const variable, it must have external linkage to be sure the
261// linker doesn't discard it.
262extern const PIMAGE_TLS_CALLBACK p_thread_callback_base;
263const PIMAGE_TLS_CALLBACK p_thread_callback_base = OnThreadExit;
264
265// Reset the default section.
266#pragma const_seg()
267
268#else  // _WIN64
269
270#pragma data_seg(".CRT$XLB")
271PIMAGE_TLS_CALLBACK p_thread_callback_base = OnThreadExit;
272
273// Reset the default section.
274#pragma data_seg()
275
276#endif  // _WIN64
277}  // extern "C"
278