summaryrefslogtreecommitdiffstats
path: root/base/threading/thread_local_storage_win.cc
blob: 0ae3cb4c8cd39471b9b532a5dc87e2b5d61647a8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/threading/thread_local_storage.h"

#include <windows.h>

#include "base/logging.h"


namespace {
// In order to make TLS destructors work, we need to keep function
// pointers to the destructor for each TLS that we allocate.
// We make this work by allocating a single OS-level TLS, which
// contains an array of slots for the application to use.  In
// parallel, we also allocate an array of destructors, which we
// keep track of and call when threads terminate.

// g_native_tls_key is the one native TLS that we use.  It stores our table.
long g_native_tls_key = TLS_OUT_OF_INDEXES;

// g_last_used_tls_key is the high-water-mark of allocated thread local storage.
// Each allocation is an index into our g_tls_destructors[].  Each such index is
// assigned to the instance variable slot_ in a ThreadLocalStorage::Slot
// instance.  We reserve the value slot_ == 0 to indicate that the corresponding
// instance of ThreadLocalStorage::Slot has been freed (i.e., destructor called,
// etc.).  This reserved use of 0 is then stated as the initial value of
// g_last_used_tls_key, so that the first issued index will be 1.
long g_last_used_tls_key = 0;

// The maximum number of 'slots' in our thread local storage stack.
const int kThreadLocalStorageSize = 64;

// The maximum number of times to try to clear slots by calling destructors.
// Use pthread naming convention for clarity.
const int kMaxDestructorIterations = kThreadLocalStorageSize;

// An array of destructor function pointers for the slots.  If a slot has a
// destructor, it will be stored in its corresponding entry in this array.
// The elements are volatile to ensure that when the compiler reads the value
// to potentially call the destructor, it does so once, and that value is tested
// for null-ness and then used. Yes, that would be a weird de-optimization,
// but I can imagine some register machines where it was just as easy to
// re-fetch an array element, and I want to be sure a call to free the key
// (i.e., null out the destructor entry) that happens on a separate thread can't
// hurt the racy calls to the destructors on another thread.
volatile base::ThreadLocalStorage::TLSDestructorFunc
    g_tls_destructors[kThreadLocalStorageSize];

void** ConstructTlsVector() {
  if (g_native_tls_key == TLS_OUT_OF_INDEXES) {
    long value = TlsAlloc();
    DCHECK(value != TLS_OUT_OF_INDEXES);

    // Atomically test-and-set the tls_key.  If the key is TLS_OUT_OF_INDEXES,
    // go ahead and set it.  Otherwise, do nothing, as another
    // thread already did our dirty work.
    if (TLS_OUT_OF_INDEXES != InterlockedCompareExchange(
            &g_native_tls_key, value, TLS_OUT_OF_INDEXES)) {
      // We've been shortcut. Another thread replaced g_native_tls_key first so
      // we need to destroy our index and use the one the other thread got
      // first.
      TlsFree(value);
    }
  }
  DCHECK(!TlsGetValue(g_native_tls_key));

  // Some allocators, such as TCMalloc, make use of thread local storage.
  // As a result, any attempt to call new (or malloc) will lazily cause such a
  // system to initialize, which will include registering for a TLS key.  If we
  // are not careful here, then that request to create a key will call new back,
  // and we'll have an infinite loop.  We avoid that as follows:
  // Use a stack allocated vector, so that we don't have dependence on our
  // allocator until our service is in place.  (i.e., don't even call new until
  // after we're setup)
  void* stack_allocated_tls_data[kThreadLocalStorageSize];
  memset(stack_allocated_tls_data, 0, sizeof(stack_allocated_tls_data));
  // Ensure that any rentrant calls change the temp version.
  TlsSetValue(g_native_tls_key, stack_allocated_tls_data);

  // Allocate an array to store our data.
  void** tls_data = new void*[kThreadLocalStorageSize];
  memcpy(tls_data, stack_allocated_tls_data, sizeof(stack_allocated_tls_data));
  TlsSetValue(g_native_tls_key, tls_data);
  return tls_data;
}

// Called when we terminate a thread, this function calls any TLS destructors
// that are pending for this thread.
void WinThreadExit() {
  if (g_native_tls_key == TLS_OUT_OF_INDEXES)
    return;

  void** tls_data = static_cast<void**>(TlsGetValue(g_native_tls_key));
  // Maybe we have never initialized TLS for this thread.
  if (!tls_data)
    return;

  // Some allocators, such as TCMalloc, use TLS.  As a result, when a thread
  // terminates, one of the destructor calls we make may be to shut down an
  // allocator.  We have to be careful that after we've shutdown all of the
  // known destructors (perchance including an allocator), that we don't call
  // the allocator and cause it to resurrect itself (with no possibly destructor
  // call to follow).  We handle this problem as follows:
  // Switch to using a stack allocated vector, so that we don't have dependence
  // on our allocator after we have called all g_tls_destructors.  (i.e., don't
  // even call delete[] after we're done with destructors.)
  void* stack_allocated_tls_data[kThreadLocalStorageSize];
  memcpy(stack_allocated_tls_data, tls_data, sizeof(stack_allocated_tls_data));
  // Ensure that any re-entrant calls change the temp version.
  TlsSetValue(g_native_tls_key, stack_allocated_tls_data);
  delete[] tls_data;  // Our last dependence on an allocator.

  int remaining_attempts = kMaxDestructorIterations;
  bool need_to_scan_destructors = true;
  while (need_to_scan_destructors) {
    need_to_scan_destructors = false;
    // Try to destroy the first-created-slot (which is slot 1) in our last
    // destructor call.  That user was able to function, and define a slot with
    // no other services running, so perhaps it is a basic service (like an
    // allocator) and should also be destroyed last.  If we get the order wrong,
    // then we'll itterate several more times, so it is really not that
    // critical (but it might help).
    for (int slot = g_last_used_tls_key; slot > 0; --slot) {
      void* value = stack_allocated_tls_data[slot];
      if (value == NULL)
        continue;
      base::ThreadLocalStorage::TLSDestructorFunc destructor =
          g_tls_destructors[slot];
      if (destructor == NULL)
        continue;
      stack_allocated_tls_data[slot] = NULL;  // pre-clear the slot.
      destructor(value);
      // Any destructor might have called a different service, which then set
      // a different slot to a non-NULL value.  Hence we need to check
      // the whole vector again.  This is a pthread standard.
      need_to_scan_destructors = true;
    }
    if (--remaining_attempts <= 0) {
      NOTREACHED();  // Destructors might not have been called.
      break;
    }
  }

  // Remove our stack allocated vector.
  TlsSetValue(g_native_tls_key, NULL);
}

}  // namespace

namespace base {

ThreadLocalStorage::Slot::Slot(TLSDestructorFunc destructor) {
  initialized_ = false;
  slot_ = 0;
  Initialize(destructor);
}

bool ThreadLocalStorage::StaticSlot::Initialize(TLSDestructorFunc destructor) {
  if (g_native_tls_key == TLS_OUT_OF_INDEXES || !TlsGetValue(g_native_tls_key))
    ConstructTlsVector();

  // Grab a new slot.
  slot_ = InterlockedIncrement(&g_last_used_tls_key);
  DCHECK_GT(slot_, 0);
  if (slot_ >= kThreadLocalStorageSize) {
    NOTREACHED();
    return false;
  }

  // Setup our destructor.
  g_tls_destructors[slot_] = destructor;
  initialized_ = true;
  return true;
}

void ThreadLocalStorage::StaticSlot::Free() {
  // At this time, we don't reclaim old indices for TLS slots.
  // So all we need to do is wipe the destructor.
  DCHECK_GT(slot_, 0);
  DCHECK_LT(slot_, kThreadLocalStorageSize);
  g_tls_destructors[slot_] = NULL;
  slot_ = 0;
  initialized_ = false;
}

void* ThreadLocalStorage::StaticSlot::Get() const {
  void** tls_data = static_cast<void**>(TlsGetValue(g_native_tls_key));
  if (!tls_data)
    tls_data = ConstructTlsVector();
  DCHECK_GT(slot_, 0);
  DCHECK_LT(slot_, kThreadLocalStorageSize);
  return tls_data[slot_];
}

void ThreadLocalStorage::StaticSlot::Set(void* value) {
  void** tls_data = static_cast<void**>(TlsGetValue(g_native_tls_key));
  if (!tls_data)
    tls_data = ConstructTlsVector();
  DCHECK_GT(slot_, 0);
  DCHECK_LT(slot_, kThreadLocalStorageSize);
  tls_data[slot_] = value;
}

}  // namespace base

// Thread Termination Callbacks.
// Windows doesn't support a per-thread destructor with its
// TLS primitives.  So, we build it manually by inserting a
// function to be called on each thread's exit.
// This magic is from http://www.codeproject.com/threads/tls.asp
// and it works for VC++ 7.0 and later.

// Force a reference to _tls_used to make the linker create the TLS directory
// if it's not already there.  (e.g. if __declspec(thread) is not used).
// Force a reference to p_thread_callback_base to prevent whole program
// optimization from discarding the variable.
#ifdef _WIN64

#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:p_thread_callback_base")

#else  // _WIN64

#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_p_thread_callback_base")

#endif  // _WIN64

// Static callback function to call with each thread termination.
void NTAPI OnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
  // On XP SP0 & SP1, the DLL_PROCESS_ATTACH is never seen. It is sent on SP2+
  // and on W2K and W2K3. So don't assume it is sent.
  if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason)
    WinThreadExit();
}

// .CRT$XLA to .CRT$XLZ is an array of PIMAGE_TLS_CALLBACK pointers that are
// called automatically by the OS loader code (not the CRT) when the module is
// loaded and on thread creation. They are NOT called if the module has been
// loaded by a LoadLibrary() call. It must have implicitly been loaded at
// process startup.
// By implicitly loaded, I mean that it is directly referenced by the main EXE
// or by one of its dependent DLLs. Delay-loaded DLL doesn't count as being
// implicitly loaded.
//
// See VC\crt\src\tlssup.c for reference.

// extern "C" suppresses C++ name mangling so we know the symbol name for the
// linker /INCLUDE:symbol pragma above.
extern "C" {
// The linker must not discard p_thread_callback_base.  (We force a reference
// to this variable with a linker /INCLUDE:symbol pragma to ensure that.) If
// this variable is discarded, the OnThreadExit function will never be called.
#ifdef _WIN64

// .CRT section is merged with .rdata on x64 so it must be constant data.
#pragma const_seg(".CRT$XLB")
// When defining a const variable, it must have external linkage to be sure the
// linker doesn't discard it.
extern const PIMAGE_TLS_CALLBACK p_thread_callback_base;
const PIMAGE_TLS_CALLBACK p_thread_callback_base = OnThreadExit;

// Reset the default section.
#pragma const_seg()

#else  // _WIN64

#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK p_thread_callback_base = OnThreadExit;

// Reset the default section.
#pragma data_seg()

#endif  // _WIN64
}  // extern "C"