1// Copyright 2015 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// instrumentation.h: contains the definitions needed to
16// instrument code for profiling:
17//   ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
18//
19// profiler.h is only needed to drive the profiler:
20//   StartProfiling, FinishProfiling.
21//
22// See the usage example in profiler.h.
23
24#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
25#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_
26
27#include <pthread.h>
28#include <cstdio>
29
30#ifndef GEMMLOWP_USE_STLPORT
31#include <cstdint>
32#else
33#include <stdint.h>
34namespace std {
35using ::uint8_t;
36using ::uint16_t;
37using ::uint32_t;
38using ::int8_t;
39using ::int16_t;
40using ::int32_t;
41using ::size_t;
42using ::uintptr_t;
43}
44#endif
45
46#include <algorithm>
47#include <cassert>
48#include <cstdlib>
49
50#ifdef GEMMLOWP_PROFILING
51#include <cstring>
52#include <set>
53#endif
54
// Storage specifier for thread-local variables in this header. Standard
// C++11 thread_local is what we want everywhere; Apple toolchains do not
// fully support it yet, so there we fall back to 'static __thread' and define
// GEMMLOWP_USING_OLD_THREAD_LOCAL so that code can adapt to that fallback.
#if defined(__APPLE__)
#define GEMMLOWP_THREAD_LOCAL static __thread
#define GEMMLOWP_USING_OLD_THREAD_LOCAL
#else
#define GEMMLOWP_THREAD_LOCAL thread_local
#endif
63
64namespace gemmlowp {
65
// Assertion that stays active in every build mode. When `condition` is true
// this does nothing; otherwise it prints `msg` to stderr, prefixed with
// "gemmlowp error: ", and terminates the process with abort().
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (condition) {
    return;
  }
  fprintf(stderr, "gemmlowp error: %s\n", msg);
  abort();
}
72
// Tag type that names the profiler's global lock, GlobalLock<ProfilerLockId>.
// That lock has to be held when registering threads, when starting or
// finishing profiling, and by the profiler itself while it samples threads.
// In this header the type is only used as a template argument, so it is
// declared without ever being defined.
struct ProfilerLockId;
78
// A minimal process-wide lock built on a pthread mutex. The LockId template
// parameter is never used as a value; it only selects which lock is meant, so
// every distinct LockId type gets its own independent mutex.
template <typename LockId>
class GlobalLock {
 public:
  // Acquires the lock for this LockId.
  static void Lock() { pthread_mutex_lock(&Instance()); }

  // Releases the lock for this LockId.
  static void Unlock() { pthread_mutex_unlock(&Instance()); }

 private:
  // The mutex lives in a function-local static, giving one mutex per
  // GlobalLock<LockId> instantiation, initialized with
  // PTHREAD_MUTEX_INITIALIZER.
  static pthread_mutex_t& Instance() {
    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    return mutex;
  }
};
92
93// A very simple RAII helper to lock and unlock a GlobalLock
94template <typename LockId>
95struct AutoGlobalLock {
96  AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
97  ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
98};
99
// MemoryBarrier acts on the compiler only. It is an inline asm statement
// whose instruction template is empty, so it contributes no instructions of
// its own. Its effect is to constrain what the compiler may do:
//   1) The statement is 'volatile' and declares a "memory" clobber, so the
//     compiler does not reorder memory accesses across it.
//   2) Because of the "memory" clobber, the compiler has to assume that any
//     value it read from memory earlier may have changed, and read it again.
//     This is an alternative to declaring the variables 'volatile'.
inline void MemoryBarrier() {
  asm volatile("" : : : "memory");
}
108
109// Profiling definitions. Two paths: when profiling is enabled,
110// and when profiling is disabled.
111#ifdef GEMMLOWP_PROFILING
112// This code path is when profiling is enabled.
113
114// A pseudo-call-stack. Contrary to a real call-stack, this only
115// contains pointers to literal strings that were manually entered
116// in the instrumented code (see ScopedProfilingLabel).
117struct ProfilingStack {
118  static const std::size_t kMaxSize = 15;
119  typedef const char* LabelsArrayType[kMaxSize];
120  LabelsArrayType labels;
121  std::size_t size;
122
123  ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }
124
125  void Push(const char* label) {
126    MemoryBarrier();
127    ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
128    labels[size] = label;
129    MemoryBarrier();
130    size++;
131    MemoryBarrier();
132  }
133
134  void Pop() {
135    MemoryBarrier();
136    ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
137    size--;
138    MemoryBarrier();
139  }
140
141  void UpdateTop(const char* new_label) {
142    MemoryBarrier();
143    assert(size);
144    labels[size - 1] = new_label;
145    MemoryBarrier();
146  }
147
148  ProfilingStack& operator=(const ProfilingStack& other) {
149    memcpy(this, &other, sizeof(ProfilingStack));
150    return *this;
151  }
152
153  bool operator==(const ProfilingStack& other) const {
154    return !memcmp(this, &other, sizeof(ProfilingStack));
155  }
156};
157
158static_assert(
159    !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
160    "ProfilingStack should have power-of-two size to fit in cache lines");
161
struct ThreadInfo;

// Returns the global registry of threads being profiled, as a set of
// pointers to their ThreadInfo. The set is a function-local static: it is
// created on the first call and every call returns that same object.
// This function does no locking itself; the code in this header that touches
// the set does so while holding GlobalLock<ProfilerLockId>.
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> registry;
  return registry;
}
169
170struct ThreadInfo {
171  pthread_key_t key;  // used only to get a callback at thread exit.
172  ProfilingStack stack;
173
174  ThreadInfo() {
175    pthread_key_create(&key, ThreadExitCallback);
176    pthread_setspecific(key, this);
177  }
178
179  static void ThreadExitCallback(void* ptr) {
180    AutoGlobalLock<ProfilerLockId> lock;
181    ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
182    ThreadsUnderProfiling().erase(self);
183    pthread_key_delete(self->key);
184  }
185};
186
187inline ThreadInfo& ThreadLocalThreadInfo() {
188#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL
189  // We're leaking this ThreadInfo structure, because Apple doesn't support
190  // non-trivial constructors or destructors for their __thread type modifier.
191  GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr;
192  if (i == nullptr) {
193    i = new ThreadInfo();
194  }
195  return *i;
196#else
197  GEMMLOWP_THREAD_LOCAL ThreadInfo i;
198  return i;
199#endif
200}
201
202// ScopedProfilingLabel is how one instruments code for profiling
203// with this profiler. Construct local ScopedProfilingLabel variables,
204// passing a literal string describing the local code. Profile
205// samples will then be annotated with this label, while it is in scope
206// (whence the name --- also known as RAII).
207// See the example in profiler.h.
208class ScopedProfilingLabel {
209  ProfilingStack* profiling_stack_;
210
211 public:
212  explicit ScopedProfilingLabel(const char* label)
213      : profiling_stack_(&ThreadLocalThreadInfo().stack) {
214    profiling_stack_->Push(label);
215  }
216
217  ~ScopedProfilingLabel() { profiling_stack_->Pop(); }
218
219  void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
220};
221
222// To be called once on each thread to be profiled.
223inline void RegisterCurrentThreadForProfiling() {
224  AutoGlobalLock<ProfilerLockId> lock;
225  ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
226}
227
228#else  // not GEMMLOWP_PROFILING
229// This code path is when profiling is disabled.
230
// Stand-in for ScopedProfilingLabel when profiling is disabled. It accepts
// the same constructor argument and the same Update() call, but ignores the
// labels and has empty bodies, so that instrumented code compiles unchanged
// and has zero runtime overhead.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char* /*label*/) {}
  void Update(const char* /*new_label*/) {}
};
237
// With profiling disabled there is nothing to register, so this does nothing.
// It is still defined so that callers need no #ifdef of their own.
inline void RegisterCurrentThreadForProfiling() {
  // Intentionally empty.
}
239
240#endif
241
242}  // end namespace gemmlowp
243
244#endif  // GEMMLOWP_PROFILING_INSTRUMENTATION_H_
245