// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// instrumentation.h: contains the definitions needed to
// instrument code for profiling:
//   ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
//
// profiler.h is only needed to drive the profiler:
//   StartProfiling, FinishProfiling.
//
// See the usage example in profiler.h.

#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_

#include <pthread.h>
#include <cstdio>

#ifndef GEMMLOWP_USE_STLPORT
#include <cstdint>
#else
#include <stdint.h>
namespace std {
using ::uint8_t;
using ::uint16_t;
using ::uint32_t;
using ::int8_t;
using ::int16_t;
using ::int32_t;
using ::size_t;
using ::uintptr_t;
}
#endif

#include <algorithm>
#include <cassert>
#include <cstdlib>

#ifdef GEMMLOWP_PROFILING
#include <cstring>
#include <set>
#endif

// We should always use C++11 thread_local; unfortunately that
// isn't fully supported on Apple yet.
#ifdef __APPLE__
#define GEMMLOWP_THREAD_LOCAL static __thread
#define GEMMLOWP_USING_OLD_THREAD_LOCAL
#else
#define GEMMLOWP_THREAD_LOCAL thread_local
#endif

namespace gemmlowp {

// An assertion that stays active regardless of NDEBUG.
// If |condition| is false, prints "gemmlowp error: <msg>" to stderr and
// calls abort(). Does nothing when |condition| is true.
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (!condition) {
    fprintf(stderr, "gemmlowp error: %s\n", msg);
    abort();
  }
}

// To be used as template parameter for GlobalLock.
// GlobalLock<ProfilerLockId> is the profiler global lock:
// registering threads, starting profiling, finishing profiling, and
// the profiler itself as it samples threads, all need to lock it.
struct ProfilerLockId;

// A very plain global lock. Templated in LockId so we can have multiple
// locks, one for each LockId type. LockId is only used as a tag; it is
// never instantiated, so an incomplete type is fine.
template <typename LockId>
class GlobalLock {
  // Returns the single mutex backing this LockId. It is a function-local
  // static, statically initialized with PTHREAD_MUTEX_INITIALIZER.
  static pthread_mutex_t* Mutex() {
    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    return &m;
  }

 public:
  static void Lock() { pthread_mutex_lock(Mutex()); }
  static void Unlock() { pthread_mutex_unlock(Mutex()); }
};

// A very simple RAII helper to lock and unlock a GlobalLock:
// locks in the constructor, unlocks in the destructor.
template <typename LockId>
struct AutoGlobalLock {
  AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
  ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
};

// MemoryBarrier is purely a compile-time thing; it tells two things
// to the compiler:
//   1) It prevents reordering code across it
//     (thanks to the 'volatile' after 'asm')
//   2) It requires the compiler to assume that any value previously
//     read from memory, may have changed. Thus it offers an alternative
//     to using 'volatile' variables.
inline void MemoryBarrier() { asm volatile("" ::: "memory"); }

// Profiling definitions. Two paths: when profiling is enabled,
// and when profiling is disabled.
#ifdef GEMMLOWP_PROFILING
// This code path is when profiling is enabled.

// A pseudo-call-stack. Contrary to a real call-stack, this only
// contains pointers to literal strings that were manually entered
// in the instrumented code (see ScopedProfilingLabel).
struct ProfilingStack {
  // Maximum number of labels that can be live at once.
  static const std::size_t kMaxSize = 15;
  typedef const char* LabelsArrayType[kMaxSize];
  // labels[0 .. size-1] are the currently pushed labels, oldest first.
  LabelsArrayType labels;
  std::size_t size;

  // Starts out empty, with every byte of the object zeroed, so that
  // the memcmp-based operator== compares well-defined bytes.
  ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }

  // Pushes |label| on top of the stack; aborts if the stack is full.
  // The label is stored before |size| is incremented, with compiler
  // barriers in between, so the compiler cannot reorder the two writes.
  void Push(const char* label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
    labels[size] = label;
    MemoryBarrier();
    size++;
    MemoryBarrier();
  }

  // Drops the top label; aborts if the stack is empty.
  void Pop() {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
    size--;
    MemoryBarrier();
  }

  // Replaces the top label with |new_label|; aborts if the stack is empty.
  // This is checked in release builds too (like Push and Pop), because an
  // unchecked call on an empty stack would write to labels[size - 1] with
  // size - 1 wrapped around, i.e. far out of bounds.
  void UpdateTop(const char* new_label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0,
                          "ProfilingStack UpdateTop on empty stack");
    labels[size - 1] = new_label;
    MemoryBarrier();
  }

  // Byte-wise copy of |other| into this object.
  ProfilingStack& operator=(const ProfilingStack& other) {
    // memcpy requires non-overlapping ranges, so skip self-assignment.
    if (this != &other) {
      memcpy(this, &other, sizeof(ProfilingStack));
    }
    return *this;
  }

  // Byte-wise comparison of the whole object, including label slots
  // at and above |size|.
  bool operator==(const ProfilingStack& other) const {
    return !memcmp(this, &other, sizeof(ProfilingStack));
  }
};

static_assert(
    !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
    "ProfilingStack should have power-of-two size to fit in cache lines");

struct ThreadInfo;

// The global set of threads being profiled.
// In this header, every access happens while holding
// GlobalLock<ProfilerLockId>.
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> v;
  return v;
}

// Per-thread profiling state: the thread's ProfilingStack, plus a pthread
// key whose only purpose is to get ThreadExitCallback invoked at thread exit.
struct ThreadInfo {
  pthread_key_t key;  // used only to get a callback at thread exit.
  ProfilingStack stack;

  // Creates the key with ThreadExitCallback as its destructor and
  // associates |this| with it for the constructing thread.
  // Both calls are checked: if pthread_key_create fails, |key| is not a
  // valid key and must not be handed to pthread_setspecific, and without a
  // registered callback this object would never be removed from
  // ThreadsUnderProfiling() at thread exit.
  ThreadInfo() {
    const int create_status = pthread_key_create(&key, ThreadExitCallback);
    ReleaseBuildAssertion(create_status == 0, "pthread_key_create failed");
    const int set_status = pthread_setspecific(key, this);
    ReleaseBuildAssertion(set_status == 0, "pthread_setspecific failed");
  }

  // pthread key destructor; |ptr| is the ThreadInfo that was registered
  // with pthread_setspecific. Under the profiler lock, removes it from
  // the set of profiled threads and deletes its key.
  static void ThreadExitCallback(void* ptr) {
    AutoGlobalLock<ProfilerLockId> lock;
    ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
    ThreadsUnderProfiling().erase(self);
    pthread_key_delete(self->key);
  }
};

// Returns the calling thread's ThreadInfo, creating it on first use.
inline ThreadInfo& ThreadLocalThreadInfo() {
#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL
  // We're leaking this ThreadInfo structure, because Apple doesn't support
  // non-trivial constructors or destructors for their __thread type modifier.
  GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr;
  if (i == nullptr) {
    i = new ThreadInfo();
  }
  return *i;
#else
  GEMMLOWP_THREAD_LOCAL ThreadInfo i;
  return i;
#endif
}

// ScopedProfilingLabel is how one instruments code for profiling
// with this profiler. Construct local ScopedProfilingLabel variables,
// passing a literal string describing the local code. Profile
// samples will then be annotated with this label, while it is in scope
// (whence the name --- also known as RAII).
// See the example in profiler.h.
class ScopedProfilingLabel {
  // The constructing thread's stack, looked up once at construction.
  ProfilingStack* profiling_stack_;

 public:
  // Pushes |label| on the calling thread's profiling stack.
  // Only the pointer is stored, so |label| must outlive this object.
  explicit ScopedProfilingLabel(const char* label)
      : profiling_stack_(&ThreadLocalThreadInfo().stack) {
    profiling_stack_->Push(label);
  }

  // Pops the label pushed by the constructor.
  ~ScopedProfilingLabel() { profiling_stack_->Pop(); }

  // Replaces the label currently on top of the stack with |new_label|.
  void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
};

// To be called once on each thread to be profiled.
// Adds the calling thread's ThreadInfo to ThreadsUnderProfiling(),
// under the profiler lock.
inline void RegisterCurrentThreadForProfiling() {
  AutoGlobalLock<ProfilerLockId> lock;
  ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
}

#else  // not GEMMLOWP_PROFILING
// This code path is when profiling is disabled.

// This empty definition of ScopedProfilingLabel ensures that
// it has zero runtime overhead when profiling is disabled.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char*) {}
  void Update(const char*) {}
};

// No-op when profiling is disabled.
inline void RegisterCurrentThreadForProfiling() {}

#endif

}  // end namespace gemmlowp

#endif  // GEMMLOWP_PROFILING_INSTRUMENTATION_H_