1// Copyright 2015 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// profiler.h: a simple sampling profiler that's always just one #include away!
16//
17// Overview
18// ========
19//
20// This profiler only samples a pseudo-stack, not the actual call stack.
21// The code to be profiled needs to be instrumented with
22// pseudo-stack "labels", see ScopedProfilingLabel.
23// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
24// portable, and independent of compilation details such as function inlining
25// and frame pointers. The granularity of instrumentation can be freely chosen,
26// and it is possible to get some annotate-like detail, i.e. detail within one
27// function without splitting it into multiple functions.
28//
29// This profiler should remain small and simple; its key feature is to fit in
30// a single header file so that there should never be a reason to refrain
31// from profiling. More complex and feature-rich alternatives are
32// readily available. This one offers a strict superset of its
33// functionality: https://github.com/bgirard/GeckoProfiler, including
34// intertwining pseudostacks with real call stacks, more annotation options,
35// and advanced visualization.
36//
37// Usage
38// =====
39//
40// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
41//    not enabled, profiling instrumentation from instrumentation.h
42//    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
43//    is still defined but does nothing. On the other hand,
44//    when profiling is not enabled, it is an error to #include the
45//    present file.
46//
47// 1. Each thread can opt in to profiling by calling
48//    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
49//    This can be done at any time, before or during profiling.
50//    No sample will be collected from a thread until
51//    it has called RegisterCurrentThreadForProfiling().
52//
53// 2. Instrument your code to be profiled with ScopedProfilingLabel,
54//    which is a RAII helper defined in instrumentation.h. The identifier
55//    names (some_label, etc) do not matter; what will show up
56//    in the profile is the string passed to the constructor, which
57//    must be a literal string. See the full example below.
58//
59//    Note: the overhead of ScopedProfilingLabel is zero when not
60//    enabling profiling (when not defining GEMMLOWP_PROFILING).
61//
62// 3. Use the profiler.h interface to control profiling. There are two
63//    functions: StartProfiling() and FinishProfiling(). They must be
64//    called on the same thread. FinishProfiling() prints the profile
65//    on stdout.
66//
67// Full example
68// ============
69/*
70    #define GEMMLOWP_PROFILING
71    #include "profiling/instrumentation.h"
72    using namespace gemmlowp;
73
74    const int iters = 100000000;
75    volatile int i;
76
77    void Bar() {
78      ScopedProfilingLabel label("Bar");
79      for (i = 0; i < iters; i++) {}
80    }
81
82    void Foo() {
83      ScopedProfilingLabel label("Foo");
84      for (i = 0; i < iters; i++) {}
85      Bar();
86    }
87
88    void Init() {
89      RegisterCurrentThreadForProfiling();
90    }
91
92    #include "profiling/profiler.h"
93
94    int main() {
95      Init();
96      StartProfiling();
97      Foo();
98      FinishProfiling();
99    }
100*
101* Output:
102*
103    gemmlowp profile (1 threads, 304 samples)
104    100.00% Foo
105        51.32% other
106        48.68% Bar
107    0.00% other (outside of any label)
108*/
109//
110// Interpreting results
111// ====================
112//
113//  Each node shows the absolute percentage, among all the samples,
114//  of the number of samples that recorded the given pseudo-stack.
115//  The percentages are *NOT* relative to the parent node. In addition
116//  to your own labels, you will also see 'other' nodes that collect
117//  the remainder of samples under the parent node that didn't fall into
118//  any of the labelled child nodes. Example:
119//
120//  20% Foo
121//      12% Bar
122//      6% Xyz
123//      2% other
124//
125//  This means that 20% of all labels were under Foo, of which 12%/20%==60%
126//  were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
127//  under either Bar or Xyz.
128//
129//  Typically, one wants to keep adding ScopedProfilingLabel's until
130//  the 'other' nodes show low percentages.
131//
132// Interpreting results with multiple threads
133// ==========================================
134//
135// At each sample, each thread registered for profiling gets sampled once.
136// So if there is one "main thread" spending its time in MainFunc() and
137// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
138// samples will be in WorkerFunc, so the profile will look like this:
139//
140// 80% WorkerFunc
141// 20% MainFunc
142
143#ifndef GEMMLOWP_PROFILING_PROFILER_H_
144#define GEMMLOWP_PROFILING_PROFILER_H_
145
146#ifndef GEMMLOWP_PROFILING
147#error Profiling is not enabled!
148#endif
149
150#include <vector>
151
152#include "instrumentation.h"
153
154namespace gemmlowp {
155
156// A tree view of a profile.
157class ProfileTreeView {
158  struct Node {
159    std::vector<Node*> children;
160    const char* label;
161    std::size_t weight;
162    Node() : label(nullptr), weight(0) {}
163    ~Node() {
164      for (auto child : children) {
165        delete child;
166      }
167    }
168  };
169
170  static bool CompareNodes(Node* n1, Node* n2) {
171    return n1->weight > n2->weight;
172  }
173
174  Node root_;
175
176  void PrintNode(const Node* node, int level) const {
177    if (level) {
178      for (int i = 1; i < level; i++) {
179        printf("    ");
180      }
181      printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
182    }
183    for (auto child : node->children) {
184      PrintNode(child, level + 1);
185    }
186  }
187
188  static void AddStackToNode(const ProfilingStack& stack, Node* node,
189                             std::size_t level) {
190    node->weight++;
191    if (stack.size == level) {
192      return;
193    }
194    Node* child_to_add_to = nullptr;
195    for (auto child : node->children) {
196      if (child->label == stack.labels[level]) {
197        child_to_add_to = child;
198        break;
199      }
200    }
201    if (!child_to_add_to) {
202      child_to_add_to = new Node;
203      child_to_add_to->label = stack.labels[level];
204      node->children.push_back(child_to_add_to);
205    }
206    AddStackToNode(stack, child_to_add_to, level + 1);
207    return;
208  }
209
210  void AddStack(const ProfilingStack& stack) {
211    AddStackToNode(stack, &root_, 0);
212  }
213
214  void AddOtherChildrenToNode(Node* node) {
215    std::size_t top_level_children_weight = 0;
216    for (auto c : node->children) {
217      AddOtherChildrenToNode(c);
218      top_level_children_weight += c->weight;
219    }
220    if (top_level_children_weight) {
221      Node* other_child = new Node;
222      other_child->label =
223          node == &root_ ? "other (outside of any label)" : "other";
224      other_child->weight = node->weight - top_level_children_weight;
225      node->children.push_back(other_child);
226    }
227  }
228
229  void AddOtherNodes() { AddOtherChildrenToNode(&root_); }
230
231  void SortNode(Node* node) {
232    std::sort(node->children.begin(), node->children.end(), CompareNodes);
233    for (auto child : node->children) {
234      SortNode(child);
235    }
236  }
237
238  void Sort() { SortNode(&root_); }
239
240 public:
241  explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
242    for (auto stack : stacks) {
243      AddStack(stack);
244    }
245    AddOtherNodes();
246    Sort();
247  }
248
249  void Print() const {
250    printf("\n");
251    printf("gemmlowp profile (%d threads, %d samples)\n",
252           static_cast<int>(ThreadsUnderProfiling().size()),
253           static_cast<int>(root_.weight));
254    PrintNode(&root_, 0);
255    printf("\n");
256  }
257};
258
259// This function is the only place that determines our sampling frequency.
260inline void WaitOneProfilerTick() {
261  static const int millisecond = 1000000;
262
263#if defined __arm__ || defined __aarch64__
264  // Reduced sampling frequency on mobile devices helps limit time and memory
265  // overhead there.
266  static const int interval = 10 * millisecond;
267#else
268  static const int interval = 1 * millisecond;
269#endif
270
271  timespec ts;
272  ts.tv_sec = 0;
273  ts.tv_nsec = interval;
274  nanosleep(&ts, nullptr);
275}
276
277// This is how we track whether we've already started profiling,
278// to guard against misuse of the API.
279inline bool& IsProfiling() {
280  static bool b;
281  return b;
282}
283
284// This is how we tell the profiler thread to finish.
285inline bool& ProfilerThreadShouldFinish() {
286  static bool b;
287  return b;
288}
289
290// The profiler thread. See ProfilerThreadFunc.
291inline pthread_t& ProfilerThread() {
292  static pthread_t t;
293  return t;
294}
295
296// Records a stack from a running thread.
297// The tricky part is that we're not interrupting the thread.
298// This is OK because we're looking at a pseudo-stack of labels,
299// not at the real thread stack, and if the pseudo-stack changes
300// while we're recording it, we are OK with getting either the
301// old or the new stack. Note that ProfilingStack::Pop
302// only decrements the size, and doesn't null the popped label,
303// so if we're concurrently recording it, it shouldn't change
304// under our feet until another label is pushed, at which point
305// we are OK with getting either this new label or the old one.
306// In the end, the key atomicity property that we are relying on
307// here is that pointers are changed atomically, and the labels
308// are pointers (to literal strings).
309inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) {
310  assert(!dst->size);
311  while (dst->size < thread->stack.size) {
312    dst->labels[dst->size] = thread->stack.labels[dst->size];
313    dst->size++;
314    MemoryBarrier();  // thread->stack can change at any time
315  }
316}
317
318// The profiler thread's entry point.
319// Note that a separate thread is to be started each time we call
320// StartProfiling(), and finishes when we call FinishProfiling().
321// So here we only need to handle the recording and reporting of
322// a single profile.
323inline void* ProfilerThreadFunc(void*) {
324  assert(ProfilerThread() == pthread_self());
325
326  // Since we only handle one profile per profiler thread, the
327  // profile data (the array of recorded stacks) can be a local variable here.
328  std::vector<ProfilingStack> stacks;
329
330  while (!ProfilerThreadShouldFinish()) {
331    WaitOneProfilerTick();
332    {
333      AutoGlobalLock<ProfilerLockId> lock;
334      for (auto t : ThreadsUnderProfiling()) {
335        ProfilingStack s;
336        RecordStack(t, &s);
337        stacks.push_back(s);
338      }
339    }
340  }
341
342  // Profiling is finished and we now report the results.
343  ProfileTreeView(stacks).Print();
344
345  return nullptr;
346}
347
348// Starts recording samples.
349inline void StartProfiling() {
350  AutoGlobalLock<ProfilerLockId> lock;
351  ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
352  IsProfiling() = true;
353  ProfilerThreadShouldFinish() = false;
354  pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
355}
356
357// Stops recording samples, and prints a profile tree-view on stdout.
358inline void FinishProfiling() {
359  {
360    AutoGlobalLock<ProfilerLockId> lock;
361    ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
362    // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
363    // as the scary comments below should make clear.
364    // Should we use a condition variable?
365    ProfilerThreadShouldFinish() = true;
366  }  // must release the lock here to avoid deadlock with profiler thread.
367  pthread_join(ProfilerThread(), nullptr);
368  IsProfiling() = false;  // yikes, this should be guarded by the lock!
369}
370
371}  // namespace gemmlowp
372
373#endif  // GEMMLOWP_PROFILING_PROFILER_H_
374