11963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Licensed under the Apache License, Version 2.0 (the "License");
475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// you may not use this file except in compliance with the License.
575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// You may obtain a copy of the License at
675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//     http://www.apache.org/licenses/LICENSE-2.0
875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Unless required by applicable law or agreed to in writing, software
1075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// distributed under the License is distributed on an "AS IS" BASIS,
1175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// See the License for the specific language governing permissions and
1375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// limitations under the License.
1475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
1575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// profiler.h: a simple sampling profiler that's always just one #include away!
1675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
1775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Overview
1875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ========
1975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
2075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This profiler only samples a pseudo-stack, not the actual call stack.
2175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The code to be profiled needs to be instrumented with
2275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// pseudo-stack "labels", see ScopedProfilingLabel.
2375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
2475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// portable, and independent of compilation details such as function inlining
2575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// and frame pointers. The granularity of instrumentation can be freely chosen,
2675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// and it is possible to get some annotate-like detail, i.e. detail within one
2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// function without splitting it into multiple functions.
2875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This profiler should remain small and simple; its key feature is to fit in
3075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// a single header file so that there should never be a reason to refrain
3175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// from profiling. More complex and feature-rich alternatives are
3275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// readily available. This one offers a strict superset of its
3375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// functionality: https://github.com/bgirard/GeckoProfiler, including
3475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// intertwining pseudostacks with real call stacks, more annotation options,
3575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// and advanced visualization.
3675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
3775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Usage
3875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// =====
3975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
41cb3e140fae8e2be7cc5043d7c4f217bb57df5250Benoit Jacob//    not enabled, profiling instrumentation from instrumentation.h
4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
4375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    is still defined but does nothing. On the other hand,
4475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    when profiling is not enabled, it is an error to #include the
4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    present file.
4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
4775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 1. Each thread can opt in to profiling by calling
48cb3e140fae8e2be7cc5043d7c4f217bb57df5250Benoit Jacob//    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
4975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    This can be done at any time, before or during profiling.
5075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    No sample will be collected from a thread until
5175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    it has called RegisterCurrentThreadForProfiling().
5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
5375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 2. Instrument your code to be profiled with ScopedProfilingLabel,
54cb3e140fae8e2be7cc5043d7c4f217bb57df5250Benoit Jacob//    which is a RAII helper defined in instrumentation.h. The identifier
5575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    names (some_label, etc) do not matter; what will show up
5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    in the profile is the string passed to the constructor, which
5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    must be a literal string. See the full example below.
5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
5975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    Note: the overhead of ScopedProfilingLabel is zero when not
6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    enabling profiling (when not defining GEMMLOWP_PROFILING).
6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 3. Use the profiler.h interface to control profiling. There are two
6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    functions: StartProfiling() and FinishProfiling(). They must be
6475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    called on the same thread. FinishProfiling() prints the profile
6575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//    on stdout.
6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Full example
6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ============
6975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob/*
7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    #define GEMMLOWP_PROFILING
7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    #include "profiling/instrumentation.h"
7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    using namespace gemmlowp;
7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    const int iters = 100000000;
7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    volatile int i;
7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    void Bar() {
7875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      ScopedProfilingLabel label("Bar");
7975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (i = 0; i < iters; i++) {}
8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
8175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
8275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    void Foo() {
8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      ScopedProfilingLabel label("Foo");
8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (i = 0; i < iters; i++) {}
8575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      Bar();
8675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    void Init() {
8975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      RegisterCurrentThreadForProfiling();
9075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
9175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
9275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    #include "profiling/profiler.h"
9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
9475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    int main() {
9575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      Init();
9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      StartProfiling();
9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      Foo();
9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      FinishProfiling();
9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob*
10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob* Output:
10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob*
10375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    gemmlowp profile (1 threads, 304 samples)
10475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    100.00% Foo
10575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        51.32% other
10675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        48.68% Bar
10775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    0.00% other (outside of any label)
10875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob*/
10975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
11075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Interpreting results
11175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ====================
11275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
11375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  Each node shows the absolute percentage, among all the samples,
11475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  of the number of samples that recorded the given pseudo-stack.
11575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  The percentages are *NOT* relative to the parent node. In addition
11675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  to your own labels, you will also see 'other' nodes that collect
11775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  the remainder of samples under the parent node that didn't fall into
11875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  any of the labelled child nodes. Example:
11975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
12075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  20% Foo
12175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//      12% Bar
12275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//      6% Xyz
12375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//      2% other
12475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
12575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  This means that 20% of all labels were under Foo, of which 12%/20%==60%
12675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
12775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  under either Bar or Xyz.
12875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
12975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  Typically, one wants to keep adding ScopedProfilingLabel's until
13075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//  the 'other' nodes show low percentages.
13175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
13275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Interpreting results with multiple threads
13375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ==========================================
13475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
13575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// At each sample, each thread registered for profiling gets sampled once.
13675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// So if there is one "main thread" spending its time in MainFunc() and
13775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
13875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// samples will be in WorkerFunc, so the profile will look like this:
13975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
14075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 80% WorkerFunc
14175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 20% MainFunc
14275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
14375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_PROFILING_PROFILER_H_
14475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_PROFILING_PROFILER_H_
14575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
14675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_PROFILING
14775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#error Profiling is not enabled!
14875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif
14975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
15075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#include <vector>
15175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
152544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "instrumentation.h"
15375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
15475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp {
15575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
15675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// A tree view of a profile.
15775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobclass ProfileTreeView {
15875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  struct Node {
15975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    std::vector<Node*> children;
16075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    const char* label;
16175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    std::size_t weight;
16275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    Node() : label(nullptr), weight(0) {}
16375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    ~Node() {
16475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (auto child : children) {
16575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        delete child;
16675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
16775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
16875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  };
16975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
17075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static bool CompareNodes(Node* n1, Node* n2) {
17175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    return n1->weight > n2->weight;
17275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
17375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
17475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  Node root_;
17575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
17675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void PrintNode(const Node* node, int level) const {
17775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    if (level) {
17875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (int i = 1; i < level; i++) {
17975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        printf("    ");
18075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
18175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
18275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
18375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (auto child : node->children) {
18475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      PrintNode(child, level + 1);
18575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
18675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
18775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
18875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static void AddStackToNode(const ProfilingStack& stack, Node* node,
18975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                             std::size_t level) {
19075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    node->weight++;
19175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    if (stack.size == level) {
19275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      return;
19375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
19475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    Node* child_to_add_to = nullptr;
19575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (auto child : node->children) {
19675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      if (child->label == stack.labels[level]) {
19775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        child_to_add_to = child;
19875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        break;
19975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
20075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
20175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    if (!child_to_add_to) {
20275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      child_to_add_to = new Node;
20375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      child_to_add_to->label = stack.labels[level];
20475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      node->children.push_back(child_to_add_to);
20575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
20675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    AddStackToNode(stack, child_to_add_to, level + 1);
20775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    return;
20875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
20975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
21075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void AddStack(const ProfilingStack& stack) {
21175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    AddStackToNode(stack, &root_, 0);
21275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
21375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
21475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void AddOtherChildrenToNode(Node* node) {
21575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    std::size_t top_level_children_weight = 0;
21675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (auto c : node->children) {
21775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      AddOtherChildrenToNode(c);
21875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      top_level_children_weight += c->weight;
21975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
22075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    if (top_level_children_weight) {
22175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      Node* other_child = new Node;
22275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      other_child->label =
22375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          node == &root_ ? "other (outside of any label)" : "other";
22475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      other_child->weight = node->weight - top_level_children_weight;
22575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      node->children.push_back(other_child);
22675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
22775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
22875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
22975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void AddOtherNodes() { AddOtherChildrenToNode(&root_); }
23075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
23175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void SortNode(Node* node) {
23275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    std::sort(node->children.begin(), node->children.end(), CompareNodes);
23375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (auto child : node->children) {
23475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      SortNode(child);
23575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
23675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
23775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
23875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void Sort() { SortNode(&root_); }
23975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
24075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public:
24175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
24275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (auto stack : stacks) {
24375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      AddStack(stack);
24475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
24575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    AddOtherNodes();
24675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    Sort();
24775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
24875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
24975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void Print() const {
25075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    printf("\n");
25175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    printf("gemmlowp profile (%d threads, %d samples)\n",
25275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob           static_cast<int>(ThreadsUnderProfiling().size()),
25375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob           static_cast<int>(root_.weight));
25475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    PrintNode(&root_, 0);
25575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    printf("\n");
25675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
25775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob};
25875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
25975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This function is the only place that determines our sampling frequency.
26075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void WaitOneProfilerTick() {
26175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static const int millisecond = 1000000;
26275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
26375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#if defined __arm__ || defined __aarch64__
26475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // Reduced sampling frequency on mobile devices helps limit time and memory
26575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // overhead there.
26675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static const int interval = 10 * millisecond;
26775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#else
26875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static const int interval = 1 * millisecond;
26975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif
27075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
27175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  timespec ts;
27275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ts.tv_sec = 0;
27375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ts.tv_nsec = interval;
27475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  nanosleep(&ts, nullptr);
27575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
27675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
27775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This is how we track whether we've already started profiling,
27875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// to guard against misuse of the API.
27975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline bool& IsProfiling() {
28075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static bool b;
28175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  return b;
28275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
28375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
28475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This is how we tell the profiler thread to finish.
28575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline bool& ProfilerThreadShouldFinish() {
28675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static bool b;
28775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  return b;
28875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
28975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
29075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The profiler thread. See ProfilerThreadFunc.
29175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline pthread_t& ProfilerThread() {
29275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static pthread_t t;
29375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  return t;
29475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
29575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
29675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Records a stack from a running thread.
29775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The tricky part is that we're not interrupting the thread.
29875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This is OK because we're looking at a pseudo-stack of labels,
29975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// not at the real thread stack, and if the pseudo-stack changes
30075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// while we're recording it, we are OK with getting either the
30175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// old or the new stack. Note that ProfilingStack::Pop
30275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// only decrements the size, and doesn't null the popped label,
30375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// so if we're concurrently recording it, it shouldn't change
30475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// under our feet until another label is pushed, at which point
30575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// we are OK with getting either this new label or the old one.
30675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// In the end, the key atomicity property that we are relying on
30775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// here is that pointers are changed atomically, and the labels
30875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// are pointers (to literal strings).
3091963df9ac4a0424674e72ef5da522b5d830605fdMiao Wanginline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) {
3101963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang  ScopedLock sl(thread->stack.lock);
31175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  assert(!dst->size);
31275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  while (dst->size < thread->stack.size) {
31375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    dst->labels[dst->size] = thread->stack.labels[dst->size];
31475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    dst->size++;
31575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
31675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
31775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
31875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The profiler thread's entry point.
31975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Note that a separate thread is to be started each time we call
32075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// StartProfiling(), and finishes when we call FinishProfiling().
32175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// So here we only need to handle the recording and reporting of
32275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// a single profile.
32375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void* ProfilerThreadFunc(void*) {
32475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  assert(ProfilerThread() == pthread_self());
32575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
32675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // Since we only handle one profile per profiler thread, the
32775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // profile data (the array of recorded stacks) can be a local variable here.
32875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  std::vector<ProfilingStack> stacks;
32975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
33075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  while (!ProfilerThreadShouldFinish()) {
33175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    WaitOneProfilerTick();
33275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    {
3331963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang      ScopedLock sl(GlobalMutexes::Profiler());
33475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (auto t : ThreadsUnderProfiling()) {
33575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        ProfilingStack s;
33675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        RecordStack(t, &s);
33775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        stacks.push_back(s);
33875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
33975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
34075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
34175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
34275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // Profiling is finished and we now report the results.
34375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ProfileTreeView(stacks).Print();
34475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
34575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  return nullptr;
34675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
34775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
34875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Starts recording samples.
34975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void StartProfiling() {
3501963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang  ScopedLock sl(GlobalMutexes::Profiler());
35175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
35275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  IsProfiling() = true;
35375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ProfilerThreadShouldFinish() = false;
35475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
35575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
35675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
35775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Stops recording samples, and prints a profile tree-view on stdout.
35875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void FinishProfiling() {
35975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  {
3601963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang    ScopedLock sl(GlobalMutexes::Profiler());
36175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
36275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
36375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    // as the scary comments below should make clear.
36475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    // Should we use a condition variable?
36575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    ProfilerThreadShouldFinish() = true;
36675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }  // must release the lock here to avoid deadlock with profiler thread.
36775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  pthread_join(ProfilerThread(), nullptr);
36875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  IsProfiling() = false;  // yikes, this should be guarded by the lock!
36975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
37075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
37175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}  // namespace gemmlowp
37275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
37375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif  // GEMMLOWP_PROFILING_PROFILER_H_
374