11963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Licensed under the Apache License, Version 2.0 (the "License"); 475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// you may not use this file except in compliance with the License. 575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// You may obtain a copy of the License at 675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// http://www.apache.org/licenses/LICENSE-2.0 875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Unless required by applicable law or agreed to in writing, software 1075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// distributed under the License is distributed on an "AS IS" BASIS, 1175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// See the License for the specific language governing permissions and 1375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// limitations under the License. 1475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 1575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// profiler.h: a simple sampling profiler that's always just one #include away! 1675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 1775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Overview 1875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ======== 1975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 2075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This profiler only samples a pseudo-stack, not the actual call stack. 2175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The code to be profiled needs to be instrumented with 2275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// pseudo-stack "labels", see ScopedProfilingLabel. 2375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Using pseudo-stacks allows this profiler to be very simple, low-overhead, 2475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// portable, and independent of compilation details such as function inlining 2575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// and frame pointers. The granularity of instrumentation can be freely chosen, 2675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// and it is possible to get some annotate-like detail, i.e. detail within one 2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// function without splitting it into multiple functions. 2875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This profiler should remain small and simple; its key feature is to fit in 3075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// a single header file so that there should never be a reason to refrain 3175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// from profiling. More complex and feature-rich alternatives are 3275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// readily available. This one offers a strict superset of its 3375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// functionality: https://github.com/bgirard/GeckoProfiler, including 3475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// intertwining pseudostacks with real call stacks, more annotation options, 3575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// and advanced visualization. 3675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 3775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Usage 3875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ===== 3975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is 41cb3e140fae8e2be7cc5043d7c4f217bb57df5250Benoit Jacob// not enabled, profiling instrumentation from instrumentation.h 4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// (ScopedProfilingLabel, RegisterCurrentThreadForProfiling) 4375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// is still defined but does nothing. On the other hand, 4475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// when profiling is not enabled, it is an error to #include the 4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// present file. 4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 4775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 1. Each thread can opt in to profiling by calling 48cb3e140fae8e2be7cc5043d7c4f217bb57df5250Benoit Jacob// RegisterCurrentThreadForProfiling() defined in instrumentation.h. 4975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This can be done at any time, before or during profiling. 5075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// No sample will be collected from a thread until 5175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// it has called RegisterCurrentThreadForProfiling(). 5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 5375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 2. Instrument your code to be profiled with ScopedProfilingLabel, 54cb3e140fae8e2be7cc5043d7c4f217bb57df5250Benoit Jacob// which is a RAII helper defined in instrumentation.h. The identifier 5575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// names (some_label, etc) do not matter; what will show up 5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// in the profile is the string passed to the constructor, which 5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// must be a literal string. See the full example below. 5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 5975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Note: the overhead of ScopedProfilingLabel is zero when not 6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// enabling profiling (when not defining GEMMLOWP_PROFILING). 6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 3. Use the profiler.h interface to control profiling. There are two 6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// functions: StartProfiling() and FinishProfiling(). They must be 6475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// called on the same thread. FinishProfiling() prints the profile 6575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// on stdout. 6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Full example 6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ============ 6975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob/* 7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob #define GEMMLOWP_PROFILING 7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob #include "profiling/instrumentation.h" 7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob using namespace gemmlowp; 7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const int iters = 100000000; 7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob volatile int i; 7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 7775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void Bar() { 7875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ScopedProfilingLabel label("Bar"); 7975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (i = 0; i < iters; i++) {} 8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 8275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void Foo() { 8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ScopedProfilingLabel label("Foo"); 8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (i = 0; i < iters; i++) {} 8575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Bar(); 8675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void Init() { 8975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob RegisterCurrentThreadForProfiling(); 9075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 9275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob #include "profiling/profiler.h" 9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 9475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int main() { 9575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Init(); 9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob StartProfiling(); 9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Foo(); 9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob FinishProfiling(); 9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob* 10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob* Output: 10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob* 10375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob gemmlowp profile (1 threads, 304 samples) 10475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 100.00% Foo 10575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 51.32% other 10675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 48.68% Bar 10775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 0.00% other (outside of any label) 10875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob*/ 10975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 11075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Interpreting results 11175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ==================== 11275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 11375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Each node shows the absolute percentage, among all the samples, 11475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// of the number of samples that recorded the given pseudo-stack. 11575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The percentages are *NOT* relative to the parent node. In addition 11675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// to your own labels, you will also see 'other' nodes that collect 11775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// the remainder of samples under the parent node that didn't fall into 11875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// any of the labelled child nodes. Example: 11975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 12075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 20% Foo 12175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 12% Bar 12275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 6% Xyz 12375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 2% other 12475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 12575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This means that 20% of all labels were under Foo, of which 12%/20%==60% 12675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not 12775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// under either Bar or Xyz. 12875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 12975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Typically, one wants to keep adding ScopedProfilingLabel's until 13075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// the 'other' nodes show low percentages. 13175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 13275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Interpreting results with multiple threads 13375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// ========================================== 13475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 13575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// At each sample, each thread registered for profiling gets sampled once. 13675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// So if there is one "main thread" spending its time in MainFunc() and 13775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the 13875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// samples will be in WorkerFunc, so the profile will look like this: 13975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 14075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 80% WorkerFunc 14175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 20% MainFunc 14275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 14375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_PROFILING_PROFILER_H_ 14475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_PROFILING_PROFILER_H_ 14575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 14675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_PROFILING 14775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#error Profiling is not enabled! 14875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif 14975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 15075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#include <vector> 15175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 152544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "instrumentation.h" 15375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 15475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp { 15575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 15675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// A tree view of a profile. 15775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobclass ProfileTreeView { 15875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob struct Node { 15975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::vector<Node*> children; 16075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const char* label; 16175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::size_t weight; 16275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Node() : label(nullptr), weight(0) {} 16375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ~Node() { 16475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (auto child : children) { 16575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob delete child; 16675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 16775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 16875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob }; 16975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 17075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static bool CompareNodes(Node* n1, Node* n2) { 17175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return n1->weight > n2->weight; 17275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 17375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 17475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Node root_; 17575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 17675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void PrintNode(const Node* node, int level) const { 17775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob if (level) { 17875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int i = 1; i < level; i++) { 17975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob printf(" "); 18075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 18175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label); 18275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 18375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (auto child : node->children) { 18475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob PrintNode(child, level + 1); 18575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 18675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 18775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 18875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static void AddStackToNode(const ProfilingStack& stack, Node* node, 18975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::size_t level) { 19075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob node->weight++; 19175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob if (stack.size == level) { 19275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return; 19375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 19475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Node* child_to_add_to = nullptr; 19575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (auto child : node->children) { 19675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob if (child->label == stack.labels[level]) { 19775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob child_to_add_to = child; 19875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob break; 19975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 20075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 20175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob if (!child_to_add_to) { 20275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob child_to_add_to = new Node; 20375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob child_to_add_to->label = stack.labels[level]; 20475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob node->children.push_back(child_to_add_to); 20575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 20675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob AddStackToNode(stack, child_to_add_to, level + 1); 20775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return; 20875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 20975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 21075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void AddStack(const ProfilingStack& stack) { 21175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob AddStackToNode(stack, &root_, 0); 21275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 21375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 21475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void AddOtherChildrenToNode(Node* node) { 21575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::size_t top_level_children_weight = 0; 21675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (auto c : node->children) { 21775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob AddOtherChildrenToNode(c); 21875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob top_level_children_weight += c->weight; 21975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 22075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob if (top_level_children_weight) { 22175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Node* other_child = new Node; 22275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob other_child->label = 22375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob node == &root_ ? "other (outside of any label)" : "other"; 22475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob other_child->weight = node->weight - top_level_children_weight; 22575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob node->children.push_back(other_child); 22675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 22775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 22875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 22975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void AddOtherNodes() { AddOtherChildrenToNode(&root_); } 23075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 23175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void SortNode(Node* node) { 23275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::sort(node->children.begin(), node->children.end(), CompareNodes); 23375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (auto child : node->children) { 23475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob SortNode(child); 23575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 23675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 23775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 23875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void Sort() { SortNode(&root_); } 23975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 24075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public: 24175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) { 24275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (auto stack : stacks) { 24375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob AddStack(stack); 24475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 24575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob AddOtherNodes(); 24675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Sort(); 24775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 24875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 24975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void Print() const { 25075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob printf("\n"); 25175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob printf("gemmlowp profile (%d threads, %d samples)\n", 25275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static_cast<int>(ThreadsUnderProfiling().size()), 25375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static_cast<int>(root_.weight)); 25475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob PrintNode(&root_, 0); 25575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob printf("\n"); 25675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 25775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}; 25875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 25975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This function is the only place that determines our sampling frequency. 26075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void WaitOneProfilerTick() { 26175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static const int millisecond = 1000000; 26275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 26375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#if defined __arm__ || defined __aarch64__ 26475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // Reduced sampling frequency on mobile devices helps limit time and memory 26575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // overhead there. 26675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static const int interval = 10 * millisecond; 26775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#else 26875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static const int interval = 1 * millisecond; 26975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif 27075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 27175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob timespec ts; 27275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ts.tv_sec = 0; 27375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ts.tv_nsec = interval; 27475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob nanosleep(&ts, nullptr); 27575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 27675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 27775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This is how we track whether we've already started profiling, 27875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// to guard against misuse of the API. 27975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline bool& IsProfiling() { 28075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static bool b; 28175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return b; 28275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 28375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 28475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This is how we tell the profiler thread to finish. 28575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline bool& ProfilerThreadShouldFinish() { 28675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static bool b; 28775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return b; 28875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 28975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 29075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The profiler thread. See ProfilerThreadFunc. 29175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline pthread_t& ProfilerThread() { 29275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static pthread_t t; 29375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return t; 29475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 29575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 29675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Records a stack from a running thread. 29775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The tricky part is that we're not interrupting the thread. 29875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This is OK because we're looking at a pseudo-stack of labels, 29975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// not at the real thread stack, and if the pseudo-stack changes 30075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// while we're recording it, we are OK with getting either the 30175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// old or the new stack. Note that ProfilingStack::Pop 30275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// only decrements the size, and doesn't null the popped label, 30375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// so if we're concurrently recording it, it shouldn't change 30475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// under our feet until another label is pushed, at which point 30575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// we are OK with getting either this new label or the old one. 30675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// In the end, the key atomicity property that we are relying on 30775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// here is that pointers are changed atomically, and the labels 30875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// are pointers (to literal strings). 3091963df9ac4a0424674e72ef5da522b5d830605fdMiao Wanginline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) { 3101963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang ScopedLock sl(thread->stack.lock); 31175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob assert(!dst->size); 31275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob while (dst->size < thread->stack.size) { 31375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob dst->labels[dst->size] = thread->stack.labels[dst->size]; 31475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob dst->size++; 31575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 31675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 31775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 31875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// The profiler thread's entry point. 31975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Note that a separate thread is to be started each time we call 32075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// StartProfiling(), and finishes when we call FinishProfiling(). 32175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// So here we only need to handle the recording and reporting of 32275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// a single profile. 32375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void* ProfilerThreadFunc(void*) { 32475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob assert(ProfilerThread() == pthread_self()); 32575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 32675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // Since we only handle one profile per profiler thread, the 32775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // profile data (the array of recorded stacks) can be a local variable here. 32875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::vector<ProfilingStack> stacks; 32975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 33075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob while (!ProfilerThreadShouldFinish()) { 33175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob WaitOneProfilerTick(); 33275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob { 3331963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang ScopedLock sl(GlobalMutexes::Profiler()); 33475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (auto t : ThreadsUnderProfiling()) { 33575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ProfilingStack s; 33675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob RecordStack(t, &s); 33775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob stacks.push_back(s); 33875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 33975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 34075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 34175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 34275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // Profiling is finished and we now report the results. 34375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ProfileTreeView(stacks).Print(); 34475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 34575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return nullptr; 34675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 34775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 34875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Starts recording samples. 34975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void StartProfiling() { 3501963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang ScopedLock sl(GlobalMutexes::Profiler()); 35175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); 35275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob IsProfiling() = true; 35375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ProfilerThreadShouldFinish() = false; 35475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr); 35575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 35675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 35775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Stops recording samples, and prints a profile tree-view on stdout. 35875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobinline void FinishProfiling() { 35975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob { 3601963df9ac4a0424674e72ef5da522b5d830605fdMiao Wang ScopedLock sl(GlobalMutexes::Profiler()); 36175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); 36275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, 36375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // as the scary comments below should make clear. 36475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // Should we use a condition variable? 36575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ProfilerThreadShouldFinish() = true; 36675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } // must release the lock here to avoid deadlock with profiler thread. 36775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob pthread_join(ProfilerThread(), nullptr); 36875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob IsProfiling() = false; // yikes, this should be guarded by the lock! 36975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 37075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 37175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} // namespace gemmlowp 37275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 37375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif // GEMMLOWP_PROFILING_PROFILER_H_ 374