// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <unistd.h>
#ifdef __APPLE__
#include <sys/time.h>
#endif

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

#include "test.h"

#ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
#define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
#endif

#if defined(__arm__) && !defined(GEMMLOWP_NEON)
#warning "Building without NEON support on ARM, check your compiler setup!"
#endif

#if defined(__SSE4_2__) && !defined(GEMMLOWP_SSE4)
#warning \
    "Building without SSE4.2 support on an SSE4.2-enabled machine, check your compiler setup!"
#endif

namespace gemmlowp {

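// Returns the current wall-clock time in seconds. On Apple platforms this
// falls back to gettimeofday(); elsewhere it uses clock_gettime(CLOCK_REALTIME).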
double time() {
#ifdef __APPLE__
  timeval t;
  gettimeofday(&t, nullptr);
  return t.tv_sec + 1e-6 * t.tv_usec;
#else
  timespec t;
  clock_gettime(CLOCK_REALTIME, &t);
  return t.tv_sec + 1e-9 * t.tv_nsec;
#endif
}

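// Benchmarking parameters: a timed run only counts if it lasts at least
// min_accurate_duration seconds, and the matrix pool is sized to at least
// min_working_set_size bytes so that successive iterations are unlikely to
// run entirely out of CPU caches.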
const double min_accurate_duration = 1e-1;
const std::size_t min_working_set_size = 16 * 1024 * 1024;

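// Describes the shape of a single GEMM: a (rows x depth) LHS times a
// (depth x cols) RHS, producing a (rows x cols) result.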
struct gemm_t {
  int rows, depth, cols;
  gemm_t() : rows(0), depth(0), cols(0) {}
  gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
};

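// Lexicographic ordering on (rows, depth, cols), so that gemm_t can be used
// as a std::map key below.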
bool operator<(const gemm_t& a, const gemm_t& b) {
  return a.rows < b.rows ||
         (a.rows == b.rows &&
          (a.depth < b.depth || (a.depth == b.depth && a.cols < b.cols)));
}

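// Times the given list of GEMMs and returns the average wall-clock time per
// iteration, in seconds. Each iteration runs every GEMM in the list once,
// drawing operands from a pool of matrices large enough to exceed
// min_working_set_size.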
template <typename LhsType, typename RhsType, typename ResultType>
double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
  typedef std::uint8_t Scalar;

  // set up the matrix pool

  std::size_t combined_gemm_sizes = 0;
  for (auto gemm : gemms) {
    int rows = gemm.rows;
    int depth = gemm.depth;
    int cols = gemm.cols;
    combined_gemm_sizes +=
        sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
  }

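  // Size the pool so that its total footprint is at least
  // min_working_set_size bytes.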
  const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;

  std::vector<LhsType> lhs(pool_size * gemms.size());
  std::vector<RhsType> rhs(pool_size * gemms.size());
  std::vector<ResultType> result(pool_size * gemms.size());

  for (std::size_t i = 0; i < pool_size; i++) {
    for (std::size_t j = 0; j < gemms.size(); j++) {
      int k = i * gemms.size() + j;
      lhs[k].Resize(gemms[j].rows, gemms[j].depth);
      MakeConstant(&lhs[k], 0);
      rhs[k].Resize(gemms[j].depth, gemms[j].cols);
      MakeConstant(&rhs[k], 0);
      result[k].Resize(gemms[j].rows, gemms[j].cols);
      MakeConstant(&result[k], 0);
    }
  }

  // main benchmark loop

  int iters_at_a_time = 1;
  float time_per_iter = 0.0f;
  std::size_t pool_index = 0;

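  // Keep doubling the number of iterations per timed block until one block
  // takes at least min_accurate_duration; then report the average time per
  // iteration of that block.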
  while (true) {
    double starttime = time();
    for (int i = 0; i < iters_at_a_time; i++) {
      for (size_t j = 0; j < gemms.size(); j++) {
        int k = pool_index * gemms.size() + j;
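        // The trailing arguments are the quantization parameters
        // (lhs_offset, rhs_offset, result_offset, result_mult_int,
        // result_shift); their values are arbitrary here, since only the
        // timing is of interest.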
        Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
            context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
            -75, -91, 74980, 123, 20);
      }
      pool_index++;
      if (pool_index == pool_size) {
        pool_index = 0;
      }
    }
    double endtime = time();

    const float timing = static_cast<float>(endtime - starttime);

    if (timing >= min_accurate_duration) {
      time_per_iter = timing / iters_at_a_time;
      break;
    }

    iters_at_a_time *= 2;
  }

  return time_per_iter;
}

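// Runs time_for_gemms on the given list and converts the result to GFLOP/s,
// counting each multiply-accumulate as two operations
// (2 * rows * depth * cols per GEMM).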
template <typename LhsType, typename RhsType, typename ResultType>
double gflops_for_gemms(GemmContext* context,
                        const std::vector<gemm_t>& gemms) {
  const double time_per_iter =
      time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
  double ops = 0;
  for (auto gemm : gemms) {
    ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
  }
  return 1e-9 * ops / time_per_iter;
}

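// Benchmarks a fixed list of GEMM shapes one at a time, repeating each
// measurement `repeat` times after a warm-up pass, and prints the best
// GFLOP/s figure observed for each shape.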
void benchmark(GemmContext* context) {
  std::map<gemm_t, std::vector<double>> benchmark_results;

  std::vector<gemm_t> benchmark_gemms;
  benchmark_gemms.emplace_back(10, 10, 10);
  benchmark_gemms.emplace_back(20, 20, 20);
  benchmark_gemms.emplace_back(30, 30, 30);
  benchmark_gemms.emplace_back(40, 40, 40);
  benchmark_gemms.emplace_back(50, 50, 50);
  benchmark_gemms.emplace_back(60, 60, 60);
  benchmark_gemms.emplace_back(64, 256, 147);
  benchmark_gemms.emplace_back(100, 100, 1);
  benchmark_gemms.emplace_back(100, 100, 100);
  benchmark_gemms.emplace_back(100, 1000, 100);
  benchmark_gemms.emplace_back(1000, 1000, 1);
  benchmark_gemms.emplace_back(1000, 1000, 10);
  benchmark_gemms.emplace_back(1000, 1000, 100);
  benchmark_gemms.emplace_back(1000, 1000, 1000);

  const int repeat = 2;

  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  // We don't record the first repetition; it's just warm-up.
  for (int r = 0; r < repeat + 1; r++) {
    std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
              << std::flush;
    for (auto gemm : benchmark_gemms) {
      double gflops = 0;
      std::vector<gemm_t> unique_gemm;
      unique_gemm.push_back(gemm);
      gflops =
          gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
      if (r > 0) {
        benchmark_results[gemm].emplace_back(gflops);
      }
    }
  }

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  std::cout << "                                                \r"
            << std::flush;

  std::cout.precision(4);

  for (auto b : benchmark_results) {
    std::sort(b.second.begin(), b.second.end());
    std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
              << " : " << b.second.back() << " GFlops/s" << std::endl;
  }
  std::cout << std::endl;
}

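// Repeatedly times the whole list of GEMMs until at least `mintime` seconds
// have elapsed, then prints latency statistics: best, worst, mean, trimmed
// mean, and the mean of the best runs.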
void benchmark_gemm_sizes(GemmContext* context,
                          const std::vector<gemm_t>& gemms, double mintime) {
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

  std::vector<float> gemm_times;
  std::cout << "running for " << mintime << " seconds..." << std::endl;

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  double starttime = time();
  while (time() < starttime + mintime) {
    gemm_times.push_back(
        time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
  }

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  std::sort(gemm_times.begin(), gemm_times.end());

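  // With the samples sorted, compute the overall mean, a trimmed mean that
  // drops the lowest and highest 25% of samples, and the mean of the best
  // (fastest) 10% of samples.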
  double sum_gemm_times = 0;
  double sum_gemm_times_trimmed = 0;
  int count_gemm_times_trimmed = 0;
  const float trim_ratio = 0.25;
  const size_t count_trimmed = gemm_times.size() * trim_ratio;
  double sum_gemm_times_best = 0;
  int count_gemm_times_best = 0;
  const float best_ratio = 0.1;
  const size_t count_best = gemm_times.size() * best_ratio;

  for (size_t i = 0; i < gemm_times.size(); i++) {
    sum_gemm_times += gemm_times[i];
    if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
      sum_gemm_times_trimmed += gemm_times[i];
      count_gemm_times_trimmed++;
    }
    if (i < count_best) {
      sum_gemm_times_best += gemm_times[i];
      count_gemm_times_best++;
    }
  }

  const double min_latency = gemm_times.front();
  const double max_latency = gemm_times.back();
  const double mean_latency = sum_gemm_times / gemm_times.size();
  const double trimmed_mean_latency =
      sum_gemm_times_trimmed / count_gemm_times_trimmed;
  const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;

  std::cout << "Graph latency (over " << gemm_times.size()
            << " iterations):" << std::endl;
  std::cout << "  Best:             " << min_latency << "s" << std::endl;
  std::cout << "  Worst:            " << max_latency << "s" << std::endl;
  std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
  std::cout << "  " << 100 * trim_ratio
            << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
  std::cout << "  Mean of " << 100 * best_ratio
            << "% best: " << best_mean_latency << "s" << std::endl;
}

void benchmark_googlenet(GemmContext* context) {
  // These are the m, n, k sizes for a typical GoogLeNet.
  const int googlenet_gemm_sizes[] = {
      12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
      784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
      784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
      784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
      196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
      196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
      196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
      196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
      196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
      196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
      196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
      196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
      49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
      49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
      49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
      16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
  };
  assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
         0);
  const std::size_t num_googlenet_gemms =
      sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));

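  // Each consecutive triple in the array is (m, n, k); it maps to
  // rows = n, depth = k, cols = m.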
  std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
  for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
    googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
    googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
    googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
  }

  const double mintime = 20.0;
  benchmark_gemm_sizes(context, googlenet_gemms, mintime);
}

void benchmark_small_model(GemmContext* context) {
  // These are the m, n, k sizes for a small model with large batches.
  const int small_model_gemm_sizes[] = {
      29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
  };
  assert(sizeof(small_model_gemm_sizes) %
             (3 * sizeof(small_model_gemm_sizes[0])) ==
         0);
  const std::size_t num_small_model_gemms =
      sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));

  std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
  for (std::size_t i = 0; i < num_small_model_gemms; i++) {
    small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
    small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
    small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
  }

  const double mintime = 10.0;
  benchmark_gemm_sizes(context, small_model_gemms, mintime);
}

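// Entry point for the benchmark suite: runs the small-model and GoogLeNet
// latency benchmarks, then the per-shape GFLOP/s benchmark in multi-threaded
// and single-threaded modes.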
void benchmark_all() {
  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking small model GEMMs..." << std::endl;
    gemmlowp::benchmark_small_model(&context);
  }

  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
    gemmlowp::benchmark_googlenet(&context);
  }

  {
    gemmlowp::GemmContext context;
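    // A max_num_threads value of 0 asks gemmlowp to pick the number of
    // threads automatically, i.e. multi-threaded mode.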
    context.set_max_num_threads(0);
    std::cout << "Benchmarking multi-threaded mode..." << std::endl;
    gemmlowp::benchmark(&context);
  }

  {
    gemmlowp::GemmContext context;
    context.set_max_num_threads(1);
    std::cout << "Benchmarking single-threaded mode..." << std::endl;
    gemmlowp::benchmark(&context);
  }
}

}  // end namespace gemmlowp

// For iOS, we need to define our own main(), so skip it here.
#if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
int main() { gemmlowp::benchmark_all(); }
#endif