// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <unistd.h>
#ifdef __APPLE__
#include <sys/time.h>
#endif

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <tuple>
#include <vector>
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

#include "test.h"

#ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
#define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
#endif

#if defined(__arm__) && !defined(GEMMLOWP_NEON)
#warning "Building without NEON support on ARM, check your compiler setup!"
#endif

#if defined(__SSE4_2__) && !defined(GEMMLOWP_SSE4)
#warning \
    "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
43#endif 44 45namespace gemmlowp { 46 47double time() { 48#ifdef __APPLE__ 49 timeval t; 50 gettimeofday(&t, nullptr); 51 return t.tv_sec + 1e-6 * t.tv_usec; 52#else 53 timespec t; 54 clock_gettime(CLOCK_REALTIME, &t); 55 return t.tv_sec + 1e-9 * t.tv_nsec; 56#endif 57} 58 59const double min_accurate_duration = 1e-1; 60const std::size_t min_working_set_size = 16 * 1024 * 1024; 61 62struct gemm_t { 63 int rows, depth, cols; 64 gemm_t() : rows(0), depth(0), cols(0) {} 65 gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {} 66}; 67 68bool operator<(const gemm_t& a, const gemm_t& b) { 69 return a.rows < b.rows || 70 (a.rows <= b.rows && 71 (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols)))); 72} 73 74template <typename LhsType, typename RhsType, typename ResultType> 75double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) { 76 typedef std::uint8_t Scalar; 77 78 // set up the matrix pool 79 80 std::size_t combined_gemm_sizes = 0; 81 for (auto gemm : gemms) { 82 int rows = gemm.rows; 83 int depth = gemm.depth; 84 int cols = gemm.cols; 85 combined_gemm_sizes += 86 sizeof(Scalar) * (rows * depth + depth * cols + rows * cols); 87 } 88 89 const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes; 90 91 std::vector<LhsType> lhs(pool_size * gemms.size()); 92 std::vector<RhsType> rhs(pool_size * gemms.size()); 93 std::vector<ResultType> result(pool_size * gemms.size()); 94 95 for (std::size_t i = 0; i < pool_size; i++) { 96 for (std::size_t j = 0; j < gemms.size(); j++) { 97 int k = i * gemms.size() + j; 98 lhs[k].Resize(gemms[j].rows, gemms[j].depth); 99 MakeConstant(&lhs[k], 0); 100 rhs[k].Resize(gemms[j].depth, gemms[j].cols); 101 MakeConstant(&rhs[k], 0); 102 result[k].Resize(gemms[j].rows, gemms[j].cols); 103 MakeConstant(&result[k], 0); 104 } 105 } 106 107 // main benchmark loop 108 109 int iters_at_a_time = 1; 110 float time_per_iter = 0.0f; 111 std::size_t pool_index = 0; 112 113 while (true) { 114 double 
starttime = time(); 115 for (int i = 0; i < iters_at_a_time; i++) { 116 for (size_t j = 0; j < gemms.size(); j++) { 117 int k = pool_index * gemms.size() + j; 118 Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>( 119 context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(), 120 -75, -91, 74980, 123, 20); 121 } 122 pool_index++; 123 if (pool_index == pool_size) { 124 pool_index = 0; 125 } 126 } 127 double endtime = time(); 128 129 const float timing = static_cast<float>(endtime - starttime); 130 131 if (timing >= min_accurate_duration) { 132 time_per_iter = timing / iters_at_a_time; 133 break; 134 } 135 136 iters_at_a_time *= 2; 137 } 138 139 return time_per_iter; 140} 141 142template <typename LhsType, typename RhsType, typename ResultType> 143double gflops_for_gemms(GemmContext* context, 144 const std::vector<gemm_t>& gemms) { 145 const double time_per_iter = 146 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms); 147 double ops = 0; 148 for (auto gemm : gemms) { 149 ops += 2.0 * gemm.rows * gemm.depth * gemm.cols; 150 } 151 return 1e-9 * ops / time_per_iter; 152} 153 154void benchmark(GemmContext* context) { 155 std::map<gemm_t, std::vector<double>> benchmark_results; 156 157 std::vector<gemm_t> benchmark_gemms; 158 benchmark_gemms.emplace_back(10, 10, 10); 159 benchmark_gemms.emplace_back(20, 20, 20); 160 benchmark_gemms.emplace_back(30, 30, 30); 161 benchmark_gemms.emplace_back(40, 40, 40); 162 benchmark_gemms.emplace_back(50, 50, 50); 163 benchmark_gemms.emplace_back(60, 60, 60); 164 benchmark_gemms.emplace_back(64, 256, 147); 165 benchmark_gemms.emplace_back(100, 100, 1); 166 benchmark_gemms.emplace_back(100, 100, 100); 167 benchmark_gemms.emplace_back(100, 1000, 100); 168 benchmark_gemms.emplace_back(1000, 1000, 1); 169 benchmark_gemms.emplace_back(1000, 1000, 10); 170 benchmark_gemms.emplace_back(1000, 1000, 100); 171 benchmark_gemms.emplace_back(1000, 1000, 1000); 172 173 const int repeat = 2; 174 175 typedef Matrix<std::uint8_t, 
MapOrder::RowMajor> LhsType; 176 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; 177 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; 178 179#ifdef GEMMLOWP_TEST_PROFILE 180 gemmlowp::RegisterCurrentThreadForProfiling(); 181 gemmlowp::StartProfiling(); 182#endif 183 184 // We don't record the first repetition, it's just warm-up. 185 for (int r = 0; r < repeat + 1; r++) { 186 std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r" 187 << std::flush; 188 for (auto gemm : benchmark_gemms) { 189 double gflops = 0; 190 std::vector<gemm_t> unique_gemm; 191 unique_gemm.push_back(gemm); 192 gflops = 193 gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm); 194 if (r > 0) { 195 benchmark_results[gemm].emplace_back(gflops); 196 } 197 } 198 } 199 200#ifdef GEMMLOWP_TEST_PROFILE 201 gemmlowp::FinishProfiling(); 202#endif 203 204 std::cout << " \r" 205 << std::flush; 206 207 std::cout.precision(4); 208 209 for (auto b : benchmark_results) { 210 sort(b.second.begin(), b.second.end()); 211 std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols 212 << " : " << b.second.back() << " GFlops/s" << std::endl; 213 } 214 std::cout << std::endl; 215} 216 217void benchmark_gemm_sizes(GemmContext* context, 218 const std::vector<gemm_t>& gemms, double mintime) { 219 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; 220 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; 221 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; 222 223 std::vector<float> gemm_times; 224 std::cout << "running for " << mintime << " seconds..." 
<< std::endl; 225 226#ifdef GEMMLOWP_TEST_PROFILE 227 gemmlowp::RegisterCurrentThreadForProfiling(); 228 gemmlowp::StartProfiling(); 229#endif 230 231 double starttime = time(); 232 while (time() < starttime + mintime) { 233 gemm_times.push_back( 234 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms)); 235 } 236 237#ifdef GEMMLOWP_TEST_PROFILE 238 gemmlowp::FinishProfiling(); 239#endif 240 241 std::sort(gemm_times.begin(), gemm_times.end()); 242 243 double sum_gemm_times = 0; 244 double sum_gemm_times_trimmed = 0; 245 int count_gemm_times_trimmed = 0; 246 const float trim_ratio = 0.25; 247 const size_t count_trimmed = gemm_times.size() * trim_ratio; 248 double sum_gemm_times_best = 0; 249 int count_gemm_times_best = 0; 250 const float best_ratio = 0.1; 251 const size_t count_best = gemm_times.size() * best_ratio; 252 253 for (size_t i = 0; i < gemm_times.size(); i++) { 254 sum_gemm_times += gemm_times[i]; 255 if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) { 256 sum_gemm_times_trimmed += gemm_times[i]; 257 count_gemm_times_trimmed++; 258 } 259 if (i < count_best) { 260 sum_gemm_times_best += gemm_times[i]; 261 count_gemm_times_best++; 262 } 263 } 264 265 const double min_latency = gemm_times.front(); 266 const double max_latency = gemm_times.back(); 267 const double mean_latency = sum_gemm_times / gemm_times.size(); 268 const double trimmed_mean_latency = 269 sum_gemm_times_trimmed / count_gemm_times_trimmed; 270 const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best; 271 272 std::cout << "Graph latency (over " << gemm_times.size() 273 << " iterations):" << std::endl; 274 std::cout << " Best: " << min_latency << "s" << std::endl; 275 std::cout << " Worst: " << max_latency << "s" << std::endl; 276 std::cout << " Mean: " << mean_latency << "s" << std::endl; 277 std::cout << " " << 100 * trim_ratio 278 << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl; 279 std::cout << " Mean of " << 100 * best_ratio 280 
<< "% best: " << best_mean_latency << "s" << std::endl; 281} 282 283void benchmark_googlenet(GemmContext* context) { 284 // These are the m, n, k sizes for a typical GoogLeNet. 285 const int googlenet_gemm_sizes[] = { 286 12544, 64, 147, 3136, 64, 64, 3136, 192, 576, 784, 64, 192, 287 784, 96, 192, 784, 128, 864, 784, 16, 192, 784, 32, 400, 288 784, 32, 192, 784, 128, 256, 784, 128, 256, 784, 192, 1152, 289 784, 32, 256, 784, 96, 800, 784, 64, 256, 196, 192, 480, 290 196, 96, 480, 196, 204, 864, 196, 16, 480, 196, 48, 400, 291 196, 64, 480, 196, 160, 508, 196, 112, 508, 196, 224, 1008, 292 196, 24, 508, 196, 64, 600, 196, 64, 508, 196, 128, 512, 293 196, 128, 512, 196, 256, 1152, 196, 24, 512, 196, 64, 600, 294 196, 64, 512, 196, 112, 512, 196, 144, 512, 196, 288, 1296, 295 196, 32, 512, 196, 64, 800, 196, 64, 512, 196, 256, 528, 296 196, 160, 528, 196, 320, 1440, 196, 32, 528, 196, 128, 800, 297 196, 128, 528, 49, 256, 832, 49, 160, 832, 49, 320, 1440, 298 49, 48, 832, 49, 128, 1200, 49, 128, 832, 49, 384, 832, 299 49, 192, 832, 49, 384, 1728, 49, 48, 832, 49, 128, 1200, 300 49, 128, 832, 16, 128, 508, 1, 1024, 2048, 1, 1008, 1024, 301 16, 128, 528, 1, 1024, 2048, 1, 1008, 1024, 1, 1008, 1024, 302 }; 303 assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) == 304 0); 305 const std::size_t num_googlenet_gemms = 306 sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0])); 307 308 std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms); 309 for (std::size_t i = 0; i < num_googlenet_gemms; i++) { 310 googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1]; 311 googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2]; 312 googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0]; 313 } 314 315 const double mintime = 20.0; 316 benchmark_gemm_sizes(context, googlenet_gemms, mintime); 317} 318 319void benchmark_small_model(GemmContext* context) { 320 // These are the m, n, k sizes for a small model with large batches. 
321 const int small_model_gemm_sizes[] = { 322 29232, 16, 25, 7308, 6, 400, 203, 3002, 216, 323 }; 324 assert(sizeof(small_model_gemm_sizes) % 325 (3 * sizeof(small_model_gemm_sizes[0])) == 326 0); 327 const std::size_t num_small_model_gemms = 328 sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0])); 329 330 std::vector<gemm_t> small_model_gemms(num_small_model_gemms); 331 for (std::size_t i = 0; i < num_small_model_gemms; i++) { 332 small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1]; 333 small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2]; 334 small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0]; 335 } 336 337 const double mintime = 10.0; 338 benchmark_gemm_sizes(context, small_model_gemms, mintime); 339} 340 341void benchmark_all() { 342 { 343 gemmlowp::GemmContext context; 344 std::cout << "Benchmarking small model GEMMs..." << std::endl; 345 gemmlowp::benchmark_small_model(&context); 346 } 347 348 { 349 gemmlowp::GemmContext context; 350 std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl; 351 gemmlowp::benchmark_googlenet(&context); 352 } 353 354 { 355 gemmlowp::GemmContext context; 356 context.set_max_num_threads(0); 357 std::cout << "Benchmarking multi-threaded mode..." << std::endl; 358 gemmlowp::benchmark(&context); 359 } 360 361 { 362 gemmlowp::GemmContext context; 363 context.set_max_num_threads(1); 364 std::cout << "Benchmarking single-threaded mode..." << std::endl; 365 gemmlowp::benchmark(&context); 366 } 367} 368 369} // end namespace gemmlowp 370 371// For iOS, we need to define our own main(), so skip it here. 372#if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR)) 373int main() { gemmlowp::benchmark_all(); } 374#endif 375