1#define EIGEN_USE_THREADS 2 3#include <string> 4 5#include "tensor_benchmarks.h" 6 7#define CREATE_THREAD_POOL(threads) \ 8Eigen::ThreadPool pool(threads); \ 9Eigen::ThreadPoolDevice device(&pool, threads); 10 11// Simple functions 12#define BM_FuncCPU(FUNC, THREADS) \ 13 static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ 14 StopBenchmarkTiming(); \ 15 CREATE_THREAD_POOL(THREADS); \ 16 BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ 17 suite.FUNC(iters); \ 18 } \ 19 BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); 20 21BM_FuncCPU(memcpy, 4); 22BM_FuncCPU(memcpy, 8); 23BM_FuncCPU(memcpy, 12); 24 25BM_FuncCPU(typeCasting, 4); 26BM_FuncCPU(typeCasting, 8); 27BM_FuncCPU(typeCasting, 12); 28 29BM_FuncCPU(random, 4); 30BM_FuncCPU(random, 8); 31BM_FuncCPU(random, 12); 32 33BM_FuncCPU(slicing, 4); 34BM_FuncCPU(slicing, 8); 35BM_FuncCPU(slicing, 12); 36 37BM_FuncCPU(rowChip, 4); 38BM_FuncCPU(rowChip, 8); 39BM_FuncCPU(rowChip, 12); 40 41BM_FuncCPU(colChip, 4); 42BM_FuncCPU(colChip, 8); 43BM_FuncCPU(colChip, 12); 44 45BM_FuncCPU(shuffling, 4); 46BM_FuncCPU(shuffling, 8); 47BM_FuncCPU(shuffling, 12); 48 49BM_FuncCPU(padding, 4); 50BM_FuncCPU(padding, 8); 51BM_FuncCPU(padding, 12); 52 53BM_FuncCPU(striding, 4); 54BM_FuncCPU(striding, 8); 55BM_FuncCPU(striding, 12); 56 57BM_FuncCPU(broadcasting, 4); 58BM_FuncCPU(broadcasting, 8); 59BM_FuncCPU(broadcasting, 12); 60 61BM_FuncCPU(coeffWiseOp, 4); 62BM_FuncCPU(coeffWiseOp, 8); 63BM_FuncCPU(coeffWiseOp, 12); 64 65BM_FuncCPU(algebraicFunc, 4); 66BM_FuncCPU(algebraicFunc, 8); 67BM_FuncCPU(algebraicFunc, 12); 68 69BM_FuncCPU(transcendentalFunc, 4); 70BM_FuncCPU(transcendentalFunc, 8); 71BM_FuncCPU(transcendentalFunc, 12); 72 73BM_FuncCPU(rowReduction, 4); 74BM_FuncCPU(rowReduction, 8); 75BM_FuncCPU(rowReduction, 12); 76 77BM_FuncCPU(colReduction, 4); 78BM_FuncCPU(colReduction, 8); 79BM_FuncCPU(colReduction, 12); 80 81 82// Contractions 83#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ 84 static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \ 85 StopBenchmarkTiming(); \ 86 if (THREADS == 1) { \ 87 Eigen::DefaultDevice device; \ 88 BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \ 89 suite.FUNC(iters); \ 90 } else { \ 91 CREATE_THREAD_POOL(THREADS); \ 92 BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \ 93 suite.FUNC(iters); \ 94 } \ 95 } \ 96 BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); 97 98 99BM_FuncWithInputDimsCPU(contraction, N, N, N, 1); 100BM_FuncWithInputDimsCPU(contraction, N, N, N, 4); 101BM_FuncWithInputDimsCPU(contraction, N, N, N, 8); 102BM_FuncWithInputDimsCPU(contraction, N, N, N, 12); 103BM_FuncWithInputDimsCPU(contraction, N, N, N, 16); 104 105BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1); 106BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4); 107BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8); 108BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12); 109BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16); 110 111BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1); 112BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4); 113BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); 114BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); 115BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); 116 117BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1); 118BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4); 119BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8); 120BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12); 121BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16); 122 123BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); 124BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); 125BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); 126BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12); 127BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16); 128 129BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1); 130BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4); 131BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8); 132BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12); 133BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); 134 135 136// Convolutions 137#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \ 138 static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ 139 StopBenchmarkTiming(); \ 140 CREATE_THREAD_POOL(THREADS); \ 141 BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ 142 suite.FUNC(iters, DIM1, DIM2); \ 143 } \ 144 BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); 145 146BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4); 147BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8); 148BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12); 149 150BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4); 151BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8); 152BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12); 153 154BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4); 155BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8); 156BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12); 157 158BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4); 159BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8); 160BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12); 161 162BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4); 163BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8); 164BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12); 165 166BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4); 167BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8); 168BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12); 169