1#define EIGEN_USE_THREADS
2
3#include <string>
4
5#include "tensor_benchmarks.h"
6
// Declares two locals in the scope where the macro is expanded:
//   pool   - an Eigen::ThreadPool constructed from `num_threads`
//   device - an Eigen::ThreadPoolDevice constructed from &pool and `num_threads`
// The argument is substituted twice, so pass a side-effect-free expression.
// The expansion already ends with ';', so a trailing ';' at the call site
// only adds an empty statement.
#define CREATE_THREAD_POOL(num_threads)  \
  Eigen::ThreadPool pool(num_threads);   \
  Eigen::ThreadPoolDevice device(&pool, num_threads);
10
11// Simple functions
// Defines and registers one thread-pool benchmark.
//
// Expands to a static function named BM_<METHOD>_<NUM_THREADS>T taking
// (int iters, int N). The function stops the benchmark timer, creates a thread
// pool device with NUM_THREADS threads, builds a float BenchmarkSuite on that
// device from N, and calls suite.METHOD(iters). The function is then handed to
// BENCHMARK_RANGE together with the bounds 10 and 5000.
#define BM_FuncCPU(METHOD, NUM_THREADS)                                 \
  static void BM_##METHOD##_##NUM_THREADS##T(int iters, int N) {        \
    typedef BenchmarkSuite<Eigen::ThreadPoolDevice, float> PoolSuite;   \
    StopBenchmarkTiming();                                              \
    CREATE_THREAD_POOL(NUM_THREADS);                                    \
    PoolSuite suite(device, N);                                         \
    suite.METHOD(iters);                                                \
  }                                                                     \
  BENCHMARK_RANGE(BM_##METHOD##_##NUM_THREADS##T, 10, 5000);
20
21BM_FuncCPU(memcpy, 4);
22BM_FuncCPU(memcpy, 8);
23BM_FuncCPU(memcpy, 12);
24
25BM_FuncCPU(typeCasting, 4);
26BM_FuncCPU(typeCasting, 8);
27BM_FuncCPU(typeCasting, 12);
28
29BM_FuncCPU(random, 4);
30BM_FuncCPU(random, 8);
31BM_FuncCPU(random, 12);
32
33BM_FuncCPU(slicing, 4);
34BM_FuncCPU(slicing, 8);
35BM_FuncCPU(slicing, 12);
36
37BM_FuncCPU(rowChip, 4);
38BM_FuncCPU(rowChip, 8);
39BM_FuncCPU(rowChip, 12);
40
41BM_FuncCPU(colChip, 4);
42BM_FuncCPU(colChip, 8);
43BM_FuncCPU(colChip, 12);
44
45BM_FuncCPU(shuffling, 4);
46BM_FuncCPU(shuffling, 8);
47BM_FuncCPU(shuffling, 12);
48
49BM_FuncCPU(padding, 4);
50BM_FuncCPU(padding, 8);
51BM_FuncCPU(padding, 12);
52
53BM_FuncCPU(striding, 4);
54BM_FuncCPU(striding, 8);
55BM_FuncCPU(striding, 12);
56
57BM_FuncCPU(broadcasting, 4);
58BM_FuncCPU(broadcasting, 8);
59BM_FuncCPU(broadcasting, 12);
60
61BM_FuncCPU(coeffWiseOp, 4);
62BM_FuncCPU(coeffWiseOp, 8);
63BM_FuncCPU(coeffWiseOp, 12);
64
65BM_FuncCPU(algebraicFunc, 4);
66BM_FuncCPU(algebraicFunc, 8);
67BM_FuncCPU(algebraicFunc, 12);
68
69BM_FuncCPU(transcendentalFunc, 4);
70BM_FuncCPU(transcendentalFunc, 8);
71BM_FuncCPU(transcendentalFunc, 12);
72
73BM_FuncCPU(rowReduction, 4);
74BM_FuncCPU(rowReduction, 8);
75BM_FuncCPU(rowReduction, 12);
76
77BM_FuncCPU(colReduction, 4);
78BM_FuncCPU(colReduction, 8);
79BM_FuncCPU(colReduction, 12);
80
81
82// Contractions
// Defines and registers one benchmark whose suite is built from three sizes.
//
// Expands to a static function named
// BM_<FUNC>_<SIZE1>x<SIZE2>x<SIZE3>_<NUM_THREADS>T taking (int iters, int N).
// Each SIZE argument is pasted into the function name and is also forwarded
// as an expression to the BenchmarkSuite constructor, so passing the token N
// makes that size follow the function's N parameter.
//
// After stopping the benchmark timer, the function picks the device:
//   NUM_THREADS == 1 -> Eigen::DefaultDevice (no thread pool is created)
//   otherwise        -> thread pool device with NUM_THREADS threads
// and then calls suite.FUNC(iters). The function is handed to BENCHMARK_RANGE
// together with the bounds 10 and 5000.
#define BM_FuncWithInputDimsCPU(FUNC, SIZE1, SIZE2, SIZE3, NUM_THREADS)      \
  static void BM_##FUNC##_##SIZE1##x##SIZE2##x##SIZE3##_##NUM_THREADS##T(    \
      int iters, int N) {                                                    \
    typedef BenchmarkSuite<Eigen::DefaultDevice, float> SingleThreadSuite;   \
    typedef BenchmarkSuite<Eigen::ThreadPoolDevice, float> ThreadPoolSuite;  \
    StopBenchmarkTiming();                                                   \
    if (NUM_THREADS != 1) {                                                  \
      CREATE_THREAD_POOL(NUM_THREADS);                                       \
      ThreadPoolSuite suite(device, SIZE1, SIZE2, SIZE3);                    \
      suite.FUNC(iters);                                                     \
    } else {                                                                 \
      Eigen::DefaultDevice device;                                           \
      SingleThreadSuite suite(device, SIZE1, SIZE2, SIZE3);                  \
      suite.FUNC(iters);                                                     \
    }                                                                        \
  }                                                                          \
  BENCHMARK_RANGE(                                                           \
      BM_##FUNC##_##SIZE1##x##SIZE2##x##SIZE3##_##NUM_THREADS##T, 10, 5000);
97
98
99BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
100BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
101BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
102BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
103BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
104
105BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
106BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
107BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
108BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
109BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
110
111BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
112BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
113BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
114BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
115BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
116
117BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
118BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
119BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
120BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
121BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
122
123BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
124BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
125BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
126BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
127BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
128
129BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
130BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
131BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
132BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
133BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
134
135
136// Convolutions
// Defines and registers one thread-pool benchmark that takes two extra
// kernel-dimension arguments.
//
// Expands to a static function named BM_<FUNC>_<KDIM1>x<KDIM2>_<NUM_THREADS>T
// taking (int iters, int N). The function stops the benchmark timer, creates a
// thread pool device with NUM_THREADS threads, builds a float BenchmarkSuite
// on that device from N, and calls suite.FUNC(iters, KDIM1, KDIM2). The
// function is handed to BENCHMARK_RANGE together with the bounds 128 and 5000.
#define BM_FuncWithKernelDimsCPU(FUNC, KDIM1, KDIM2, NUM_THREADS)              \
  static void BM_##FUNC##_##KDIM1##x##KDIM2##_##NUM_THREADS##T(int iters,      \
                                                               int N) {        \
    typedef BenchmarkSuite<Eigen::ThreadPoolDevice, float> PoolSuite;          \
    StopBenchmarkTiming();                                                     \
    CREATE_THREAD_POOL(NUM_THREADS);                                           \
    PoolSuite suite(device, N);                                                \
    suite.FUNC(iters, KDIM1, KDIM2);                                           \
  }                                                                            \
  BENCHMARK_RANGE(BM_##FUNC##_##KDIM1##x##KDIM2##_##NUM_THREADS##T, 128, 5000);
145
146BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
147BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
148BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
149
150BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
151BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
152BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
153
154BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
155BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
156BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
157
158BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
159BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
160BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
161
162BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
163BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
164BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
165
166BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
167BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
168BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
169