1 #define EIGEN_USE_THREADS
2 
3 #include <string>
4 
5 #include "tensor_benchmarks.h"
6 
7 #define CREATE_THREAD_POOL(threads)             \
8 Eigen::ThreadPool pool(threads);                \
9 Eigen::ThreadPoolDevice device(&pool, threads);
10 
11 // Simple functions
12 #define BM_FuncCPU(FUNC, THREADS)                                    \
13   static void BM_##FUNC##_##THREADS##T(int iters, int N) {           \
14     StopBenchmarkTiming();                                           \
15     CREATE_THREAD_POOL(THREADS);                                     \
16     BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
17     suite.FUNC(iters);                                               \
18   }                                                                  \
19   BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
20 
21 BM_FuncCPU(memcpy, 4);
22 BM_FuncCPU(memcpy, 8);
23 BM_FuncCPU(memcpy, 12);
24 
25 BM_FuncCPU(typeCasting, 4);
26 BM_FuncCPU(typeCasting, 8);
27 BM_FuncCPU(typeCasting, 12);
28 
29 BM_FuncCPU(random, 4);
30 BM_FuncCPU(random, 8);
31 BM_FuncCPU(random, 12);
32 
33 BM_FuncCPU(slicing, 4);
34 BM_FuncCPU(slicing, 8);
35 BM_FuncCPU(slicing, 12);
36 
37 BM_FuncCPU(rowChip, 4);
38 BM_FuncCPU(rowChip, 8);
39 BM_FuncCPU(rowChip, 12);
40 
41 BM_FuncCPU(colChip, 4);
42 BM_FuncCPU(colChip, 8);
43 BM_FuncCPU(colChip, 12);
44 
45 BM_FuncCPU(shuffling, 4);
46 BM_FuncCPU(shuffling, 8);
47 BM_FuncCPU(shuffling, 12);
48 
49 BM_FuncCPU(padding, 4);
50 BM_FuncCPU(padding, 8);
51 BM_FuncCPU(padding, 12);
52 
53 BM_FuncCPU(striding, 4);
54 BM_FuncCPU(striding, 8);
55 BM_FuncCPU(striding, 12);
56 
57 BM_FuncCPU(broadcasting, 4);
58 BM_FuncCPU(broadcasting, 8);
59 BM_FuncCPU(broadcasting, 12);
60 
61 BM_FuncCPU(coeffWiseOp, 4);
62 BM_FuncCPU(coeffWiseOp, 8);
63 BM_FuncCPU(coeffWiseOp, 12);
64 
65 BM_FuncCPU(algebraicFunc, 4);
66 BM_FuncCPU(algebraicFunc, 8);
67 BM_FuncCPU(algebraicFunc, 12);
68 
69 BM_FuncCPU(transcendentalFunc, 4);
70 BM_FuncCPU(transcendentalFunc, 8);
71 BM_FuncCPU(transcendentalFunc, 12);
72 
73 BM_FuncCPU(rowReduction, 4);
74 BM_FuncCPU(rowReduction, 8);
75 BM_FuncCPU(rowReduction, 12);
76 
77 BM_FuncCPU(colReduction, 4);
78 BM_FuncCPU(colReduction, 8);
79 BM_FuncCPU(colReduction, 12);
80 
81 
82 // Contractions
83 #define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS)                      \
84   static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
85     StopBenchmarkTiming();                                                      \
86     if (THREADS == 1) {                                                         \
87       Eigen::DefaultDevice device;                                              \
88       BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3);    \
89       suite.FUNC(iters);                                                        \
90     } else {                                                                    \
91       CREATE_THREAD_POOL(THREADS);                                              \
92       BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
93       suite.FUNC(iters);                                                        \
94     }                                                                           \
95   }                                                                             \
96   BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
97 
98 
99 BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
100 BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
101 BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
102 BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
103 BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
104 
105 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
106 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
107 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
108 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
109 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
110 
111 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
112 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
113 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
114 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
115 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
116 
117 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
118 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
119 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
120 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
121 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
122 
123 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
124 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
125 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
126 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
127 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
128 
129 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
130 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
131 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
132 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
133 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
134 
135 
136 // Convolutions
137 #define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS)                    \
138   static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) {   \
139     StopBenchmarkTiming();                                                     \
140     CREATE_THREAD_POOL(THREADS);                                               \
141     BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N);	       \
142     suite.FUNC(iters, DIM1, DIM2);                                             \
143   }                                                                            \
144   BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
145 
146 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
147 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
148 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
149 
150 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
151 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
152 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
153 
154 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
155 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
156 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
157 
158 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
159 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
160 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
161 
162 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
163 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
164 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
165 
166 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
167 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
168 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
169