1 #define EIGEN_USE_THREADS 2 3 #include <string> 4 5 #include "tensor_benchmarks.h" 6 7 #define CREATE_THREAD_POOL(threads) \ 8 Eigen::ThreadPool pool(threads); \ 9 Eigen::ThreadPoolDevice device(&pool, threads); 10 11 // Simple functions 12 #define BM_FuncCPU(FUNC, THREADS) \ 13 static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ 14 StopBenchmarkTiming(); \ 15 CREATE_THREAD_POOL(THREADS); \ 16 BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ 17 suite.FUNC(iters); \ 18 } \ 19 BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); 20 21 BM_FuncCPU(memcpy, 4); 22 BM_FuncCPU(memcpy, 8); 23 BM_FuncCPU(memcpy, 12); 24 25 BM_FuncCPU(typeCasting, 4); 26 BM_FuncCPU(typeCasting, 8); 27 BM_FuncCPU(typeCasting, 12); 28 29 BM_FuncCPU(random, 4); 30 BM_FuncCPU(random, 8); 31 BM_FuncCPU(random, 12); 32 33 BM_FuncCPU(slicing, 4); 34 BM_FuncCPU(slicing, 8); 35 BM_FuncCPU(slicing, 12); 36 37 BM_FuncCPU(rowChip, 4); 38 BM_FuncCPU(rowChip, 8); 39 BM_FuncCPU(rowChip, 12); 40 41 BM_FuncCPU(colChip, 4); 42 BM_FuncCPU(colChip, 8); 43 BM_FuncCPU(colChip, 12); 44 45 BM_FuncCPU(shuffling, 4); 46 BM_FuncCPU(shuffling, 8); 47 BM_FuncCPU(shuffling, 12); 48 49 BM_FuncCPU(padding, 4); 50 BM_FuncCPU(padding, 8); 51 BM_FuncCPU(padding, 12); 52 53 BM_FuncCPU(striding, 4); 54 BM_FuncCPU(striding, 8); 55 BM_FuncCPU(striding, 12); 56 57 BM_FuncCPU(broadcasting, 4); 58 BM_FuncCPU(broadcasting, 8); 59 BM_FuncCPU(broadcasting, 12); 60 61 BM_FuncCPU(coeffWiseOp, 4); 62 BM_FuncCPU(coeffWiseOp, 8); 63 BM_FuncCPU(coeffWiseOp, 12); 64 65 BM_FuncCPU(algebraicFunc, 4); 66 BM_FuncCPU(algebraicFunc, 8); 67 BM_FuncCPU(algebraicFunc, 12); 68 69 BM_FuncCPU(transcendentalFunc, 4); 70 BM_FuncCPU(transcendentalFunc, 8); 71 BM_FuncCPU(transcendentalFunc, 12); 72 73 BM_FuncCPU(rowReduction, 4); 74 BM_FuncCPU(rowReduction, 8); 75 BM_FuncCPU(rowReduction, 12); 76 77 BM_FuncCPU(colReduction, 4); 78 BM_FuncCPU(colReduction, 8); 79 BM_FuncCPU(colReduction, 12); 80 81 82 // Contractions 83 #define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ 84 static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \ 85 StopBenchmarkTiming(); \ 86 if (THREADS == 1) { \ 87 Eigen::DefaultDevice device; \ 88 BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \ 89 suite.FUNC(iters); \ 90 } else { \ 91 CREATE_THREAD_POOL(THREADS); \ 92 BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \ 93 suite.FUNC(iters); \ 94 } \ 95 } \ 96 BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); 97 98 99 BM_FuncWithInputDimsCPU(contraction, N, N, N, 1); 100 BM_FuncWithInputDimsCPU(contraction, N, N, N, 4); 101 BM_FuncWithInputDimsCPU(contraction, N, N, N, 8); 102 BM_FuncWithInputDimsCPU(contraction, N, N, N, 12); 103 BM_FuncWithInputDimsCPU(contraction, N, N, N, 16); 104 105 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1); 106 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4); 107 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8); 108 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12); 109 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16); 110 111 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1); 112 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4); 113 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); 114 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); 115 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); 116 117 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1); 118 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4); 119 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8); 120 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12); 121 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16); 122 123 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); 124 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); 125 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); 126 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12); 127 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16); 128 129 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1); 130 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4); 131 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8); 132 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12); 133 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); 134 135 136 // Convolutions 137 #define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \ 138 static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ 139 StopBenchmarkTiming(); \ 140 CREATE_THREAD_POOL(THREADS); \ 141 BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ 142 suite.FUNC(iters, DIM1, DIM2); \ 143 } \ 144 BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); 145 146 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4); 147 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8); 148 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12); 149 150 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4); 151 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8); 152 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12); 153 154 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4); 155 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8); 156 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12); 157 158 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4); 159 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8); 160 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12); 161 162 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4); 163 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8); 164 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12); 165 166 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4); 167 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8); 168 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12); 169