#define EIGEN_USE_GPU

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>

#include "tensor_benchmarks.h"

// Blocks until the device has finished all previously issued work and checks
// the status returned by cudaDeviceSynchronize(). On failure the benchmark
// name and the CUDA error string are written to stderr and the process is
// aborted: continuing would only time a broken device state.
//
// benchmark_name: label printed in the diagnostic; not otherwise interpreted.
static void SynchronizeGpuOrDie(const char* benchmark_name) {
  const cudaError_t status = cudaDeviceSynchronize();
  if (status != cudaSuccess) {
    std::cerr << benchmark_name << ": cudaDeviceSynchronize failed: "
              << cudaGetErrorString(status) << std::endl;
    std::abort();
  }
}

// Simple functions
//
// BM_FuncGPU(FUNC) defines `static void BM_<FUNC>(int iters, int N)`, which
// builds a float BenchmarkSuite on an Eigen::GpuDevice using the single size
// argument N, synchronizes the device, and then calls suite.FUNC(iters).
// The function is handed to BENCHMARK_RANGE with the bounds 10 and 5000.
// NOTE(review): StopBenchmarkTiming() and the synchronization presumably keep
// suite construction out of the measured interval -- confirm against the
// benchmark harness, which is not visible in this file.
#define BM_FuncGPU(FUNC)                                                       \
  static void BM_##FUNC(int iters, int N) {                                    \
    StopBenchmarkTiming();                                                     \
    Eigen::CudaStreamDevice stream;                                            \
    Eigen::GpuDevice device(&stream);                                          \
    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
    SynchronizeGpuOrDie("BM_" #FUNC);                                          \
    suite.FUNC(iters);                                                         \
  }                                                                            \
  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);

BM_FuncGPU(memcpy);
BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(rowChip);
BM_FuncGPU(colChip);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(algebraicFunc);
BM_FuncGPU(transcendentalFunc);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
BM_FuncGPU(fullReduction);


// Contractions
//
// BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) defines
// `static void BM_<FUNC>_<D1>x<D2>x<D3>(int iters, int N)`. D1, D2 and D3 are
// pasted both into the function name and into the BenchmarkSuite constructor
// call, so passing the token `N` selects the function's own N parameter while
// a literal such as 64 fixes that constructor argument.
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
    StopBenchmarkTiming();                                                     \
    Eigen::CudaStreamDevice stream;                                            \
    Eigen::GpuDevice device(&stream);                                          \
    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3);         \
    SynchronizeGpuOrDie("BM_" #FUNC "_" #D1 "x" #D2 "x" #D3);                  \
    suite.FUNC(iters);                                                         \
  }                                                                            \
  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);


BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
BM_FuncWithInputDimsGPU(contraction, N, N, 64);


// Convolutions
//
// BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) defines
// `static void BM_<FUNC>_<DIM1>x<DIM2>(int iters, int N)`. The suite is built
// from N alone; DIM1 and DIM2 are forwarded to suite.FUNC(iters, DIM1, DIM2).
// Unlike the macros above, the BENCHMARK_RANGE lower bound here is 128.
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
    StopBenchmarkTiming();                                                     \
    Eigen::CudaStreamDevice stream;                                            \
    Eigen::GpuDevice device(&stream);                                          \
    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
    SynchronizeGpuOrDie("BM_" #FUNC "_" #DIM1 "x" #DIM2);                      \
    suite.FUNC(iters, DIM1, DIM2);                                             \
  }                                                                            \
  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);

BM_FuncWithKernelDimsGPU(convolution, 7, 1);
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);