// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>


template<typename Type, int DataLayout>
static void test_full_reductions() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
  in.setRandom();

  // Reference full reduction computed on the CPU.
  Tensor<Type, 0, DataLayout> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(Type);
  std::size_t out_bytes = full_redux.size() * sizeof(Type);
  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<Type, 0, DataLayout> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}

template<typename Type, int DataLayout>
static void test_first_dim_reductions() {
  int dim_x = 33;
  int dim_y = 1;
  int dim_z = 128;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  // Reference reduction over the first dimension, computed on the CPU.
  Eigen::array<int, 1> red_axis;
  red_axis[0] = 0;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create device
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Create data
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);

  // Perform operation
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result: the GPU
  // reduction was accumulated twice, so it should be twice the CPU result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}

template<typename Type, int DataLayout>
static void test_last_dim_reductions() {
  int dim_x = 128;
  int dim_y = 1;
  int dim_z = 33;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  // Reference reduction over the last dimension, computed on the CPU.
  Eigen::array<int, 1> red_axis;
  red_axis[0] = 2;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create device
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Create data
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);

  // Perform operation
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result: the GPU
  // reduction was accumulated twice, so it should be twice the CPU result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}


void test_cxx11_tensor_reduction_cuda() {
  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));

  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()))

  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
}