// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

void test_cuda_nullary() {
  Tensor<float, 1, 0, int> in1(2);
  Tensor<float, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t tensor_bytes = in1.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  cudaMalloc((void**)(&d_in1), tensor_bytes);
  cudaMalloc((void**)(&d_in2), tensor_bytes);
  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);

  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
  gpu_in2.device(gpu_device) = gpu_in2.random();

  Tensor<float, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);

  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), 3.14f);
    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
}

void test_cuda_elementwise_small() {
  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
  in1.setRandom();
  in2.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
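  // The copy above is asynchronous: the host-side buffer is only safe to
  // read once the stream has been synchronized.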
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(
        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
}

void test_cuda_elementwise()
{
  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  in1.setRandom();
  in2.setRandom();
  in3.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t in3_bytes = in3.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_in3;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_in3), in3_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 53; ++j) {
      for (int k = 0; k < 97; ++k) {
        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)),
                         in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) +
                         in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) *
                         in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
      }
    }
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_in3);
  cudaFree(d_out);
}

void test_cuda_props() {
  Tensor<float, 1> in1(200);
  Tensor<bool, 1> out(200);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(bool);

  float* d_in1;
  bool* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, 200);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
      d_out, 200);

  gpu_out.device(gpu_device) = (gpu_in1.isnan)();

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 200; ++i) {
    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}

void test_cuda_reduction()
{
  Tensor<float, 4> in1(72,53,97,113);
  Tensor<float, 2> out(72,97);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  array<Eigen::DenseIndex, 2> reduction_axis;
  reduction_axis[0] = 1;
  reduction_axis[1] = 3;

  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      float expected = 0;
      for (int k = 0; k < 53; ++k) {
        for (int l = 0; l < 113; ++l) {
          expected = std::max<float>(expected, in1(i, k, j, l));
        }
      }
      VERIFY_IS_APPROX(out(i,j), expected);
    }
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}

template<int DataLayout>
void test_cuda_contraction()
{
  // With these dimensions, the output has 300 * 140 elements, more than the
  // 30 * 1024 threads in the blocks of a 15-SM GK110 GPU.
  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;

  cudaMalloc((void**)(&d_t_left), t_left_bytes);
  cudaMalloc((void**)(&d_t_right), t_right_bytes);
  cudaMalloc((void**)(&d_t_result), t_result_bytes);

  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);

  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
  MapXf m_left(t_left.data(), 300, 93);
  MapXf m_right(t_right.data(), 93, 140);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims;
  dims[0] = DimPair(2, 0);
  dims[1] = DimPair(3, 1);

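  // Compute the expected result on the host: contracting tensor dimensions
  // (2,0) and (3,1) collapses t_left to the 300x93 matrix m_left and t_right
  // to the 93x140 matrix m_right mapped above, so the contraction reduces to
  // an ordinary matrix product.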
  m_result = m_left * m_right;
  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);

  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  cudaFree(d_t_left);
  cudaFree(d_t_right);
  cudaFree(d_t_result);
}

template<int DataLayout>
void test_cuda_convolution_1d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 1, DataLayout> kernel(4);
  Tensor<float, 4, DataLayout> out(74,34,11,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);

  Eigen::array<Eigen::DenseIndex, 1> dims(1);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 34; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

void test_cuda_convolution_inner_dim_col_major_1d()
{
  Tensor<float, 4, ColMajor> input(74,9,11,7);
  Tensor<float, 1, ColMajor> kernel(4);
  Tensor<float, 4, ColMajor> out(71,9,11,7);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

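  // This test convolves along dimension 0, the innermost (contiguous)
  // dimension of a column-major tensor.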
  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);

  Eigen::array<Eigen::DenseIndex, 1> dims(0);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 71; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 7; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

void test_cuda_convolution_inner_dim_row_major_1d()
{
  Tensor<float, 4, RowMajor> input(7,9,11,74);
  Tensor<float, 1, RowMajor> kernel(4);
  Tensor<float, 4, RowMajor> out(7,9,11,71);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);

  Eigen::array<Eigen::DenseIndex, 1> dims(3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 71; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

template<int DataLayout>
void test_cuda_convolution_2d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 2, DataLayout> kernel(3,4);
  Tensor<float, 4, DataLayout> out(74,35,8,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);

  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
                                 input(i,j+1,k+0,l) * kernel(1,0) +
                                 input(i,j+2,k+0,l) * kernel(2,0) +
                                 input(i,j+0,k+1,l) * kernel(0,1) +
                                 input(i,j+1,k+1,l) * kernel(1,1) +
                                 input(i,j+2,k+1,l) * kernel(2,1) +
                                 input(i,j+0,k+2,l) * kernel(0,2) +
                                 input(i,j+1,k+2,l) * kernel(1,2) +
                                 input(i,j+2,k+2,l) * kernel(2,2) +
                                 input(i,j+0,k+3,l) * kernel(0,3) +
                                 input(i,j+1,k+3,l) * kernel(1,3) +
                                 input(i,j+2,k+3,l) * kernel(2,3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

template<int DataLayout>
void test_cuda_convolution_3d()
{
  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
  Tensor<float, 3, DataLayout> kernel(3,4,2);
  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);

  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 136; ++l) {
          for (int m = 0; m < 17; ++m) {
            const float result = out(i,j,k,l,m);
            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
            VERIFY_IS_APPROX(result, expected);
          }
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_lgamma(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.lgamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_digamma()
{
  Tensor<Scalar, 1> in(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in(0) = Scalar(1);
  in(1) = Scalar(1.5);
  in(2) = Scalar(4);
  in(3) = Scalar(-10.5);
  in(4) = Scalar(10000.5);
  in(5) = Scalar(0);
  in(6) = Scalar(-1);

  expected_out(0) = Scalar(-0.5772156649015329);
  expected_out(1) = Scalar(0.03648997397857645);
  expected_out(2) = Scalar(1.2561176684318);
  expected_out(3) = Scalar(2.398239129535781);
  expected_out(4) = Scalar(9.210340372392849);
  // digamma has poles at zero and at the negative integers; Eigen is
  // expected to return +infinity there, which is verified exactly below.
  expected_out(5) = std::numeric_limits<Scalar>::infinity();
  expected_out(6) = std::numeric_limits<Scalar>::infinity();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in.digamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 5; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }
  for (int i = 5; i < 7; ++i) {
    VERIFY_IS_EQUAL(out(i), expected_out(i));
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_zeta()
{
  Tensor<Scalar, 1> in_x(6);
  Tensor<Scalar, 1> in_q(6);
  Tensor<Scalar, 1> out(6);
  Tensor<Scalar, 1> expected_out(6);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(1.5);
  in_x(2) = Scalar(4);
  in_x(3) = Scalar(-10.5);
  in_x(4) = Scalar(10000.5);
  in_x(5) = Scalar(3);

  in_q(0) = Scalar(1.2345);
  in_q(1) = Scalar(2);
  in_q(2) = Scalar(1.5);
  in_q(3) = Scalar(3);
  in_q(4) = Scalar(1.0001);
  in_q(5) = Scalar(-2.5);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = Scalar(1.61237534869);
  expected_out(2) = Scalar(0.234848505667);
  expected_out(3) = Scalar(1.03086757337e-5);
  expected_out(4) = Scalar(0.367879440865);
  expected_out(5) = Scalar(0.054102025820864097);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_q;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_q), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);

  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY((std::isnan)(out(3)));

  for (int i = 1; i < 6; ++i) {
    if (i != 3) {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_q);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_polygamma()
{
  Tensor<Scalar, 1> in_x(7);
  Tensor<Scalar, 1> in_n(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in_n(0) = Scalar(1);
  in_n(1) = Scalar(1);
  in_n(2) = Scalar(1);
  in_n(3) = Scalar(17);
  in_n(4) = Scalar(31);
  in_n(5) = Scalar(28);
  in_n(6) = Scalar(8);

  in_x(0) = Scalar(2);
  in_x(1) = Scalar(3);
  in_x(2) = Scalar(25.5);
  in_x(3) = Scalar(4.7);
  in_x(4) = Scalar(11.8);
  in_x(5) = Scalar(17.7);
  in_x(6) = Scalar(30.2);

  expected_out(0) = Scalar(0.644934066848);
  expected_out(1) = Scalar(0.394934066848);
  expected_out(2) = Scalar(0.0399946696496);
  expected_out(3) = Scalar(293.334565435);
  expected_out(4) = Scalar(0.445487887616);
  expected_out(5) = Scalar(-2.47810300902e-07);
  expected_out(6) = Scalar(-8.29668781082e-09);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_n;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_n), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  cudaFree(d_in_x);
  cudaFree(d_in_n);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_igamma()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
                          {0.0, 0.6321205588285578, 0.7768698398515702,
                           0.9816843611112658, 9.999500016666262e-05, 1.0},
                          {0.0, 0.4275932955291202, 0.608374823728911,
                           0.9539882943107686, 7.522076445089201e-07, 1.0},
                          {0.0, 0.01898815687615381, 0.06564245437845008,
                           0.5665298796332909, 4.166333347221828e-18, 1.0},
                          {0.0, 0.9999780593618628, 0.9999899967080838,
                           0.9999996219837988, 0.9991370418689945, 1.0},
                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igamma_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_igammac()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
                           {1.0, 0.36787944117144233, 0.22313016014842982,
                            0.018315638888734182, 0.9999000049998333, 0.0},
                           {1.0, 0.5724067044708798, 0.3916251762710878,
                            0.04601170568923136, 0.9999992477923555, 0.0},
                           {1.0, 0.9810118431238462, 0.9343575456215499,
                            0.4334701203667089, 1.0, 0.0},
                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
                            3.7801620118431334e-07, 0.0008629581310054535,
                            0.0},
                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  cudaMalloc((void**)(&d_a), bytes);
  cudaMalloc((void**)(&d_x), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igammac_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_erf(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erf();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_erfc(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erfc();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_betainc()
{
  Tensor<Scalar, 1> in_x(125);
  Tensor<Scalar, 1> in_a(125);
  Tensor<Scalar, 1> in_b(125);
  Tensor<Scalar, 1> out(125);
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();

  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
  Array<Scalar, 1, Dynamic> v(125);

  // 125 = 5 * 5 * 5 test points: a varies slowest (25 identical entries per
  // value), b cycles in blocks of 5, and x cycles fastest.
  a << 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.999, 0.999, 0.999, 0.999, 0.999,
       0.999, 0.999, 0.999, 0.999, 0.999,
       0.999, 0.999, 0.999, 0.999, 0.999,
       0.999, 0.999, 0.999, 0.999, 0.999,
       0.999, 0.999, 0.999, 0.999, 0.999,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       999.999, 999.999, 999.999, 999.999, 999.999,
       999.999, 999.999, 999.999, 999.999, 999.999,
       999.999, 999.999, 999.999, 999.999, 999.999,
       999.999, 999.999, 999.999, 999.999, 999.999,
       999.999, 999.999, 999.999, 999.999, 999.999;

  b << 0.0, 0.0, 0.0, 0.0, 0.0,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.999, 0.999, 0.999, 0.999, 0.999,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       999.999, 999.999, 999.999, 999.999, 999.999,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.999, 0.999, 0.999, 0.999, 0.999,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       999.999, 999.999, 999.999, 999.999, 999.999,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.999, 0.999, 0.999, 0.999, 0.999,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       999.999, 999.999, 999.999, 999.999, 999.999,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.999, 0.999, 0.999, 0.999, 0.999,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       999.999, 999.999, 999.999, 999.999, 999.999,
       0.0, 0.0, 0.0, 0.0, 0.0,
       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.999, 0.999, 0.999, 0.999, 0.999,
       31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379, 31.62177660168379,
       999.999, 999.999, 999.999, 999.999, 999.999;

  x << -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1,
       -0.1, 0.2, 0.5, 0.8, 1.1;

  v << nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan,
       nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan,
       nan, 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan,
       nan, 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan,
       nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
       nan, nan, nan, nan, nan,
       nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan,
       nan, 0.20014344256217678, 0.5000000000000001, 0.7998565574378232, nan,
       nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
       nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
       nan, nan, nan, nan, nan,
       nan, 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
       nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, nan,
       nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, nan,
       nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
       nan, nan, nan, nan, nan,
       nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan,
       nan, 0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan,
       nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan,
       nan, 2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;

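  // v holds the reference values; NaN marks the points where betainc is
  // undefined (x outside [0, 1], or a or b equal to 0).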
  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
    expected_out(i) = v(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_a;
  Scalar* d_in_b;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_a), bytes);
  cudaMalloc((void**)(&d_in_b), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);

  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 1; i < 125; ++i) {
    if ((std::isnan)(expected_out(i))) {
      VERIFY((std::isnan)(out(i)));
    } else {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_a);
  cudaFree(d_in_b);
  cudaFree(d_out);
}

void test_cxx11_tensor_cuda()
{
  CALL_SUBTEST_1(test_cuda_nullary());
  CALL_SUBTEST_1(test_cuda_elementwise_small());
  CALL_SUBTEST_1(test_cuda_elementwise());
  CALL_SUBTEST_1(test_cuda_props());
  CALL_SUBTEST_1(test_cuda_reduction());
  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());

#if __cplusplus > 199711L
  // std::erf, std::erfc, and so on were only added in C++11. We use them
  // as a golden reference to validate the results produced by Eigen, so
  // these tests can only run with a C++11 compiler.
  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f));  // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0));  // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));

  CALL_SUBTEST_5(test_cuda_digamma<float>());
  CALL_SUBTEST_5(test_cuda_digamma<double>());

  CALL_SUBTEST_5(test_cuda_polygamma<float>());
  CALL_SUBTEST_5(test_cuda_polygamma<double>());

  CALL_SUBTEST_5(test_cuda_zeta<float>());
  CALL_SUBTEST_5(test_cuda_zeta<double>());

  CALL_SUBTEST_5(test_cuda_igamma<float>());
  CALL_SUBTEST_5(test_cuda_igammac<float>());

  CALL_SUBTEST_5(test_cuda_igamma<double>());
  CALL_SUBTEST_5(test_cuda_igammac<double>());

  CALL_SUBTEST_6(test_cuda_betainc<float>());
  CALL_SUBTEST_6(test_cuda_betainc<double>());
#endif
}
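
// Note: every test above repeats the same cudaMalloc / cudaMemcpy upload
// boilerplate. A minimal sketch of a helper that could factor this out,
// assuming only the CUDA runtime calls already used in this file; the name
// `gpu_upload` is hypothetical and is not used by the tests above.
template <typename T, int NumDims>
T* gpu_upload(const Tensor<T, NumDims>& host) {
  const std::size_t bytes = host.size() * sizeof(T);
  T* device_ptr;
  // Allocate device memory and copy the host tensor's contents into it.
  assert(cudaMalloc((void**)(&device_ptr), bytes) == cudaSuccess);
  assert(cudaMemcpy(device_ptr, host.data(), bytes,
                    cudaMemcpyHostToDevice) == cudaSuccess);
  return device_ptr;  // caller remains responsible for cudaFree
}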