1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #define EIGEN_TEST_NO_LONGDOUBLE
11 #define EIGEN_TEST_NO_COMPLEX
12 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
13 #define EIGEN_USE_GPU
14 
15 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
16 #include <cuda_fp16.h>
17 #endif
18 #include "main.h"
19 #include <unsupported/Eigen/CXX11/Tensor>
20 
21 using Eigen::Tensor;
22 
23 void test_cuda_nullary() {
24   Tensor<float, 1, 0, int> in1(2);
25   Tensor<float, 1, 0, int> in2(2);
26   in1.setRandom();
27   in2.setRandom();
28 
29   std::size_t tensor_bytes = in1.size() * sizeof(float);
30 
31   float* d_in1;
32   float* d_in2;
33   cudaMalloc((void**)(&d_in1), tensor_bytes);
34   cudaMalloc((void**)(&d_in2), tensor_bytes);
35   cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
36   cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
37 
38   Eigen::CudaStreamDevice stream;
39   Eigen::GpuDevice gpu_device(&stream);
40 
41   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
42       d_in1, 2);
43   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
44       d_in2, 2);
45 
46   gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
47   gpu_in2.device(gpu_device) = gpu_in2.random();
48 
49   Tensor<float, 1, 0, int> new1(2);
50   Tensor<float, 1, 0, int> new2(2);
51 
52   assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
53                          gpu_device.stream()) == cudaSuccess);
54   assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
55                          gpu_device.stream()) == cudaSuccess);
56 
57   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
58 
59   for (int i = 0; i < 2; ++i) {
60     VERIFY_IS_APPROX(new1(i), 3.14f);
61     VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
62   }
63 
64   cudaFree(d_in1);
65   cudaFree(d_in2);
66 }
67 
68 void test_cuda_elementwise_small() {
69   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
70   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
71   Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
72   in1.setRandom();
73   in2.setRandom();
74 
75   std::size_t in1_bytes = in1.size() * sizeof(float);
76   std::size_t in2_bytes = in2.size() * sizeof(float);
77   std::size_t out_bytes = out.size() * sizeof(float);
78 
79   float* d_in1;
80   float* d_in2;
81   float* d_out;
82   cudaMalloc((void**)(&d_in1), in1_bytes);
83   cudaMalloc((void**)(&d_in2), in2_bytes);
84   cudaMalloc((void**)(&d_out), out_bytes);
85 
86   cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
87   cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
88 
89   Eigen::CudaStreamDevice stream;
90   Eigen::GpuDevice gpu_device(&stream);
91 
92   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
93       d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
94   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
95       d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
96   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
97       d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
98 
99   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
100 
101   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
102                          gpu_device.stream()) == cudaSuccess);
103   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
104 
105   for (int i = 0; i < 2; ++i) {
106     VERIFY_IS_APPROX(
107         out(Eigen::array<Eigen::DenseIndex, 1>(i)),
108         in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
109   }
110 
111   cudaFree(d_in1);
112   cudaFree(d_in2);
113   cudaFree(d_out);
114 }
115 
116 void test_cuda_elementwise()
117 {
118   Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
119   Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
120   Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
121   Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
122   in1.setRandom();
123   in2.setRandom();
124   in3.setRandom();
125 
126   std::size_t in1_bytes = in1.size() * sizeof(float);
127   std::size_t in2_bytes = in2.size() * sizeof(float);
128   std::size_t in3_bytes = in3.size() * sizeof(float);
129   std::size_t out_bytes = out.size() * sizeof(float);
130 
131   float* d_in1;
132   float* d_in2;
133   float* d_in3;
134   float* d_out;
135   cudaMalloc((void**)(&d_in1), in1_bytes);
136   cudaMalloc((void**)(&d_in2), in2_bytes);
137   cudaMalloc((void**)(&d_in3), in3_bytes);
138   cudaMalloc((void**)(&d_out), out_bytes);
139 
140   cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
141   cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
142   cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
143 
144   Eigen::CudaStreamDevice stream;
145   Eigen::GpuDevice gpu_device(&stream);
146 
147   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
148   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
149   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
150   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
151 
152   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
153 
154   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
155   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
156 
157   for (int i = 0; i < 72; ++i) {
158     for (int j = 0; j < 53; ++j) {
159       for (int k = 0; k < 97; ++k) {
160         VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
161       }
162     }
163   }
164 
165   cudaFree(d_in1);
166   cudaFree(d_in2);
167   cudaFree(d_in3);
168   cudaFree(d_out);
169 }
170 
171 void test_cuda_props() {
172   Tensor<float, 1> in1(200);
173   Tensor<bool, 1> out(200);
174   in1.setRandom();
175 
176   std::size_t in1_bytes = in1.size() * sizeof(float);
177   std::size_t out_bytes = out.size() * sizeof(bool);
178 
179   float* d_in1;
180   bool* d_out;
181   cudaMalloc((void**)(&d_in1), in1_bytes);
182   cudaMalloc((void**)(&d_out), out_bytes);
183 
184   cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
185 
186   Eigen::CudaStreamDevice stream;
187   Eigen::GpuDevice gpu_device(&stream);
188 
189   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
190       d_in1, 200);
191   Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
192       d_out, 200);
193 
194   gpu_out.device(gpu_device) = (gpu_in1.isnan)();
195 
196   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
197                          gpu_device.stream()) == cudaSuccess);
198   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
199 
200   for (int i = 0; i < 200; ++i) {
201     VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
202   }
203 
204   cudaFree(d_in1);
205   cudaFree(d_out);
206 }
207 
208 void test_cuda_reduction()
209 {
210   Tensor<float, 4> in1(72,53,97,113);
211   Tensor<float, 2> out(72,97);
212   in1.setRandom();
213 
214   std::size_t in1_bytes = in1.size() * sizeof(float);
215   std::size_t out_bytes = out.size() * sizeof(float);
216 
217   float* d_in1;
218   float* d_out;
219   cudaMalloc((void**)(&d_in1), in1_bytes);
220   cudaMalloc((void**)(&d_out), out_bytes);
221 
222   cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
223 
224   Eigen::CudaStreamDevice stream;
225   Eigen::GpuDevice gpu_device(&stream);
226 
227   Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
228   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
229 
230   array<Eigen::DenseIndex, 2> reduction_axis;
231   reduction_axis[0] = 1;
232   reduction_axis[1] = 3;
233 
234   gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
235 
236   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
237   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
238 
239   for (int i = 0; i < 72; ++i) {
240     for (int j = 0; j < 97; ++j) {
241       float expected = 0;
242       for (int k = 0; k < 53; ++k) {
243         for (int l = 0; l < 113; ++l) {
244           expected =
245               std::max<float>(expected, in1(i, k, j, l));
246         }
247       }
248       VERIFY_IS_APPROX(out(i,j), expected);
249     }
250   }
251 
252   cudaFree(d_in1);
253   cudaFree(d_out);
254 }
255 
256 template<int DataLayout>
257 void test_cuda_contraction()
258 {
259   // with these dimensions, the output has 300 * 140 elements, which is
260   // more than 30 * 1024, which is the number of threads in blocks on
261   // a 15 SM GK110 GPU
262   Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
263   Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
264   Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
265 
266   t_left.setRandom();
267   t_right.setRandom();
268 
269   std::size_t t_left_bytes = t_left.size()  * sizeof(float);
270   std::size_t t_right_bytes = t_right.size() * sizeof(float);
271   std::size_t t_result_bytes = t_result.size() * sizeof(float);
272 
273   float* d_t_left;
274   float* d_t_right;
275   float* d_t_result;
276 
277   cudaMalloc((void**)(&d_t_left), t_left_bytes);
278   cudaMalloc((void**)(&d_t_right), t_right_bytes);
279   cudaMalloc((void**)(&d_t_result), t_result_bytes);
280 
281   cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
282   cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
283 
284   Eigen::CudaStreamDevice stream;
285   Eigen::GpuDevice gpu_device(&stream);
286 
287   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
288   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
289   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
290 
291   typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
292   MapXf m_left(t_left.data(), 300, 93);
293   MapXf m_right(t_right.data(), 93, 140);
294   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
295 
296   typedef Tensor<float, 1>::DimensionPair DimPair;
297   Eigen::array<DimPair, 2> dims;
298   dims[0] = DimPair(2, 0);
299   dims[1] = DimPair(3, 1);
300 
301   m_result = m_left * m_right;
302   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
303 
304   cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
305 
306   for (DenseIndex i = 0; i < t_result.size(); i++) {
307     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
308       std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
309       assert(false);
310     }
311   }
312 
313   cudaFree(d_t_left);
314   cudaFree(d_t_right);
315   cudaFree(d_t_result);
316 }
317 
318 template<int DataLayout>
319 void test_cuda_convolution_1d()
320 {
321   Tensor<float, 4, DataLayout> input(74,37,11,137);
322   Tensor<float, 1, DataLayout> kernel(4);
323   Tensor<float, 4, DataLayout> out(74,34,11,137);
324   input = input.constant(10.0f) + input.random();
325   kernel = kernel.constant(7.0f) + kernel.random();
326 
327   std::size_t input_bytes = input.size() * sizeof(float);
328   std::size_t kernel_bytes = kernel.size() * sizeof(float);
329   std::size_t out_bytes = out.size() * sizeof(float);
330 
331   float* d_input;
332   float* d_kernel;
333   float* d_out;
334   cudaMalloc((void**)(&d_input), input_bytes);
335   cudaMalloc((void**)(&d_kernel), kernel_bytes);
336   cudaMalloc((void**)(&d_out), out_bytes);
337 
338   cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
339   cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
340 
341   Eigen::CudaStreamDevice stream;
342   Eigen::GpuDevice gpu_device(&stream);
343 
344   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
345   Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
346   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
347 
348   Eigen::array<Eigen::DenseIndex, 1> dims(1);
349   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
350 
351   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
352   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
353 
354   for (int i = 0; i < 74; ++i) {
355     for (int j = 0; j < 34; ++j) {
356       for (int k = 0; k < 11; ++k) {
357         for (int l = 0; l < 137; ++l) {
358           const float result = out(i,j,k,l);
359           const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
360                                  input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
361           VERIFY_IS_APPROX(result, expected);
362         }
363       }
364     }
365   }
366 
367   cudaFree(d_input);
368   cudaFree(d_kernel);
369   cudaFree(d_out);
370 }
371 
372 void test_cuda_convolution_inner_dim_col_major_1d()
373 {
374   Tensor<float, 4, ColMajor> input(74,9,11,7);
375   Tensor<float, 1, ColMajor> kernel(4);
376   Tensor<float, 4, ColMajor> out(71,9,11,7);
377   input = input.constant(10.0f) + input.random();
378   kernel = kernel.constant(7.0f) + kernel.random();
379 
380   std::size_t input_bytes = input.size() * sizeof(float);
381   std::size_t kernel_bytes = kernel.size() * sizeof(float);
382   std::size_t out_bytes = out.size() * sizeof(float);
383 
384   float* d_input;
385   float* d_kernel;
386   float* d_out;
387   cudaMalloc((void**)(&d_input), input_bytes);
388   cudaMalloc((void**)(&d_kernel), kernel_bytes);
389   cudaMalloc((void**)(&d_out), out_bytes);
390 
391   cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
392   cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
393 
394   Eigen::CudaStreamDevice stream;
395   Eigen::GpuDevice gpu_device(&stream);
396 
397   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
398   Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
399   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
400 
401   Eigen::array<Eigen::DenseIndex, 1> dims(0);
402   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
403 
404   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
405   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
406 
407   for (int i = 0; i < 71; ++i) {
408     for (int j = 0; j < 9; ++j) {
409       for (int k = 0; k < 11; ++k) {
410         for (int l = 0; l < 7; ++l) {
411           const float result = out(i,j,k,l);
412           const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
413                                  input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
414           VERIFY_IS_APPROX(result, expected);
415         }
416       }
417     }
418   }
419 
420   cudaFree(d_input);
421   cudaFree(d_kernel);
422   cudaFree(d_out);
423 }
424 
425 void test_cuda_convolution_inner_dim_row_major_1d()
426 {
427   Tensor<float, 4, RowMajor> input(7,9,11,74);
428   Tensor<float, 1, RowMajor> kernel(4);
429   Tensor<float, 4, RowMajor> out(7,9,11,71);
430   input = input.constant(10.0f) + input.random();
431   kernel = kernel.constant(7.0f) + kernel.random();
432 
433   std::size_t input_bytes = input.size() * sizeof(float);
434   std::size_t kernel_bytes = kernel.size() * sizeof(float);
435   std::size_t out_bytes = out.size() * sizeof(float);
436 
437   float* d_input;
438   float* d_kernel;
439   float* d_out;
440   cudaMalloc((void**)(&d_input), input_bytes);
441   cudaMalloc((void**)(&d_kernel), kernel_bytes);
442   cudaMalloc((void**)(&d_out), out_bytes);
443 
444   cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
445   cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
446 
447   Eigen::CudaStreamDevice stream;
448   Eigen::GpuDevice gpu_device(&stream);
449 
450   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
451   Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
452   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
453 
454   Eigen::array<Eigen::DenseIndex, 1> dims(3);
455   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
456 
457   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
458   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
459 
460   for (int i = 0; i < 7; ++i) {
461     for (int j = 0; j < 9; ++j) {
462       for (int k = 0; k < 11; ++k) {
463         for (int l = 0; l < 71; ++l) {
464           const float result = out(i,j,k,l);
465           const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
466                                  input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
467           VERIFY_IS_APPROX(result, expected);
468         }
469       }
470     }
471   }
472 
473   cudaFree(d_input);
474   cudaFree(d_kernel);
475   cudaFree(d_out);
476 }
477 
478 template<int DataLayout>
479 void test_cuda_convolution_2d()
480 {
481   Tensor<float, 4, DataLayout> input(74,37,11,137);
482   Tensor<float, 2, DataLayout> kernel(3,4);
483   Tensor<float, 4, DataLayout> out(74,35,8,137);
484   input = input.constant(10.0f) + input.random();
485   kernel = kernel.constant(7.0f) + kernel.random();
486 
487   std::size_t input_bytes = input.size() * sizeof(float);
488   std::size_t kernel_bytes = kernel.size() * sizeof(float);
489   std::size_t out_bytes = out.size() * sizeof(float);
490 
491   float* d_input;
492   float* d_kernel;
493   float* d_out;
494   cudaMalloc((void**)(&d_input), input_bytes);
495   cudaMalloc((void**)(&d_kernel), kernel_bytes);
496   cudaMalloc((void**)(&d_out), out_bytes);
497 
498   cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
499   cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
500 
501   Eigen::CudaStreamDevice stream;
502   Eigen::GpuDevice gpu_device(&stream);
503 
504   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
505   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
506   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
507 
508   Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
509   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
510 
511   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
512   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
513 
514   for (int i = 0; i < 74; ++i) {
515     for (int j = 0; j < 35; ++j) {
516       for (int k = 0; k < 8; ++k) {
517         for (int l = 0; l < 137; ++l) {
518           const float result = out(i,j,k,l);
519           const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
520                                  input(i,j+1,k+0,l) * kernel(1,0) +
521                                  input(i,j+2,k+0,l) * kernel(2,0) +
522                                  input(i,j+0,k+1,l) * kernel(0,1) +
523                                  input(i,j+1,k+1,l) * kernel(1,1) +
524                                  input(i,j+2,k+1,l) * kernel(2,1) +
525                                  input(i,j+0,k+2,l) * kernel(0,2) +
526                                  input(i,j+1,k+2,l) * kernel(1,2) +
527                                  input(i,j+2,k+2,l) * kernel(2,2) +
528                                  input(i,j+0,k+3,l) * kernel(0,3) +
529                                  input(i,j+1,k+3,l) * kernel(1,3) +
530                                  input(i,j+2,k+3,l) * kernel(2,3);
531           VERIFY_IS_APPROX(result, expected);
532         }
533       }
534     }
535   }
536 
537   cudaFree(d_input);
538   cudaFree(d_kernel);
539   cudaFree(d_out);
540 }
541 
542 template<int DataLayout>
543 void test_cuda_convolution_3d()
544 {
545   Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
546   Tensor<float, 3, DataLayout> kernel(3,4,2);
547   Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
548   input = input.constant(10.0f) + input.random();
549   kernel = kernel.constant(7.0f) + kernel.random();
550 
551   std::size_t input_bytes = input.size() * sizeof(float);
552   std::size_t kernel_bytes = kernel.size() * sizeof(float);
553   std::size_t out_bytes = out.size() * sizeof(float);
554 
555   float* d_input;
556   float* d_kernel;
557   float* d_out;
558   cudaMalloc((void**)(&d_input), input_bytes);
559   cudaMalloc((void**)(&d_kernel), kernel_bytes);
560   cudaMalloc((void**)(&d_out), out_bytes);
561 
562   cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
563   cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
564 
565   Eigen::CudaStreamDevice stream;
566   Eigen::GpuDevice gpu_device(&stream);
567 
568   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
569   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
570   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
571 
572   Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
573   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
574 
575   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
576   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
577 
578   for (int i = 0; i < 74; ++i) {
579     for (int j = 0; j < 35; ++j) {
580       for (int k = 0; k < 8; ++k) {
581         for (int l = 0; l < 136; ++l) {
582           for (int m = 0; m < 17; ++m) {
583             const float result = out(i,j,k,l,m);
584             const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
585                                    input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
586                                    input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
587                                    input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
588                                    input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
589                                    input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
590                                    input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
591                                    input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
592                                    input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
593                                    input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
594                                    input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
595                                    input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
596                                    input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
597                                    input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
598                                    input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
599                                    input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
600                                    input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
601                                    input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
602                                    input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
603                                    input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
604                                    input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
605                                    input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
606                                    input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
607                                    input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
608             VERIFY_IS_APPROX(result, expected);
609           }
610         }
611       }
612     }
613   }
614 
615   cudaFree(d_input);
616   cudaFree(d_kernel);
617   cudaFree(d_out);
618 }
619 
620 
621 template <typename Scalar>
622 void test_cuda_lgamma(const Scalar stddev)
623 {
624   Tensor<Scalar, 2> in(72,97);
625   in.setRandom();
626   in *= in.constant(stddev);
627   Tensor<Scalar, 2> out(72,97);
628   out.setZero();
629 
630   std::size_t bytes = in.size() * sizeof(Scalar);
631 
632   Scalar* d_in;
633   Scalar* d_out;
634   cudaMalloc((void**)(&d_in), bytes);
635   cudaMalloc((void**)(&d_out), bytes);
636 
637   cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
638 
639   Eigen::CudaStreamDevice stream;
640   Eigen::GpuDevice gpu_device(&stream);
641 
642   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
643   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
644 
645   gpu_out.device(gpu_device) = gpu_in.lgamma();
646 
647   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
648   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
649 
650   for (int i = 0; i < 72; ++i) {
651     for (int j = 0; j < 97; ++j) {
652       VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
653     }
654   }
655 
656   cudaFree(d_in);
657   cudaFree(d_out);
658 }
659 
660 template <typename Scalar>
661 void test_cuda_digamma()
662 {
663   Tensor<Scalar, 1> in(7);
664   Tensor<Scalar, 1> out(7);
665   Tensor<Scalar, 1> expected_out(7);
666   out.setZero();
667 
668   in(0) = Scalar(1);
669   in(1) = Scalar(1.5);
670   in(2) = Scalar(4);
671   in(3) = Scalar(-10.5);
672   in(4) = Scalar(10000.5);
673   in(5) = Scalar(0);
674   in(6) = Scalar(-1);
675 
676   expected_out(0) = Scalar(-0.5772156649015329);
677   expected_out(1) = Scalar(0.03648997397857645);
678   expected_out(2) = Scalar(1.2561176684318);
679   expected_out(3) = Scalar(2.398239129535781);
680   expected_out(4) = Scalar(9.210340372392849);
681   expected_out(5) = std::numeric_limits<Scalar>::infinity();
682   expected_out(6) = std::numeric_limits<Scalar>::infinity();
683 
684   std::size_t bytes = in.size() * sizeof(Scalar);
685 
686   Scalar* d_in;
687   Scalar* d_out;
688   cudaMalloc((void**)(&d_in), bytes);
689   cudaMalloc((void**)(&d_out), bytes);
690 
691   cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
692 
693   Eigen::CudaStreamDevice stream;
694   Eigen::GpuDevice gpu_device(&stream);
695 
696   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
697   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
698 
699   gpu_out.device(gpu_device) = gpu_in.digamma();
700 
701   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
702   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
703 
704   for (int i = 0; i < 5; ++i) {
705     VERIFY_IS_APPROX(out(i), expected_out(i));
706   }
707   for (int i = 5; i < 7; ++i) {
708     VERIFY_IS_EQUAL(out(i), expected_out(i));
709   }
710 
711   cudaFree(d_in);
712   cudaFree(d_out);
713 }
714 
715 template <typename Scalar>
716 void test_cuda_zeta()
717 {
718   Tensor<Scalar, 1> in_x(6);
719   Tensor<Scalar, 1> in_q(6);
720   Tensor<Scalar, 1> out(6);
721   Tensor<Scalar, 1> expected_out(6);
722   out.setZero();
723 
724   in_x(0) = Scalar(1);
725   in_x(1) = Scalar(1.5);
726   in_x(2) = Scalar(4);
727   in_x(3) = Scalar(-10.5);
728   in_x(4) = Scalar(10000.5);
729   in_x(5) = Scalar(3);
730 
731   in_q(0) = Scalar(1.2345);
732   in_q(1) = Scalar(2);
733   in_q(2) = Scalar(1.5);
734   in_q(3) = Scalar(3);
735   in_q(4) = Scalar(1.0001);
736   in_q(5) = Scalar(-2.5);
737 
738   expected_out(0) = std::numeric_limits<Scalar>::infinity();
739   expected_out(1) = Scalar(1.61237534869);
740   expected_out(2) = Scalar(0.234848505667);
741   expected_out(3) = Scalar(1.03086757337e-5);
742   expected_out(4) = Scalar(0.367879440865);
743   expected_out(5) = Scalar(0.054102025820864097);
744 
745   std::size_t bytes = in_x.size() * sizeof(Scalar);
746 
747   Scalar* d_in_x;
748   Scalar* d_in_q;
749   Scalar* d_out;
750   cudaMalloc((void**)(&d_in_x), bytes);
751   cudaMalloc((void**)(&d_in_q), bytes);
752   cudaMalloc((void**)(&d_out), bytes);
753 
754   cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
755   cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
756 
757   Eigen::CudaStreamDevice stream;
758   Eigen::GpuDevice gpu_device(&stream);
759 
760   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
761   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
762   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
763 
764   gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
765 
766   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
767   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
768 
769   VERIFY_IS_EQUAL(out(0), expected_out(0));
770   VERIFY((std::isnan)(out(3)));
771 
772   for (int i = 1; i < 6; ++i) {
773     if (i != 3) {
774       VERIFY_IS_APPROX(out(i), expected_out(i));
775     }
776   }
777 
778   cudaFree(d_in_x);
779   cudaFree(d_in_q);
780   cudaFree(d_out);
781 }
782 
783 template <typename Scalar>
784 void test_cuda_polygamma()
785 {
786   Tensor<Scalar, 1> in_x(7);
787   Tensor<Scalar, 1> in_n(7);
788   Tensor<Scalar, 1> out(7);
789   Tensor<Scalar, 1> expected_out(7);
790   out.setZero();
791 
792   in_n(0) = Scalar(1);
793   in_n(1) = Scalar(1);
794   in_n(2) = Scalar(1);
795   in_n(3) = Scalar(17);
796   in_n(4) = Scalar(31);
797   in_n(5) = Scalar(28);
798   in_n(6) = Scalar(8);
799 
800   in_x(0) = Scalar(2);
801   in_x(1) = Scalar(3);
802   in_x(2) = Scalar(25.5);
803   in_x(3) = Scalar(4.7);
804   in_x(4) = Scalar(11.8);
805   in_x(5) = Scalar(17.7);
806   in_x(6) = Scalar(30.2);
807 
808   expected_out(0) = Scalar(0.644934066848);
809   expected_out(1) = Scalar(0.394934066848);
810   expected_out(2) = Scalar(0.0399946696496);
811   expected_out(3) = Scalar(293.334565435);
812   expected_out(4) = Scalar(0.445487887616);
813   expected_out(5) = Scalar(-2.47810300902e-07);
814   expected_out(6) = Scalar(-8.29668781082e-09);
815 
816   std::size_t bytes = in_x.size() * sizeof(Scalar);
817 
818   Scalar* d_in_x;
819   Scalar* d_in_n;
820   Scalar* d_out;
821   cudaMalloc((void**)(&d_in_x), bytes);
822   cudaMalloc((void**)(&d_in_n), bytes);
823   cudaMalloc((void**)(&d_out), bytes);
824 
825   cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
826   cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
827 
828   Eigen::CudaStreamDevice stream;
829   Eigen::GpuDevice gpu_device(&stream);
830 
831   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
832   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
833   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
834 
835   gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
836 
837   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
838   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
839 
840   for (int i = 0; i < 7; ++i) {
841     VERIFY_IS_APPROX(out(i), expected_out(i));
842   }
843 
844   cudaFree(d_in_x);
845   cudaFree(d_in_n);
846   cudaFree(d_out);
847 }
848 
849 template <typename Scalar>
850 void test_cuda_igamma()
851 {
852   Tensor<Scalar, 2> a(6, 6);
853   Tensor<Scalar, 2> x(6, 6);
854   Tensor<Scalar, 2> out(6, 6);
855   out.setZero();
856 
857   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
858   Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
859 
860   for (int i = 0; i < 6; ++i) {
861     for (int j = 0; j < 6; ++j) {
862       a(i, j) = a_s[i];
863       x(i, j) = x_s[j];
864     }
865   }
866 
867   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
868   Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
869                           {0.0, 0.6321205588285578, 0.7768698398515702,
870                            0.9816843611112658, 9.999500016666262e-05, 1.0},
871                           {0.0, 0.4275932955291202, 0.608374823728911,
872                            0.9539882943107686, 7.522076445089201e-07, 1.0},
873                           {0.0, 0.01898815687615381, 0.06564245437845008,
874                            0.5665298796332909, 4.166333347221828e-18, 1.0},
875                           {0.0, 0.9999780593618628, 0.9999899967080838,
876                            0.9999996219837988, 0.9991370418689945, 1.0},
877                           {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
878 
879 
880 
881   std::size_t bytes = a.size() * sizeof(Scalar);
882 
883   Scalar* d_a;
884   Scalar* d_x;
885   Scalar* d_out;
886   assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
887   assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
888   assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
889 
890   cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
891   cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
892 
893   Eigen::CudaStreamDevice stream;
894   Eigen::GpuDevice gpu_device(&stream);
895 
896   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
897   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
898   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
899 
900   gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
901 
902   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
903   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
904 
905   for (int i = 0; i < 6; ++i) {
906     for (int j = 0; j < 6; ++j) {
907       if ((std::isnan)(igamma_s[i][j])) {
908         VERIFY((std::isnan)(out(i, j)));
909       } else {
910         VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
911       }
912     }
913   }
914 
915   cudaFree(d_a);
916   cudaFree(d_x);
917   cudaFree(d_out);
918 }
919 
920 template <typename Scalar>
921 void test_cuda_igammac()
922 {
923   Tensor<Scalar, 2> a(6, 6);
924   Tensor<Scalar, 2> x(6, 6);
925   Tensor<Scalar, 2> out(6, 6);
926   out.setZero();
927 
928   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
929   Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
930 
931   for (int i = 0; i < 6; ++i) {
932     for (int j = 0; j < 6; ++j) {
933       a(i, j) = a_s[i];
934       x(i, j) = x_s[j];
935     }
936   }
937 
938   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
939   Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
940                            {1.0, 0.36787944117144233, 0.22313016014842982,
941                             0.018315638888734182, 0.9999000049998333, 0.0},
942                            {1.0, 0.5724067044708798, 0.3916251762710878,
943                             0.04601170568923136, 0.9999992477923555, 0.0},
944                            {1.0, 0.9810118431238462, 0.9343575456215499,
945                             0.4334701203667089, 1.0, 0.0},
946                            {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
947                             3.7801620118431334e-07, 0.0008629581310054535,
948                             0.0},
949                            {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
950 
951   std::size_t bytes = a.size() * sizeof(Scalar);
952 
953   Scalar* d_a;
954   Scalar* d_x;
955   Scalar* d_out;
956   cudaMalloc((void**)(&d_a), bytes);
957   cudaMalloc((void**)(&d_x), bytes);
958   cudaMalloc((void**)(&d_out), bytes);
959 
960   cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
961   cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
962 
963   Eigen::CudaStreamDevice stream;
964   Eigen::GpuDevice gpu_device(&stream);
965 
966   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
967   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
968   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
969 
970   gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
971 
972   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
973   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
974 
975   for (int i = 0; i < 6; ++i) {
976     for (int j = 0; j < 6; ++j) {
977       if ((std::isnan)(igammac_s[i][j])) {
978         VERIFY((std::isnan)(out(i, j)));
979       } else {
980         VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
981       }
982     }
983   }
984 
985   cudaFree(d_a);
986   cudaFree(d_x);
987   cudaFree(d_out);
988 }
989 
990 template <typename Scalar>
991 void test_cuda_erf(const Scalar stddev)
992 {
993   Tensor<Scalar, 2> in(72,97);
994   in.setRandom();
995   in *= in.constant(stddev);
996   Tensor<Scalar, 2> out(72,97);
997   out.setZero();
998 
999   std::size_t bytes = in.size() * sizeof(Scalar);
1000 
1001   Scalar* d_in;
1002   Scalar* d_out;
1003   assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
1004   assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
1005 
1006   cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
1007 
1008   Eigen::CudaStreamDevice stream;
1009   Eigen::GpuDevice gpu_device(&stream);
1010 
1011   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
1012   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
1013 
1014   gpu_out.device(gpu_device) = gpu_in.erf();
1015 
1016   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
1017   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
1018 
1019   for (int i = 0; i < 72; ++i) {
1020     for (int j = 0; j < 97; ++j) {
1021       VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
1022     }
1023   }
1024 
1025   cudaFree(d_in);
1026   cudaFree(d_out);
1027 }
1028 
1029 template <typename Scalar>
1030 void test_cuda_erfc(const Scalar stddev)
1031 {
1032   Tensor<Scalar, 2> in(72,97);
1033   in.setRandom();
1034   in *= in.constant(stddev);
1035   Tensor<Scalar, 2> out(72,97);
1036   out.setZero();
1037 
1038   std::size_t bytes = in.size() * sizeof(Scalar);
1039 
1040   Scalar* d_in;
1041   Scalar* d_out;
1042   cudaMalloc((void**)(&d_in), bytes);
1043   cudaMalloc((void**)(&d_out), bytes);
1044 
1045   cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
1046 
1047   Eigen::CudaStreamDevice stream;
1048   Eigen::GpuDevice gpu_device(&stream);
1049 
1050   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
1051   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
1052 
1053   gpu_out.device(gpu_device) = gpu_in.erfc();
1054 
1055   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
1056   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
1057 
1058   for (int i = 0; i < 72; ++i) {
1059     for (int j = 0; j < 97; ++j) {
1060       VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
1061     }
1062   }
1063 
1064   cudaFree(d_in);
1065   cudaFree(d_out);
1066 }
1067 
1068 template <typename Scalar>
1069 void test_cuda_betainc()
1070 {
1071   Tensor<Scalar, 1> in_x(125);
1072   Tensor<Scalar, 1> in_a(125);
1073   Tensor<Scalar, 1> in_b(125);
1074   Tensor<Scalar, 1> out(125);
1075   Tensor<Scalar, 1> expected_out(125);
1076   out.setZero();
1077 
1078   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
1079 
1080   Array<Scalar, 1, Dynamic> x(125);
1081   Array<Scalar, 1, Dynamic> a(125);
1082   Array<Scalar, 1, Dynamic> b(125);
1083   Array<Scalar, 1, Dynamic> v(125);
1084 
1085   a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1086       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1087       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1088       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1089       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1090       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1091       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1092       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1093       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1094       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1095       0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
1096       0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
1097       0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1098       31.62177660168379, 31.62177660168379, 31.62177660168379,
1099       31.62177660168379, 31.62177660168379, 31.62177660168379,
1100       31.62177660168379, 31.62177660168379, 31.62177660168379,
1101       31.62177660168379, 31.62177660168379, 31.62177660168379,
1102       31.62177660168379, 31.62177660168379, 31.62177660168379,
1103       31.62177660168379, 31.62177660168379, 31.62177660168379,
1104       31.62177660168379, 31.62177660168379, 31.62177660168379,
1105       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1106       999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
1107       999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
1108       999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
1109 
1110   b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1111       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1112       0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1113       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1114       999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
1115       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1116       0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1117       31.62177660168379, 31.62177660168379, 31.62177660168379,
1118       31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
1119       0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1120       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1121       0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1122       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1123       999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
1124       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1125       0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1126       31.62177660168379, 31.62177660168379, 31.62177660168379,
1127       31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
1128       0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1129       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1130       0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1131       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1132       999.999, 999.999, 999.999;
1133 
1134   x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1135       1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
1136       0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
1137       0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
1138       0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
1139       -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1140       1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
1141       0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
1142       0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
1143 
1144   v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
1145       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
1146       nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
1147       0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
1148       0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
1149       0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
1150       nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
1151       0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
1152       0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
1153       0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
1154       0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
1155       1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
1156       nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
1157       nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
1158       nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
1159       nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
1160       2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
1161       1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
1162       2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
1163       0.5000000000004947, 0.9999999999999999, nan;
1164 
1165   for (int i = 0; i < 125; ++i) {
1166     in_x(i) = x(i);
1167     in_a(i) = a(i);
1168     in_b(i) = b(i);
1169     expected_out(i) = v(i);
1170   }
1171 
1172   std::size_t bytes = in_x.size() * sizeof(Scalar);
1173 
1174   Scalar* d_in_x;
1175   Scalar* d_in_a;
1176   Scalar* d_in_b;
1177   Scalar* d_out;
1178   cudaMalloc((void**)(&d_in_x), bytes);
1179   cudaMalloc((void**)(&d_in_a), bytes);
1180   cudaMalloc((void**)(&d_in_b), bytes);
1181   cudaMalloc((void**)(&d_out), bytes);
1182 
1183   cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
1184   cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
1185   cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
1186 
1187   Eigen::CudaStreamDevice stream;
1188   Eigen::GpuDevice gpu_device(&stream);
1189 
1190   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
1191   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
1192   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
1193   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
1194 
1195   gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
1196 
1197   assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
1198   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
1199 
1200   for (int i = 1; i < 125; ++i) {
1201     if ((std::isnan)(expected_out(i))) {
1202       VERIFY((std::isnan)(out(i)));
1203     } else {
1204       VERIFY_IS_APPROX(out(i), expected_out(i));
1205     }
1206   }
1207 
1208   cudaFree(d_in_x);
1209   cudaFree(d_in_a);
1210   cudaFree(d_in_b);
1211   cudaFree(d_out);
1212 }
1213 
1214 
1215 void test_cxx11_tensor_cuda()
1216 {
1217   CALL_SUBTEST_1(test_cuda_nullary());
1218   CALL_SUBTEST_1(test_cuda_elementwise_small());
1219   CALL_SUBTEST_1(test_cuda_elementwise());
1220   CALL_SUBTEST_1(test_cuda_props());
1221   CALL_SUBTEST_1(test_cuda_reduction());
1222   CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
1223   CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
1224   CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
1225   CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
1226   CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
1227   CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
1228   CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
1229   CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
1230   CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
1231   CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
1232 
1233 #if __cplusplus > 199711L
1234   // std::erf, std::erfc, and so on where only added in c++11. We use them
1235   // as a golden reference to validate the results produced by Eigen. Therefore
1236   // we can only run these tests if we use a c++11 compiler.
1237   CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
1238   CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
1239   CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
1240   CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
1241 
1242   CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
1243   CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
1244   CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
1245   CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
1246 
1247   CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
1248   CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
1249   CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
1250   CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
1251 
1252   CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
1253   // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
1254   CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
1255   CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
1256   CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
1257 
1258   CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
1259   CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
1260   CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
1261   CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
1262 
1263   CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
1264   // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
1265   CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
1266   CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
1267   CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
1268 
1269   CALL_SUBTEST_5(test_cuda_digamma<float>());
1270   CALL_SUBTEST_5(test_cuda_digamma<double>());
1271 
1272   CALL_SUBTEST_5(test_cuda_polygamma<float>());
1273   CALL_SUBTEST_5(test_cuda_polygamma<double>());
1274 
1275   CALL_SUBTEST_5(test_cuda_zeta<float>());
1276   CALL_SUBTEST_5(test_cuda_zeta<double>());
1277 
1278   CALL_SUBTEST_5(test_cuda_igamma<float>());
1279   CALL_SUBTEST_5(test_cuda_igammac<float>());
1280 
1281   CALL_SUBTEST_5(test_cuda_igamma<double>());
1282   CALL_SUBTEST_5(test_cuda_igammac<double>());
1283 
1284   CALL_SUBTEST_6(test_cuda_betainc<float>());
1285   CALL_SUBTEST_6(test_cuda_betainc<double>());
1286 #endif
1287 }
1288