// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

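// Check that nullary expressions (constant() and random()) can be assigned
// directly to device tensors and read back correctly on the host.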
void test_cuda_nullary() {
  Tensor<float, 1, 0, int> in1(2);
  Tensor<float, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t tensor_bytes = in1.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  cudaMalloc((void**)(&d_in1), tensor_bytes);
  cudaMalloc((void**)(&d_in2), tensor_bytes);
  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);

  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
  gpu_in2.device(gpu_device) = gpu_in2.random();

  Tensor<float, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);

  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), 3.14f);
    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
}

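// Element-wise sum of two tiny (2-element) tensors evaluated on the device.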
void test_cuda_elementwise_small() {
  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
  in1.setRandom();
  in2.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(
        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
}

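// Fused element-wise expression (in1 + in2 * in3) on a 72x53x97 tensor.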
void test_cuda_elementwise()
{
  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  in1.setRandom();
  in2.setRandom();
  in3.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t in3_bytes = in3.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_in3;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_in3), in3_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 53; ++j) {
      for (int k = 0; k < 97; ++k) {
        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
      }
    }
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_in3);
  cudaFree(d_out);
}

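// Per-coefficient predicate: evaluate isnan() on the device and compare
// against std::isnan on the host.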
void test_cuda_props() {
  Tensor<float, 1> in1(200);
  Tensor<bool, 1> out(200);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(bool);

  float* d_in1;
  bool* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, 200);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
      d_out, 200);

  gpu_out.device(gpu_device) = (gpu_in1.isnan)();

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 200; ++i) {
    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}

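// Maximum reduction over dimensions 1 and 3 of a 4-D tensor, checked
// against a nested-loop reference computed on the host.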
void test_cuda_reduction()
{
  Tensor<float, 4> in1(72,53,97,113);
  Tensor<float, 2> out(72,97);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  array<Eigen::DenseIndex, 2> reduction_axis;
  reduction_axis[0] = 1;
  reduction_axis[1] = 3;

  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      float expected = 0;
      for (int k = 0; k < 53; ++k) {
        for (int l = 0; l < 113; ++l) {
          expected =
              std::max<float>(expected, in1(i, k, j, l));
        }
      }
      VERIFY_IS_APPROX(out(i,j), expected);
    }
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}

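// Tensor contraction over two index pairs, checked against the equivalent
// matrix product computed on the host.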
template<int DataLayout>
void test_cuda_contraction()
{
  // With these dimensions, the output has 300 * 140 elements, which is
  // more than the 30 * 1024 threads that can be resident in the blocks
  // of a 15 SM GK110 GPU.
  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;

  cudaMalloc((void**)(&d_t_left), t_left_bytes);
  cudaMalloc((void**)(&d_t_right), t_right_bytes);
  cudaMalloc((void**)(&d_t_result), t_result_bytes);

  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);

  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
  MapXf m_left(t_left.data(), 300, 93);
  MapXf m_right(t_right.data(), 93, 140);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims;
  dims[0] = DimPair(2, 0);
  dims[1] = DimPair(3, 1);

  m_result = m_left * m_right;
  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);

  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  cudaFree(d_t_left);
  cudaFree(d_t_right);
  cudaFree(d_t_result);
}

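// 1-D convolution along dimension 1: a kernel of size 4 shrinks that
// dimension from 37 to 34.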
template<int DataLayout>
void test_cuda_convolution_1d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 1, DataLayout> kernel(4);
  Tensor<float, 4, DataLayout> out(74,34,11,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);

  Eigen::array<Eigen::DenseIndex, 1> dims(1);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 34; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

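// 1-D convolution along the innermost dimension (dimension 0) of a
// column-major tensor.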
void test_cuda_convolution_inner_dim_col_major_1d()
{
  Tensor<float, 4, ColMajor> input(74,9,11,7);
  Tensor<float, 1, ColMajor> kernel(4);
  Tensor<float, 4, ColMajor> out(71,9,11,7);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);

  Eigen::array<Eigen::DenseIndex, 1> dims(0);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 71; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 7; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

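// 1-D convolution along the innermost dimension (dimension 3) of a
// row-major tensor.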
void test_cuda_convolution_inner_dim_row_major_1d()
{
  Tensor<float, 4, RowMajor> input(7,9,11,74);
  Tensor<float, 1, RowMajor> kernel(4);
  Tensor<float, 4, RowMajor> out(7,9,11,71);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);

  Eigen::array<Eigen::DenseIndex, 1> dims(3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 71; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

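// 2-D convolution over dimensions 1 and 2 with a 3x4 kernel.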
template<int DataLayout>
void test_cuda_convolution_2d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 2, DataLayout> kernel(3,4);
  Tensor<float, 4, DataLayout> out(74,35,8,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);

  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
                                 input(i,j+1,k+0,l) * kernel(1,0) +
                                 input(i,j+2,k+0,l) * kernel(2,0) +
                                 input(i,j+0,k+1,l) * kernel(0,1) +
                                 input(i,j+1,k+1,l) * kernel(1,1) +
                                 input(i,j+2,k+1,l) * kernel(2,1) +
                                 input(i,j+0,k+2,l) * kernel(0,2) +
                                 input(i,j+1,k+2,l) * kernel(1,2) +
                                 input(i,j+2,k+2,l) * kernel(2,2) +
                                 input(i,j+0,k+3,l) * kernel(0,3) +
                                 input(i,j+1,k+3,l) * kernel(1,3) +
                                 input(i,j+2,k+3,l) * kernel(2,3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

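// 3-D convolution over dimensions 1, 2 and 3 with a 3x4x2 kernel.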
template<int DataLayout>
void test_cuda_convolution_3d()
{
  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
  Tensor<float, 3, DataLayout> kernel(3,4,2);
  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);

  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 136; ++l) {
          for (int m = 0; m < 17; ++m) {
            const float result = out(i,j,k,l,m);
            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
            VERIFY_IS_APPROX(result, expected);
          }
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

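// lgamma() evaluated on the device, compared against std::lgamma for
// random inputs scaled by stddev.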
template <typename Scalar>
void test_cuda_lgamma(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.lgamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

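// digamma() at a few reference points, including the poles at 0 and -1
// where the expected result is infinite.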
template <typename Scalar>
void test_cuda_digamma()
{
  Tensor<Scalar, 1> in(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in(0) = Scalar(1);
  in(1) = Scalar(1.5);
  in(2) = Scalar(4);
  in(3) = Scalar(-10.5);
  in(4) = Scalar(10000.5);
  in(5) = Scalar(0);
  in(6) = Scalar(-1);

  expected_out(0) = Scalar(-0.5772156649015329);
  expected_out(1) = Scalar(0.03648997397857645);
  expected_out(2) = Scalar(1.2561176684318);
  expected_out(3) = Scalar(2.398239129535781);
  expected_out(4) = Scalar(9.210340372392849);
  expected_out(5) = std::numeric_limits<Scalar>::infinity();
  expected_out(6) = std::numeric_limits<Scalar>::infinity();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in.digamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 5; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }
  for (int i = 5; i < 7; ++i) {
    VERIFY_IS_EQUAL(out(i), expected_out(i));
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

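// Hurwitz zeta function zeta(x, q) at a few reference points, including
// an input pair that yields infinity and one that yields NaN.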
template <typename Scalar>
void test_cuda_zeta()
{
  Tensor<Scalar, 1> in_x(6);
  Tensor<Scalar, 1> in_q(6);
  Tensor<Scalar, 1> out(6);
  Tensor<Scalar, 1> expected_out(6);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(1.5);
  in_x(2) = Scalar(4);
  in_x(3) = Scalar(-10.5);
  in_x(4) = Scalar(10000.5);
  in_x(5) = Scalar(3);

  in_q(0) = Scalar(1.2345);
  in_q(1) = Scalar(2);
  in_q(2) = Scalar(1.5);
  in_q(3) = Scalar(3);
  in_q(4) = Scalar(1.0001);
  in_q(5) = Scalar(-2.5);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = Scalar(1.61237534869);
  expected_out(2) = Scalar(0.234848505667);
  expected_out(3) = Scalar(1.03086757337e-5);
  expected_out(4) = Scalar(0.367879440865);
  expected_out(5) = Scalar(0.054102025820864097);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_q;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_q), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);

  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY((std::isnan)(out(3)));

  for (int i = 1; i < 6; ++i) {
    if (i != 3) {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_q);
  cudaFree(d_out);
}

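// polygamma(n, x) at a few reference points with integer orders up to 31.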
template <typename Scalar>
void test_cuda_polygamma()
{
  Tensor<Scalar, 1> in_x(7);
  Tensor<Scalar, 1> in_n(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in_n(0) = Scalar(1);
  in_n(1) = Scalar(1);
  in_n(2) = Scalar(1);
  in_n(3) = Scalar(17);
  in_n(4) = Scalar(31);
  in_n(5) = Scalar(28);
  in_n(6) = Scalar(8);

  in_x(0) = Scalar(2);
  in_x(1) = Scalar(3);
  in_x(2) = Scalar(25.5);
  in_x(3) = Scalar(4.7);
  in_x(4) = Scalar(11.8);
  in_x(5) = Scalar(17.7);
  in_x(6) = Scalar(30.2);

  expected_out(0) = Scalar(0.644934066848);
  expected_out(1) = Scalar(0.394934066848);
  expected_out(2) = Scalar(0.0399946696496);
  expected_out(3) = Scalar(293.334565435);
  expected_out(4) = Scalar(0.445487887616);
  expected_out(5) = Scalar(-2.47810300902e-07);
  expected_out(6) = Scalar(-8.29668781082e-09);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_n;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_n), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  cudaFree(d_in_x);
  cudaFree(d_in_n);
  cudaFree(d_out);
}

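// Lower incomplete gamma function igamma(a, x) on a 6x6 grid of (a, x)
// values; NaN is expected for a = 0 with x > 0.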
template <typename Scalar>
void test_cuda_igamma()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
                          {0.0, 0.6321205588285578, 0.7768698398515702,
                           0.9816843611112658, 9.999500016666262e-05, 1.0},
                          {0.0, 0.4275932955291202, 0.608374823728911,
                           0.9539882943107686, 7.522076445089201e-07, 1.0},
                          {0.0, 0.01898815687615381, 0.06564245437845008,
                           0.5665298796332909, 4.166333347221828e-18, 1.0},
                          {0.0, 0.9999780593618628, 0.9999899967080838,
                           0.9999996219837988, 0.9991370418689945, 1.0},
                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igamma_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}

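// Complementary (upper) incomplete gamma function igammac(a, x) on the
// same 6x6 grid of (a, x) values.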
template <typename Scalar>
void test_cuda_igammac()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
                           {1.0, 0.36787944117144233, 0.22313016014842982,
                            0.018315638888734182, 0.9999000049998333, 0.0},
                           {1.0, 0.5724067044708798, 0.3916251762710878,
                            0.04601170568923136, 0.9999992477923555, 0.0},
                           {1.0, 0.9810118431238462, 0.9343575456215499,
                            0.4334701203667089, 1.0, 0.0},
                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
                            3.7801620118431334e-07, 0.0008629581310054535,
                            0.0},
                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  cudaMalloc((void**)(&d_a), bytes);
  cudaMalloc((void**)(&d_x), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igammac_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}

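// erf() evaluated on the device, compared against std::erf.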
template <typename Scalar>
void test_cuda_erf(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erf();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

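// erfc() evaluated on the device, compared against std::erfc.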
template <typename Scalar>
void test_cuda_erfc(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erfc();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

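// Regularized incomplete beta function betainc(a, b, x) on 125 (a, b, x)
// triples; NaN is expected wherever an input is out of the valid domain.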
template <typename Scalar>
void test_cuda_betainc()
{
  Tensor<Scalar, 1> in_x(125);
  Tensor<Scalar, 1> in_a(125);
  Tensor<Scalar, 1> in_b(125);
  Tensor<Scalar, 1> out(125);
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();

  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
  Array<Scalar, 1, Dynamic> v(125);

  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;

  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999;

  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
      0.5000000000004947, 0.9999999999999999, nan;

  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
    expected_out(i) = v(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_a;
  Scalar* d_in_b;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_a), bytes);
  cudaMalloc((void**)(&d_in_b), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);

  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 1; i < 125; ++i) {
    if ((std::isnan)(expected_out(i))) {
      VERIFY((std::isnan)(out(i)));
    } else {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_a);
  cudaFree(d_in_b);
  cudaFree(d_out);
}

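// Test driver: the Eigen test harness invokes this entry point (named by the
// EIGEN_TEST_FUNC define above) and runs the registered subtests.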
void test_cxx11_tensor_cuda()
{
  CALL_SUBTEST_1(test_cuda_nullary());
  CALL_SUBTEST_1(test_cuda_elementwise_small());
  CALL_SUBTEST_1(test_cuda_elementwise());
  CALL_SUBTEST_1(test_cuda_props());
  CALL_SUBTEST_1(test_cuda_reduction());
  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());

#if __cplusplus > 199711L
  // std::erf, std::erfc, and so on were only added in C++11. We use them
  // as a golden reference to validate the results produced by Eigen. Therefore
  // we can only run these tests if we use a C++11 compiler.
  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f));  // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0));  // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));

  CALL_SUBTEST_5(test_cuda_digamma<float>());
  CALL_SUBTEST_5(test_cuda_digamma<double>());

  CALL_SUBTEST_5(test_cuda_polygamma<float>());
  CALL_SUBTEST_5(test_cuda_polygamma<double>());

  CALL_SUBTEST_5(test_cuda_zeta<float>());
  CALL_SUBTEST_5(test_cuda_zeta<double>());

  CALL_SUBTEST_5(test_cuda_igamma<float>());
  CALL_SUBTEST_5(test_cuda_igammac<float>());

  CALL_SUBTEST_5(test_cuda_igamma<double>());
  CALL_SUBTEST_5(test_cuda_igammac<double>());

  CALL_SUBTEST_6(test_cuda_betainc<float>());
  CALL_SUBTEST_6(test_cuda_betainc<double>());
#endif
}