/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <complex>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/linalg/determinant_op.h"
#include "tensorflow/core/util/cuda_solvers.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {
namespace functor {

typedef Eigen::GpuDevice GPUDevice;
namespace {
__device__ int PermutationOrder(int n, const int* __restrict__ pivots) {
  // Compute the order of the permutation from the number of transpositions
  // encoded in the pivot array, see:
  // http://icl.cs.utk.edu/lapack-forum/viewtopic.php?f=2&t=340
  int order = 0;
  for (int i = 0; i < n - 1; ++i) {
    // Notice: Internally, the cuBlas code uses Fortran convention (1-based)
    // indexing so we expect pivots[i] == i + 1 for rows that were not moved.
    order += pivots[i] != (i + 1);
  }
  return order;
}

#if defined(__CUDACC__)
// Hack around missing support for complex in NVCC.
template <typename T>
__device__ inline std::complex<T> complex_multiply(const std::complex<T>& a,
                                                   const std::complex<T>& b) {
  const T a_real = Eigen::numext::real(a);
  const T a_imag = Eigen::numext::imag(a);
  const T b_real = Eigen::numext::real(b);
  const T b_imag = Eigen::numext::imag(b);
  return std::complex<T>(a_real * b_real - a_imag * b_imag,
                         a_real * b_imag + a_imag * b_real);
}
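// Device-side operator overloads for complex64/complex128. The kernel below
// relies on these to accumulate the sign (complex * complex) and to normalize
// a diagonal entry by its absolute value (complex / real).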
__device__ inline complex64 operator*(const complex64& a, const complex64& b) {
  return complex_multiply<float>(a, b);
}
__device__ inline complex64 operator*(const complex64& a, const float& b) {
  return complex64(Eigen::numext::real(a) * b, Eigen::numext::imag(a) * b);
}
__device__ inline complex64 operator/(const complex64& a, const float& b) {
  const float inv_b = 1.0f / b;
  return a * inv_b;
}
__device__ inline complex128 operator*(const complex128& a,
                                       const complex128& b) {
  return complex_multiply<double>(a, b);
}
__device__ inline complex128 operator*(const complex128& a, const double& b) {
  return complex128(Eigen::numext::real(a) * b, Eigen::numext::imag(a) * b);
}
__device__ inline complex128 operator/(const complex128& a, const double& b) {
  const double inv_b = 1.0 / b;
  return a * inv_b;
}
#endif
}  // namespace

// This kernel computes either determinant or log_abs_determinant, depending
// on the value of the template parameter. If compute_log_abs_det is false,
// the sign argument is ignored.
template <typename Scalar, bool compute_log_abs_det = true>
__global__ void DeterminantFromPivotedLUKernel(
    int nthreads, int n, const Scalar* __restrict__ lu_factor,
    const int* __restrict__ all_pivots, Scalar* __restrict__ sign,
    Scalar* __restrict__ log_abs_det) {
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
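  // Each n x n LU factor is stored contiguously, so consecutive diagonal
  // entries are n + 1 elements apart.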
  const int matrix_size = n * n;
  const int stride = n + 1;
  // We only parallelize over batches here. Performance is not critical,
  // since this cheap O(n) kernel always follows an O(n^3) LU factorization.
  // The main purpose is to avoid having to copy the LU decomposition to
  // host memory.
  GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
    // Initialize sign to (-1)^order.
    const int order = PermutationOrder(n, all_pivots + o_idx * n);
    Scalar prod_sign = order % 2 ? Scalar(-1) : Scalar(1);
    RealScalar sum_log_abs_det = RealScalar(0);
    int i_idx = matrix_size * o_idx;
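    // With P*A = L*U and a unit diagonal on L, the determinant is (-1)^order
    // times the product of the diagonal entries of U. Accumulate log|det| and
    // the unit-modulus sign factor from the diagonal of the packed LU factor.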
    for (int i = 0; i < n; ++i, i_idx += stride) {
      const RealScalar abs_i = Eigen::numext::abs(lu_factor[i_idx]);
      sum_log_abs_det += Eigen::numext::log(abs_i);
      prod_sign = prod_sign * (lu_factor[i_idx] / abs_i);
    }
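    // A zero or non-finite diagonal entry makes the sum non-finite; in that
    // case report a zero sign and push log_abs_det to +/-infinity, keeping the
    // sign of the accumulated sum.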
    if (!Eigen::numext::isfinite(sum_log_abs_det)) {
      prod_sign = Scalar(0);
      sum_log_abs_det = sum_log_abs_det > 0 ? -Eigen::numext::log(RealScalar(0))
                                            : Eigen::numext::log(RealScalar(0));
    }
    if (compute_log_abs_det) {
      sign[o_idx] = prod_sign;
      log_abs_det[o_idx] = Scalar(sum_log_abs_det);
    } else {
      log_abs_det[o_idx] = prod_sign * Eigen::numext::exp(sum_log_abs_det);
    }
  }
}

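// GPU specialization of DeterminantFromPivotedLUFunctor: computes the
// determinant of each matrix in the batch directly from its pivoted LU
// factorization, without copying the factorization back to the host.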
template <typename Scalar>
struct DeterminantFromPivotedLUFunctor<GPUDevice, Scalar> {
  void operator()(const GPUDevice& device,
                  typename TTypes<Scalar, 3>::ConstTensor lu_factor,
                  const int* pivots, typename TTypes<Scalar, 1>::Tensor output,
                  int* info) {
    const int64 num_matrices = output.size();
    const int64 n = lu_factor.dimension(2);
    GpuLaunchConfig config = GetGpuLaunchConfig(num_matrices, device);

    TF_CHECK_OK(GpuLaunchKernel(
        DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/false>,
        config.block_count, config.thread_per_block, 0, device.stream(),
        config.virtual_thread_count, n, lu_factor.data(), pivots, nullptr,
        output.data()));
  }
};

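// Explicit template instantiations for the supported scalar types.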
template struct DeterminantFromPivotedLUFunctor<GPUDevice, float>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, double>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, complex64>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, complex128>;

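// GPU specialization of LogDeterminantFromPivotedLUFunctor: computes the sign
// (a unit-modulus phase for complex types) and log|det| of each matrix in the
// batch from its pivoted LU factorization.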
template <typename Scalar>
struct LogDeterminantFromPivotedLUFunctor<GPUDevice, Scalar> {
  void operator()(const GPUDevice& device,
                  typename TTypes<Scalar, 3>::ConstTensor lu_factor,
                  const int* pivots, typename TTypes<Scalar, 1>::Tensor sign,
                  typename TTypes<Scalar, 1>::Tensor log_abs_det) {
    const int64 num_matrices = sign.size();
    const int64 n = lu_factor.dimension(2);
    GpuLaunchConfig config = GetGpuLaunchConfig(num_matrices, device);
    TF_CHECK_OK(GpuLaunchKernel(
        DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/true>,
        config.block_count, config.thread_per_block, 0, device.stream(),
        config.virtual_thread_count, n, lu_factor.data(), pivots, sign.data(),
        log_abs_det.data()));
  }
};

template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, float>;
template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, double>;
template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, complex64>;
template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, complex128>;

}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA