/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <complex>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/linalg/determinant_op.h"
#include "tensorflow/core/util/cuda_solvers.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {
namespace functor {

typedef Eigen::GpuDevice GPUDevice;
namespace {
__device__ int PermutationOrder(int n, const int* __restrict__ pivots) {
  // Compute the order of the permutation from the number of transpositions
  // encoded in the pivot array, see:
  // http://icl.cs.utk.edu/lapack-forum/viewtopic.php?f=2&t=340
  int order = 0;
  for (int i = 0; i < n - 1; ++i) {
    // Notice: Internally, the cuBlas code uses Fortran convention (1-based)
    // indexing so we expect pivots[i] == i + 1 for rows that were not moved.
    order += pivots[i] != (i + 1);
  }
  return order;
}
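
// Worked example (illustrative values, not from any particular
// factorization): with n = 3 and pivots = {2, 2, 3}, only pivots[0] differs
// from its 1-based index, so order = 1 and the permutation contributes a
// factor of (-1)^1 = -1 to the determinant.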

#if defined(__CUDACC__)
// Hack around missing support for complex arithmetic in NVCC: std::complex
// operators are not usable in device code, so build the ones we need from
// the real and imaginary parts.
template <typename T>
__device__ inline std::complex<T> complex_multiply(const std::complex<T>& a,
                                                   const std::complex<T>& b) {
  const T a_real = Eigen::numext::real(a);
  const T a_imag = Eigen::numext::imag(a);
  const T b_real = Eigen::numext::real(b);
  const T b_imag = Eigen::numext::imag(b);
  return std::complex<T>(a_real * b_real - a_imag * b_imag,
                         a_real * b_imag + a_imag * b_real);
}
__device__ inline complex64 operator*(const complex64& a, const complex64& b) {
  return complex_multiply<float>(a, b);
}
__device__ inline complex64 operator*(const complex64& a, const float& b) {
  return complex64(Eigen::numext::real(a) * b, Eigen::numext::imag(a) * b);
}
__device__ inline complex64 operator/(const complex64& a, const float& b) {
  // Divide by a real scalar with one reciprocal and a scalar multiply.
  const float inv_b = 1.0f / b;
  return a * inv_b;
}
__device__ inline complex128 operator*(const complex128& a,
                                       const complex128& b) {
  return complex_multiply<double>(a, b);
}
__device__ inline complex128 operator*(const complex128& a, const double& b) {
  return complex128(Eigen::numext::real(a) * b, Eigen::numext::imag(a) * b);
}
__device__ inline complex128 operator/(const complex128& a, const double& b) {
  const double inv_b = 1.0 / b;
  return a * inv_b;
}
#endif
}  // namespace

// This kernel computes either the determinant or the sign and log of the
// absolute value of the determinant, depending on the template parameter
// compute_log_abs_det. If compute_log_abs_det is false, the sign argument
// is ignored.
template <typename Scalar, bool compute_log_abs_det = true>
__global__ void DeterminantFromPivotedLUKernel(
    int nthreads, int n, const Scalar* __restrict__ lu_factor,
    const int* __restrict__ all_pivots, Scalar* __restrict__ sign,
    Scalar* __restrict__ log_abs_det) {
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  const int matrix_size = n * n;
  const int stride = n + 1;
  // We only parallelize over batches here. Performance is not critical,
  // since this cheap O(n) kernel always follows an O(n^3) LU factorization.
  // The main purpose is to avoid having to copy the LU decomposition to
  // host memory.
  GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
    // Initialize sign to (-1)^order.
    const int order = PermutationOrder(n, all_pivots + o_idx * n);
    Scalar prod_sign = order % 2 ? Scalar(-1) : Scalar(1);
    RealScalar sum_log_abs_det = RealScalar(0);
    int i_idx = matrix_size * o_idx;
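    // Walk the diagonal of U (stride n + 1 through the packed LU factor):
    // accumulate log|u_ii| and the unit-modulus factor u_ii / |u_ii|, which
    // for real Scalar types is just the sign of u_ii.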
    for (int i = 0; i < n; ++i, i_idx += stride) {
      const RealScalar abs_i = Eigen::numext::abs(lu_factor[i_idx]);
      sum_log_abs_det += Eigen::numext::log(abs_i);
      prod_sign = prod_sign * (lu_factor[i_idx] / abs_i);
    }
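    // A zero diagonal entry (singular input) drives sum_log_abs_det to -inf;
    // an infinite entry drives it to +inf. In either case the accumulated
    // sign is meaningless, so flush it to zero and saturate log_abs_det to
    // +inf or -inf (written as -log(0) and log(0) respectively).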
    if (!Eigen::numext::isfinite(sum_log_abs_det)) {
      prod_sign = Scalar(0);
      sum_log_abs_det = sum_log_abs_det > 0 ? -Eigen::numext::log(RealScalar(0))
                                            : Eigen::numext::log(RealScalar(0));
    }
    if (compute_log_abs_det) {
      sign[o_idx] = prod_sign;
      log_abs_det[o_idx] = Scalar(sum_log_abs_det);
    } else {
      log_abs_det[o_idx] = prod_sign * Eigen::numext::exp(sum_log_abs_det);
    }
  }
}

template <typename Scalar>
struct DeterminantFromPivotedLUFunctor<GPUDevice, Scalar> {
  void operator()(const GPUDevice& device,
                  typename TTypes<Scalar, 3>::ConstTensor lu_factor,
                  const int* pivots, typename TTypes<Scalar, 1>::Tensor output,
                  int* info) {
    const int64 num_matrices = output.size();
    const int64 n = lu_factor.dimension(2);
    GpuLaunchConfig config = GetGpuLaunchConfig(num_matrices, device);

    TF_CHECK_OK(GpuLaunchKernel(
        DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/false>,
        config.block_count, config.thread_per_block, 0, device.stream(),
        config.virtual_thread_count, n, lu_factor.data(), pivots, nullptr,
        output.data()));
  }
};
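
// A minimal usage sketch (hypothetical buffer names; in practice the
// determinant op obtains lu_factor and pivots from a batched LU
// factorization such as getrf):
//
//   functor::DeterminantFromPivotedLUFunctor<GPUDevice, float>()(
//       device, lu_factor /* [batch, n, n], LU-factored in place */,
//       pivots /* [batch * n], 1-based, on device */, output, info);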

template struct DeterminantFromPivotedLUFunctor<GPUDevice, float>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, double>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, complex64>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, complex128>;

template <typename Scalar>
struct LogDeterminantFromPivotedLUFunctor<GPUDevice, Scalar> {
  void operator()(const GPUDevice& device,
                  typename TTypes<Scalar, 3>::ConstTensor lu_factor,
                  const int* pivots, typename TTypes<Scalar, 1>::Tensor sign,
                  typename TTypes<Scalar, 1>::Tensor log_abs_det) {
    const int64 num_matrices = sign.size();
    const int64 n = lu_factor.dimension(2);
    GpuLaunchConfig config = GetGpuLaunchConfig(num_matrices, device);
    TF_CHECK_OK(GpuLaunchKernel(
        DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/true>,
        config.block_count, config.thread_per_block, 0, device.stream(),
        config.virtual_thread_count, n, lu_factor.data(), pivots, sign.data(),
        log_abs_det.data()));
  }
};

template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, float>;
template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, double>;
template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, complex64>;
template struct LogDeterminantFromPivotedLUFunctor<GPUDevice, complex128>;
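
// The two outputs follow the slogdet convention: callers recover the
// determinant as sign * exp(log_abs_det), which stays representable even
// when the determinant itself would overflow or underflow.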

}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA