1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
17 
18 #define EIGEN_USE_GPU
19 
20 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
21 #include "tensorflow/core/kernels/ops_util.h"
22 #include "tensorflow/core/kernels/transpose_functor.h"
23 #include "tensorflow/core/util/gpu_kernel_helper.h"
24 
25 // TODO(yangzihao): Remove the dependency of conv_2d.h once we move all
26 // GPU util functions and transpose kernels into separate files.
27 #include "tensorflow/core/kernels/conv_2d.h"
28 
29 typedef Eigen::GpuDevice GPUDevice;
30 
31 namespace tensorflow {
32 namespace internal {
33 
34 template <typename T, bool conjugate>
TransposeKernel(int nthreads,const T * __restrict__ src,const int32 * __restrict__ buf,const int32 ndims,T * __restrict__ dst)35 __global__ void TransposeKernel(int nthreads, const T* __restrict__ src,
36                                 const int32* __restrict__ buf,
37                                 const int32 ndims, T* __restrict__ dst) {
38   const int32* in_strides = buf;
39   const int32* out_strides = buf + ndims;
40   const int32* perm = buf + ndims * 2;
41   GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
42     int32 i_idx = 0;
43     int32 t = o_idx;
44     for (int32 i = 0; i < ndims; ++i) {
45       const int32 ratio = t / out_strides[i];
46       t -= ratio * out_strides[i];
47       i_idx += ratio * in_strides[perm[i]];
48     }
49     if (conjugate) {
50       dst[o_idx] = Eigen::numext::conj(ldg(src + i_idx));
51     } else {
52       dst[o_idx] = ldg(src + i_idx);
53     }
54   }
55 }
56 
57 template <typename T, bool conjugate>
TransposeSimple(const GPUDevice & d,const Tensor & in,const gtl::ArraySlice<int32> perm,Tensor * out)58 void TransposeSimple(const GPUDevice& d, const Tensor& in,
59                      const gtl::ArraySlice<int32> perm, Tensor* out) {
60   // Ensures we can use 32-bit index.
61   const int64 nelem = in.NumElements();
62   CHECK_LT(nelem, kint32max) << "Tensor too large to transpose on GPU";
63   // Pack strides and permutation into one buffer.
64   const int32 ndims = in.dims();
65   gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
66   gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
67   gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
68   // Dimension permutation.
69   for (int i = 0; i < ndims; ++i) {
70     host_buf[i] = in_strides[i];
71     host_buf[ndims + i] = out_strides[i];
72     host_buf[ndims * 2 + i] = perm[i];
73   }
74   // Copies the input strides, output strides and permutation to the device.
75   auto num_bytes = sizeof(int32) * host_buf.size();
76   auto dev_buf = d.allocate(num_bytes);
77   // NOTE: host_buf is not allocated by GpuHostAllocator, and
78   // therefore we are doing a sync copy effectively.
79   d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
80   // Launch kernel to q[...] = p[...].
81   const T* p = reinterpret_cast<const T*>(in.tensor_data().data());
82   T* q = reinterpret_cast<T*>(const_cast<char*>((out->tensor_data().data())));
83   GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d);
84   TF_CHECK_OK(GpuLaunchKernel(
85       TransposeKernel<T, conjugate>, cfg.block_count, cfg.thread_per_block, 0,
86       d.stream(), cfg.virtual_thread_count, p,
87       reinterpret_cast<const int32*>(dev_buf), ndims, q));
88   // Safe to deallocate immediately after the kernel launch.
89   d.deallocate(dev_buf);
90 }
91 
92 // TransposeUsingTile tries to reduce the dimension of the input tensor to 3 and
93 // then call special kernels to swap either dimension 1 and dimension 2 or
94 // dimension 0 and dimension 2. It returns true if the operation is success,
95 // false otherwise.
96 template <typename T, bool conjugate = false>
97 struct TransposeUsingTile {
runtensorflow::internal::TransposeUsingTile98   static bool run(const Eigen::GpuDevice& d, const Tensor& in,
99                   const gtl::ArraySlice<int32> perm, Tensor* out) {
100     // First try to reduce the dimensions of the input tensor.
101     TransposePermsVec new_perm;
102     TransposeDimsVec new_dims;
103     ReduceTransposeDimensions(in.shape(), perm, &new_perm, &new_dims);
104 
105     // Only use special GPU kernel when dimension is 2 or 3.
106     int dims = new_dims.size();
107     if (dims < 2 || dims > 3) return false;
108     auto in_data = reinterpret_cast<const T*>(in.tensor_data().data());
109     auto out_data =
110         reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data()));
111     switch (dims) {
112       case 2:
113         if (new_perm[0] == 1 && new_perm[1] == 0) {
114           // Add the first dimension size as 1.
115           new_dims.insert(new_dims.begin(), 1);
116           tensorflow::functor::SwapDimension1And2InTensor3<GPUDevice, T,
117                                                            conjugate>()(
118               d, in_data, new_dims, out_data);
119           return true;
120         }
121         break;
122       case 3:
123         if (new_perm == TransposePermsVec({0, 2, 1})) {
124           tensorflow::functor::SwapDimension1And2InTensor3<GPUDevice, T,
125                                                            conjugate>()(
126               d, in_data, new_dims, out_data);
127           return true;
128         } else if (new_perm == TransposePermsVec({2, 1, 0})) {
129           tensorflow::functor::SwapDimension0And2InTensor3<GPUDevice, T,
130                                                            conjugate>()(
131               d, in_data, new_dims, out_data);
132           return true;
133         } else {
134           // do not handle other 3D permutations
135           return false;
136         }
137         break;
138       default:
139         return false;
140     }
141     return false;
142   }
143 };
144 
145 template <bool conjugate>
146 struct TransposeUsingTile<complex64, conjugate> {
runtensorflow::internal::TransposeUsingTile147   static bool run(const Eigen::GpuDevice& d, const Tensor& in,
148                   const gtl::ArraySlice<int32> perm, Tensor* out) {
149     if (!conjugate) {
150       return TransposeUsingTile<uint64>::run(d, in, perm, out);
151     } else {
152       return TransposeUsingTile<float2, true>::run(d, in, perm, out);
153     }
154   }
155 };
156 
157 template <bool conjugate>
158 struct TransposeUsingTile<complex128, conjugate> {
runtensorflow::internal::TransposeUsingTile159   static bool run(const Eigen::GpuDevice& d, const Tensor& in,
160                   const gtl::ArraySlice<int32> perm, Tensor* out) {
161     if (!conjugate) {
162       return TransposeUsingTile<float4>::run(d, in, perm, out);
163     } else {
164       return TransposeUsingTile<double2, true>::run(d, in, perm, out);
165     }
166   }
167 };
168 
169 }  // namespace internal
170 
171 // Transpose kernel specialized for GPU Device.
172 #define HANDLE_DIM(DIM)                                                      \
173   case DIM:                                                                  \
174     internal::TransposeUsingEigen<GPUDevice, T, DIM>(d, in, perm, conjugate, \
175                                                      out);                   \
176     break
177 
178 template <typename T, bool conjugate>
179 struct Transpose<GPUDevice, T, conjugate> {
runtensorflow::Transpose180   static void run(const GPUDevice& d, const Tensor& in,
181                   const gtl::ArraySlice<int32> perm, Tensor* out) {
182     if (in.dims() < 2) return;
183     if (internal::TransposeUsingTile<T, conjugate>::run(d, in, perm, out)) {
184       return;
185     }
186 
187     switch (in.dims()) {
188       HANDLE_DIM(2);
189       HANDLE_DIM(3);
190       HANDLE_DIM(4);
191       HANDLE_DIM(5);
192       HANDLE_DIM(6);
193       HANDLE_DIM(7);
194       HANDLE_DIM(8);
195       default:
196         internal::TransposeSimple<T, conjugate>(d, in, perm, out);
197         break;
198     }
199   }
200 };
201 
202 #undef HANDLE_DIM
203 
204 template <bool conjugate>
205 struct Transpose<GPUDevice, tstring, conjugate> {
runtensorflow::Transpose206   static void run(const GPUDevice& d, const Tensor& in,
207                   const gtl::ArraySlice<int32> perm, Tensor* out) {
208     LOG(FATAL) << "Transpose of DT_STRING tensor not supported on GPU.";
209   }
210 };
211 
212 // Explicit instantiation.
213 template struct Transpose<GPUDevice, tstring, false>;
214 
215 template <>
DoTranspose(const GPUDevice & device,const Tensor & in,const gtl::ArraySlice<int32> perm,Tensor * out)216 Status DoTranspose(const GPUDevice& device, const Tensor& in,
217                    const gtl::ArraySlice<int32> perm, Tensor* out) {
218   return internal::DoTransposeImpl(device, in, perm, /*conjugate=*/false, out);
219 }
220 template <>
DoConjugateTranspose(const GPUDevice & device,const Tensor & in,const gtl::ArraySlice<int32> perm,Tensor * out)221 Status DoConjugateTranspose(const GPUDevice& device, const Tensor& in,
222                             const gtl::ArraySlice<int32> perm, Tensor* out) {
223   return internal::DoTransposeImpl(device, in, perm, /*conjugate=*/true, out);
224 }
225 template <>
DoMatrixTranspose(const GPUDevice & device,const Tensor & in,Tensor * out)226 Status DoMatrixTranspose(const GPUDevice& device, const Tensor& in,
227                          Tensor* out) {
228   return internal::DoMatrixTransposeImpl(device, in, /*conjugate=*/false, out);
229 }
230 template <>
DoConjugateMatrixTranspose(const GPUDevice & device,const Tensor & in,Tensor * out)231 Status DoConjugateMatrixTranspose(const GPUDevice& device, const Tensor& in,
232                                   Tensor* out) {
233   return internal::DoMatrixTransposeImpl(device, in, /*conjugate=*/true, out);
234 }
235 
236 }  // namespace tensorflow
237 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
238