1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #if GOOGLE_CUDA
17 
18 #define EIGEN_USE_GPU
19 
20 #include <memory>
21 #include <vector>
22 
23 #include "tensorflow/core/framework/bfloat16.h"
24 #include "tensorflow/core/framework/register_types.h"
25 #include "tensorflow/core/framework/tensor_types.h"
26 #include "tensorflow/core/kernels/concat_lib_gpu.h"
27 #include "tensorflow/core/kernels/gpu_device_array_gpu.h"
28 #include "tensorflow/core/util/cuda_kernel_helper.h"
29 
30 namespace tensorflow {
31 
32 typedef Eigen::GpuDevice GPUDevice;
33 
34 namespace {
35 
36 template <typename T, typename IntType>
concat_fixed_kernel(GpuDeviceArrayStruct<const T * > input_ptr_data,int split_size,int total_rows,int total_cols,T * output)37 __global__ void concat_fixed_kernel(
38     GpuDeviceArrayStruct<const T*> input_ptr_data, int split_size,
39     int total_rows, int total_cols, T* output) {
40   const T** input_ptrs = GetGpuDeviceArrayOnDevice(&input_ptr_data);
41   IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
42 
43   for (; gidx < total_cols; gidx += blockDim.x * gridDim.x) {
44     IntType gidy = blockIdx.y * blockDim.y + threadIdx.y;
45 
46     IntType split = gidx / split_size;
47     const T* input_ptr = input_ptrs[split];
48     IntType col_offset = gidx % split_size;
49 #pragma unroll
50     for (; gidy < total_rows; gidy += blockDim.y * gridDim.y) {
51       output[gidy * total_cols + gidx] =
52           input_ptr[gidy * split_size + col_offset];
53     }
54   }
55 }
56 
57 }  // end namespace
58 
59 // cannot be in anonymous namespace due to extern shared memory
60 template <typename T, typename IntType, bool useSmem>
concat_variable_kernel(GpuDeviceArrayStruct<const T * > input_ptr_data,GpuDeviceArrayStruct<IntType> output_scan,IntType total_rows,IntType total_cols,T * output)61 __global__ void concat_variable_kernel(
62     GpuDeviceArrayStruct<const T*> input_ptr_data,
63     GpuDeviceArrayStruct<IntType> output_scan, IntType total_rows,
64     IntType total_cols, T* output) {
65   const T** input_ptrs = GetGpuDeviceArrayOnDevice(&input_ptr_data);
66   IntType* col_scan = GetGpuDeviceArrayOnDevice(&output_scan);
67 
68   // do upper_bound on col to find which pointer we should be using
69   IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
70   IntType num_inputs = input_ptr_data.size;
71 
72   // verbose declaration needed due to template
73   extern __shared__ __align__(sizeof(T)) unsigned char smem[];
74   IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);
75 
76   if (useSmem) {
77     IntType lidx = threadIdx.y * blockDim.x + threadIdx.x;
78     IntType blockSize = blockDim.x * blockDim.y;
79 
80     for (IntType i = lidx; i < output_scan.size; i += blockSize) {
81       smem_col_scan[i] = col_scan[i];
82     }
83 
84     __syncthreads();
85 
86     col_scan = smem_col_scan;
87   }
88 
89   // do an initial binary search and then scan linearly from there
90   // works well when there are many small segments and when the
91   // segments are much longer
92   IntType segment =
93       cuda_helper::upper_bound<IntType>(col_scan, num_inputs, gidx) - 1;
94 
95   IntType curr_offset = col_scan[segment];
96   IntType curr_segment = segment;
97   for (; gidx < total_cols; gidx += blockDim.x * gridDim.x) {
98     IntType curr_col_offset;
99     while ((curr_col_offset = col_scan[curr_segment + 1]) <= gidx) {
100       curr_offset = curr_col_offset;
101       ++curr_segment;
102     }
103 
104     IntType local_col = gidx - curr_offset;
105     IntType segment_width = curr_col_offset - curr_offset;
106     const T* input_ptr = input_ptrs[curr_segment];
107 
108     IntType gidy = blockIdx.y * blockDim.y + threadIdx.y;
109     for (; gidy < total_rows; gidy += blockDim.y * gridDim.y)
110       output[gidy * total_cols + gidx] =
111           input_ptr[gidy * segment_width + local_col];
112   }
113 }
114 
115 template <typename T, typename IntType>
ConcatGPUSlice(const Eigen::GpuDevice & gpu_device,const std::vector<std::unique_ptr<typename TTypes<T,2>::ConstMatrix>> & inputs_flat,typename TTypes<T,2>::Matrix * output)116 void ConcatGPUSlice(
117     const Eigen::GpuDevice& gpu_device,
118     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
119         inputs_flat,
120     typename TTypes<T, 2>::Matrix* output) {
121   Eigen::array<IntType, 2> offset{0, 0};
122   for (int i = 0; i < inputs_flat.size(); ++i) {
123     Eigen::array<IntType, 2> size;
124     size[0] = inputs_flat[i]->dimension(0);
125     size[1] = inputs_flat[i]->dimension(1);
126     if (std::is_same<IntType, int32>::value) {
127       To32Bit(*output).slice(offset, size).device(gpu_device) =
128           To32Bit(*inputs_flat[i]);
129     } else {
130       output->slice(offset, size).device(gpu_device) = *inputs_flat[i];
131     }
132 
133     offset[1] += size[1];
134   }
135 }
136 
137 template <typename T, typename IntType>
ConcatGPUImpl(const Eigen::GpuDevice & gpu_device,const GpuDeviceArrayStruct<const T * > & input_ptrs,const GpuDeviceArrayStruct<IntType> & output_scan,bool fixed_size,int split_size,typename TTypes<T,2>::Matrix * output)138 void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
139                    const GpuDeviceArrayStruct<const T*>& input_ptrs,
140                    const GpuDeviceArrayStruct<IntType>& output_scan,
141                    bool fixed_size, int split_size,
142                    typename TTypes<T, 2>::Matrix* output) {
143   auto config = GetCuda2DLaunchConfig(output->dimension(1),
144                                       output->dimension(0), gpu_device);
145 
146   if (fixed_size) {
147     concat_fixed_kernel<T, IntType>
148         <<<config.block_count, config.thread_per_block, 0,
149            gpu_device.stream()>>>(input_ptrs, split_size, output->dimension(0),
150                                   output->dimension(1), output->data());
151   } else {
152     IntType smem_max = gpu_device.sharedMemPerBlock();
153     IntType smem_usage = output_scan.size * sizeof(IntType);
154     // performance crossover is less than using maximum available shared memory
155     // on most processors
156     // possibly due to decreasing occupancy
157     // 4096 inputs is a lot, most code will take the smem path
158     const int32 kMaxSmemBytesPerformance = 16384;
159     if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance)
160       concat_variable_kernel<T, IntType, true>
161           <<<config.block_count, config.thread_per_block, smem_usage,
162              gpu_device.stream()>>>(input_ptrs, output_scan,
163                                     output->dimension(0), output->dimension(1),
164                                     output->data());
165     else
166       concat_variable_kernel<T, IntType, false>
167           <<<config.block_count, config.thread_per_block, 0,
168              gpu_device.stream()>>>(input_ptrs, output_scan,
169                                     output->dimension(0), output->dimension(1),
170                                     output->data());
171   }
172 }
173 
174 #define REGISTER_GPUCONCAT32(T)                                               \
175   template void ConcatGPUSlice<T, int32>(                                     \
176       const Eigen::GpuDevice& gpu_device,                                     \
177       const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \
178           inputs_flat,                                                        \
179       typename TTypes<T, 2>::Matrix* output);
180 
181 #define REGISTER_GPUCONCAT64(T)                                               \
182   template void ConcatGPUSlice<T, int64>(                                     \
183       const Eigen::GpuDevice& gpu_device,                                     \
184       const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \
185           inputs_flat,                                                        \
186       typename TTypes<T, 2>::Matrix* output);
187 
188 #define REGISTER_GPU32(T)                                              \
189   template void ConcatGPUImpl<T, int32>(                               \
190       const Eigen::GpuDevice& d,                                       \
191       const GpuDeviceArrayStruct<const T*>& input_ptrs,                \
192       const GpuDeviceArrayStruct<int32>& ptr_offsets, bool fixed_size, \
193       int split_size, typename TTypes<T, 2>::Matrix* output);
194 
195 #define REGISTER_GPU64(T)                                              \
196   template void ConcatGPUImpl<T, int64>(                               \
197       const Eigen::GpuDevice& d,                                       \
198       const GpuDeviceArrayStruct<const T*>& input_ptrs,                \
199       const GpuDeviceArrayStruct<int64>& ptr_offsets, bool fixed_size, \
200       int split_size, typename TTypes<T, 2>::Matrix* output);
201 
202 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
203 TF_CALL_complex64(REGISTER_GPUCONCAT32);
204 TF_CALL_complex128(REGISTER_GPUCONCAT32);
205 TF_CALL_int32(REGISTER_GPUCONCAT32);  // Needed for TensorLists.
206 TF_CALL_int64(REGISTER_GPUCONCAT32);
207 TF_CALL_int16(REGISTER_GPUCONCAT32);
208 TF_CALL_uint8(REGISTER_GPUCONCAT32);
209 REGISTER_GPUCONCAT32(bfloat16);
210 REGISTER_GPUCONCAT32(bool);
211 
212 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
213 TF_CALL_complex64(REGISTER_GPUCONCAT64);
214 TF_CALL_complex128(REGISTER_GPUCONCAT64);
215 TF_CALL_int32(REGISTER_GPUCONCAT64);  // Needed for TensorLists.
216 TF_CALL_int64(REGISTER_GPUCONCAT64);
217 TF_CALL_int16(REGISTER_GPUCONCAT64);
218 TF_CALL_uint8(REGISTER_GPUCONCAT64);
219 REGISTER_GPUCONCAT64(bfloat16);
220 REGISTER_GPUCONCAT64(bool);
221 
222 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
223 TF_CALL_complex64(REGISTER_GPU32);
224 TF_CALL_complex128(REGISTER_GPU32);
225 TF_CALL_int32(REGISTER_GPU32);  // Needed for TensorLists.
226 TF_CALL_int64(REGISTER_GPU32);
227 TF_CALL_int16(REGISTER_GPU32);
228 TF_CALL_uint8(REGISTER_GPU32);
229 REGISTER_GPU32(bfloat16);
230 REGISTER_GPU32(bool);
231 
232 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
233 TF_CALL_complex64(REGISTER_GPU64);
234 TF_CALL_complex128(REGISTER_GPU64);
235 TF_CALL_int32(REGISTER_GPU64);  // Needed for TensorLists.
236 TF_CALL_int64(REGISTER_GPU64);
237 TF_CALL_int16(REGISTER_GPU64);
238 TF_CALL_uint8(REGISTER_GPU64);
239 REGISTER_GPU64(bfloat16);
240 REGISTER_GPU64(bool);
241 
242 #undef REGISTER_GPUCONCAT32
243 #undef REGISTER_GPUCONCAT64
244 #undef REGISTER_GPU32
245 #undef REGISTER_GPU64
246 
247 }  // end namespace tensorflow
248 
249 #endif  // GOOGLE_CUDA
250