1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
17 #define TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
18 
19 #if GOOGLE_CUDA
20 
21 #include <algorithm>
22 
23 #include "absl/base/casts.h"
24 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
25 #include "tensorflow/core/framework/op_kernel.h"
26 #include "tensorflow/core/platform/logging.h"
27 #include "tensorflow/core/platform/stream_executor.h"
28 #include "tensorflow/core/platform/types.h"
29 
30 // Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
31 // GetCuda3DLaunchConfig:
32 //
33 // There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
34 // version uses heuristics without any knowledge of the device kernel, the other
35 // version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
36 // launch parameters that maximize occupancy. Currently, only the maximum
37 // occupancy version of GetCuda3DLaunchConfig is available.
38 //
39 // For large number of work elements, the convention is that each kernel would
40 // iterate through its assigned range. The return value of GetCudaLaunchConfig
41 // is struct CudaLaunchConfig, which contains all the information needed for the
42 // kernel launch, including: virtual number of threads, the number of threads
43 // per block and number of threads per block used inside <<< >>> of a kernel
44 // launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
45 // as CudaLaunchConfig. The only difference is the dimension. The macros
46 // CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
47 //
48 /* Sample code:
49 
50 __global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
51   CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
52     do_your_job_here;
53   }
54 }
55 
56 __global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
57   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
58     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
59       do_your_job_here;
60     }
61   }
62 }
63 
64 __global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
65   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
66     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
67       CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
68         do_your_job_here;
69       }
70     }
71   }
72 }
73 
74 void MyDriverFunc(const Eigen::GpuDevice &d) {
75   // use heuristics
76   CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
77   MyKernel1D <<<config.block_count,
78                 config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
79   Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
80   MyKernel2D <<<config.block_count,
81                 config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
82   Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
83   MyKernel3D <<<config.block_count,
84                 config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
85 
86   // maximize occupancy
87   CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
88   MyKernel1D <<<config.block_count,
89                 config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
90   Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
91                                                   MyKernel1D, 0, 0);
92   MyKernel2D <<<config.block_count,
93                 config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
94   Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
95                                                   MyKernel1D, 0, 0);
96   MyKernel3D <<<config.block_count,
97                 config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
98 }
99 
100 // See the test for this for more example:
101 //
102 https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
103 
104 */
105 
106 namespace tensorflow {
107 
DivUp(int a,int b)108 inline int DivUp(int a, int b) { return (a + b - 1) / b; }
109 
110 struct CudaLaunchConfig {
111   // Logical number of thread that works on the elements. If each logical
112   // thread works on exactly a single element, this is the same as the working
113   // element count.
114   int virtual_thread_count = -1;
115   // Number of threads per block.
116   int thread_per_block = -1;
117   // Number of blocks for Cuda kernel launch.
118   int block_count = -1;
119 };
120 
121 // Calculate the Cuda launch config we should use for a kernel launch.
122 // This is assuming the kernel is quite simple and will largely be
123 // memory-limited.
124 // REQUIRES: work_element_count > 0.
GetCudaLaunchConfig(int work_element_count,const Eigen::GpuDevice & d)125 inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
126                                             const Eigen::GpuDevice& d) {
127   CHECK_GT(work_element_count, 0);
128   CudaLaunchConfig config;
129   const int virtual_thread_count = work_element_count;
130   const int physical_thread_count = std::min(
131       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
132       virtual_thread_count);
133   const int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
134   const int block_count =
135       std::min(DivUp(physical_thread_count, thread_per_block),
136                d.getNumGpuMultiProcessors());
137 
138   config.virtual_thread_count = virtual_thread_count;
139   config.thread_per_block = thread_per_block;
140   config.block_count = block_count;
141   return config;
142 }
143 
144 // Calculate the Cuda launch config we should use for a kernel launch. This
145 // variant takes the resource limits of func into account to maximize occupancy.
146 // REQUIRES: work_element_count > 0.
147 template <typename DeviceFunc>
GetCudaLaunchConfig(int work_element_count,const Eigen::GpuDevice & d,DeviceFunc func,size_t dynamic_shared_memory_size,int block_size_limit)148 inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
149                                             const Eigen::GpuDevice& d,
150                                             DeviceFunc func,
151                                             size_t dynamic_shared_memory_size,
152                                             int block_size_limit) {
153   CHECK_GT(work_element_count, 0);
154   CudaLaunchConfig config;
155   int block_count = 0;
156   int thread_per_block = 0;
157 
158   cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
159       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
160       block_size_limit);
161   CHECK_EQ(err, cudaSuccess);
162 
163   block_count =
164       std::min(block_count, DivUp(work_element_count, thread_per_block));
165 
166   config.virtual_thread_count = work_element_count;
167   config.thread_per_block = thread_per_block;
168   config.block_count = block_count;
169   return config;
170 }
171 
172 // Calculate the Cuda launch config we should use for a kernel launch. This
173 // variant takes the resource limits of func into account to maximize occupancy.
174 // The returned launch config has thread_per_block set to fixed_block_size.
175 // REQUIRES: work_element_count > 0.
176 template <typename DeviceFunc>
GetCudaLaunchConfigFixedBlockSize(int work_element_count,const Eigen::GpuDevice & d,DeviceFunc func,size_t dynamic_shared_memory_size,int fixed_block_size)177 inline CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
178     int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
179     size_t dynamic_shared_memory_size, int fixed_block_size) {
180   CHECK_GT(work_element_count, 0);
181   CudaLaunchConfig config;
182   int block_count = 0;
183 
184   cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
185       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
186   CHECK_EQ(err, cudaSuccess);
187   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
188                          DivUp(work_element_count, fixed_block_size));
189 
190   config.virtual_thread_count = work_element_count;
191   config.thread_per_block = fixed_block_size;
192   config.block_count = block_count;
193   return config;
194 }
195 
196 struct Cuda2DLaunchConfig {
197   dim3 virtual_thread_count = dim3(0, 0, 0);
198   dim3 thread_per_block = dim3(0, 0, 0);
199   dim3 block_count = dim3(0, 0, 0);
200 };
201 
GetCuda2DLaunchConfig(int xdim,int ydim,const Eigen::GpuDevice & d)202 inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
203                                                 const Eigen::GpuDevice& d) {
204   Cuda2DLaunchConfig config;
205 
206   if (xdim <= 0 || ydim <= 0) {
207     return config;
208   }
209 
210   const int kThreadsPerBlock = 256;
211   int block_cols = std::min(xdim, kThreadsPerBlock);
212   // ok to round down here and just do more loops in the kernel
213   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
214 
215   const int physical_thread_count =
216       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
217 
218   const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
219 
220   config.virtual_thread_count = dim3(xdim, ydim, 1);
221   config.thread_per_block = dim3(block_cols, block_rows, 1);
222 
223   int grid_x = std::min(DivUp(xdim, block_cols), max_blocks);
224 
225   config.block_count = dim3(
226       grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
227   return config;
228 }
229 
230 // Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
231 // This variant takes the resource limits of func into account to maximize
232 // occupancy.
233 using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
234 
235 template <typename DeviceFunc>
GetCuda3DLaunchConfig(int xdim,int ydim,int zdim,const Eigen::GpuDevice & d,DeviceFunc func,size_t dynamic_shared_memory_size,int block_size_limit)236 inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
237     int xdim, int ydim, int zdim, const Eigen::GpuDevice& d, DeviceFunc func,
238     size_t dynamic_shared_memory_size, int block_size_limit) {
239   Cuda3DLaunchConfig config;
240 
241   if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
242     return config;
243   }
244 
245   int dev;
246   cudaGetDevice(&dev);
247   cudaDeviceProp deviceProp;
248   cudaGetDeviceProperties(&deviceProp, dev);
249   int xthreadlimit = deviceProp.maxThreadsDim[0];
250   int ythreadlimit = deviceProp.maxThreadsDim[1];
251   int zthreadlimit = deviceProp.maxThreadsDim[2];
252   int xgridlimit = deviceProp.maxGridSize[0];
253   int ygridlimit = deviceProp.maxGridSize[1];
254   int zgridlimit = deviceProp.maxGridSize[2];
255 
256   int block_count = 0;
257   int thread_per_block = 0;
258   cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
259       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
260       block_size_limit);
261   CHECK_EQ(err, cudaSuccess);
262 
263   int threadsx = std::min({xdim, thread_per_block, xthreadlimit});
264   int threadsy =
265       std::min({ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit});
266   int threadsz =
267       std::min({zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
268                 zthreadlimit});
269 
270   int blocksx = std::min({block_count, DivUp(xdim, threadsx), xgridlimit});
271   int blocksy = std::min(
272       {DivUp(block_count, blocksx), DivUp(ydim, threadsy), ygridlimit});
273   int blocksz = std::min({DivUp(block_count, (blocksx * blocksy)),
274                           DivUp(zdim, threadsz), zgridlimit});
275 
276   config.virtual_thread_count = dim3(xdim, ydim, zdim);
277   config.thread_per_block = dim3(threadsx, threadsy, threadsz);
278   config.block_count = dim3(blocksx, blocksy, blocksz);
279   return config;
280 }
281 
282 template <typename DeviceFunc>
GetCuda2DLaunchConfig(int xdim,int ydim,const Eigen::GpuDevice & d,DeviceFunc func,size_t dynamic_shared_memory_size,int block_size_limit)283 inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
284     int xdim, int ydim, const Eigen::GpuDevice& d, DeviceFunc func,
285     size_t dynamic_shared_memory_size, int block_size_limit) {
286   return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
287                                dynamic_shared_memory_size, block_size_limit);
288 }
289 
290 // Returns a raw reference to the current cuda stream.  Required by a
291 // number of kernel calls (for which StreamInterface* does not work), i.e.
292 // CUB and certain cublas primitives.
GetCudaStream(OpKernelContext * context)293 inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
294   const cudaStream_t* ptr = CHECK_NOTNULL(
295       reinterpret_cast<const cudaStream_t*>(context->op_device_context()
296                                                 ->stream()
297                                                 ->implementation()
298                                                 ->GpuStreamMemberHack()));
299   return *ptr;
300 }
301 
302 namespace detail {
303 template <typename... Ts, size_t... Is>
GetArrayOfElementPointersImpl(std::tuple<Ts...> * tuple,absl::index_sequence<Is...>)304 std::array<void*, sizeof...(Ts)> GetArrayOfElementPointersImpl(
305     std::tuple<Ts...>* tuple, absl::index_sequence<Is...>) {
306   return {{&std::get<Is>(*tuple)...}};
307 }
308 // Returns an array of void pointers to the elements of the given tuple.
309 template <typename... Ts>
GetArrayOfElementPointers(std::tuple<Ts...> * tuple)310 std::array<void*, sizeof...(Ts)> GetArrayOfElementPointers(
311     std::tuple<Ts...>* tuple) {
312   return GetArrayOfElementPointersImpl(tuple,
313                                        absl::index_sequence_for<Ts...>{});
314 }
315 
316 template <bool...>
317 struct BoolPack;
318 template <bool... Bs>
319 using NoneTrue = std::is_same<BoolPack<Bs..., false>, BoolPack<false, Bs...>>;
320 // Returns whether none of the types in Ts is a reference.
321 template <typename... Ts>
NoneIsReference()322 constexpr bool NoneIsReference() {
323   return NoneTrue<(std::is_reference<Ts>::value)...>::value;
324 }
325 }  // namespace detail
326 
327 // Launches a CUDA kernel through cudaLaunchKernel with the given arguments.
328 //
329 // The kernel parameters 'Ts' must be constructible from the arguments 'Args'.
330 template <typename... Ts, typename... Args>
CudaLaunchKernel(void (* function)(Ts...),dim3 grid_dim,dim3 block_dim,size_t shared_memory_size_bytes,cudaStream_t stream,Args...arguments)331 Status CudaLaunchKernel(void (*function)(Ts...), dim3 grid_dim, dim3 block_dim,
332                         size_t shared_memory_size_bytes, cudaStream_t stream,
333                         Args... arguments) {
334   static_assert(detail::NoneIsReference<Ts...>(),
335                 "Kernels with reference arguments have undefined behaviour.");
336   // Cast arguments and forward them as an array of pointers.
337   auto args_tuple = std::tuple<Ts...>(arguments...);
338   auto arg_ptrs = detail::GetArrayOfElementPointers(&args_tuple);
339   auto func_ptr = absl::bit_cast<const void*>(function);
340   auto result = cudaLaunchKernel(func_ptr, grid_dim, block_dim, arg_ptrs.data(),
341                                  shared_memory_size_bytes, stream);
342   if (result != cudaSuccess) {
343     return errors::Internal(cudaGetErrorString(result));
344   }
345   return Status::OK();
346 }
347 
348 }  // namespace tensorflow
349 
350 #endif  // GOOGLE_CUDA
351 
352 #endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
353