1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
17 #define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
18 
19 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
20 
21 #include <tuple>
22 #include <unordered_map>
23 
24 #include "absl/strings/str_cat.h"
25 #include "tensorflow/core/framework/op_kernel.h"
26 #include "tensorflow/core/kernels/gpu_utils.h"
27 #include "tensorflow/core/lib/gtl/inlined_vector.h"
28 #include "tensorflow/core/lib/hash/hash.h"
29 #include "tensorflow/core/util/tensor_format.h"
30 
31 namespace tensorflow {
32 
33 // Returns true if the given StreamExecutor is for a Volta or newer nvidia GPU.
IsVoltaOrLater(const se::StreamExecutor & stream_exec)34 inline bool IsVoltaOrLater(const se::StreamExecutor& stream_exec) {
35   int major, minor;
36   CHECK(stream_exec  // Crash OK
37             .GetDeviceDescription()
38             .cuda_compute_capability(&major, &minor));
39   return major >= 7;
40 }
41 
42 // Get the Dnn workspace limit from the environment variable, which is in MB.
43 // Return the workspace memory limit in bytes. If no value is set, return the
44 // default value.
45 int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
46                            int64 default_value_in_bytes);
47 
48 // A class to provide scratch-space allocator for Stream-Executor Cudnn
49 // callback. TensorFlow is responsible for releasing the temporary buffers after
50 // the kernel finishes.
51 class DnnScratchAllocator : public se::ScratchAllocator {
52  public:
~DnnScratchAllocator()53   virtual ~DnnScratchAllocator() {}
DnnScratchAllocator(int64 memory_limit,OpKernelContext * context)54   DnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
55       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
GetMemoryLimitInBytes()56   int64 GetMemoryLimitInBytes() override { return memory_limit_; }
AllocateBytes(int64 byte_size)57   se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
58       int64 byte_size) override {
59     Tensor temporary_memory;
60     if (byte_size < 0) {
61       return se::port::Status{se::port::error::INVALID_ARGUMENT,
62                               "Requested negative byte size!"};
63     }
64     if (byte_size > memory_limit_) {
65       return se::port::Status{se::port::error::UNAVAILABLE,
66                               absl::StrCat("Requested memory size (", byte_size,
67                                            ") exceeds the max memory limit (",
68                                            memory_limit_, ").")};
69     }
70     AllocationAttributes allocation_attr;
71     allocation_attr.retry_on_failure = false;
72     Status allocation_status(context_->allocate_temp(
73         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
74         AllocatorAttributes(), allocation_attr));
75     if (!allocation_status.ok()) {
76       return se::port::Status{
77           se::port::error::UNAVAILABLE,
78           absl::StrCat("Failed to allocate the requested memory size (",
79                        byte_size, ").")};
80     }
81     // Hold the reference of the allocated tensors until the end of the
82     // allocator.
83     allocated_tensors_.push_back(temporary_memory);
84     total_byte_size_ += byte_size;
85     return se::port::StatusOr<se::DeviceMemory<uint8>>(
86         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
87                        temporary_memory.flat<uint8>().size()));
88   }
TotalByteSize()89   int64 TotalByteSize() { return total_byte_size_; }
90 
91  private:
92   int64 memory_limit_;
93   int64 total_byte_size_;
94   OpKernelContext* context_;
95   std::vector<Tensor> allocated_tensors_;
96 };
97 
98 // Encapsulate all the shape information that is used in both forward and
99 // backward conv operations.
100 class ConvParameters {
101  public:
102   using SpatialArray = gtl::InlinedVector<int64, 3>;
103   ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
104                  TensorFormat data_format, int64 out_depths,
105                  const SpatialArray& filter, const SpatialArray& dilation,
106                  const SpatialArray& stride, const SpatialArray& padding,
107                  DataType dtype, int device_id, int group_count = 1)
batch_(batch)108       : batch_(batch),
109         in_depths_(in_depths),
110         out_depths_(out_depths),
111         in_(CheckSpatialArraySize(in)),
112         data_format_(data_format),
113         filter_(CheckSpatialArraySize(filter)),
114         dilation_(CheckSpatialArraySize(dilation)),
115         stride_(CheckSpatialArraySize(stride)),
116         padding_(CheckSpatialArraySize(padding)),
117         dtype_(dtype),
118         device_id_(device_id),
119         group_count_(group_count) {
120     hash_code_ = batch;
121     hash_code_ = Hash64Combine(hash_code_, in_depths);
122     for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
123     hash_code_ = Hash64Combine(hash_code_, data_format);
124     hash_code_ = Hash64Combine(hash_code_, out_depths);
125     for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
126     for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
127     for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
128     for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
129     hash_code_ = Hash64Combine(hash_code_, dtype);
130     hash_code_ = Hash64Combine(hash_code_, device_id);
131     hash_code_ = Hash64Combine(hash_code_, group_count);
132   }
133 
134   bool operator==(const ConvParameters& other) const {
135     return this->get_data_as_tuple() == other.get_data_as_tuple();
136   }
137 
138   bool operator!=(const ConvParameters& other) const {
139     return !(*this == other);
140   }
hash()141   uint64 hash() const { return hash_code_; }
142 
ToString()143   string ToString() const {
144     // clang-format off
145     return strings::StrCat(
146         batch_, ", ", in_depths_, ", ",
147         "(", str_util::Join(in_, ", "), "), ",
148         ::tensorflow::ToString(data_format_), ", ",
149         out_depths_, ", ",
150         "(", str_util::Join(filter_, ", "), "), ",
151         "(", str_util::Join(dilation_, ", "), "), ",
152         "(", str_util::Join(stride_, ", "), "), ",
153         "(", str_util::Join(padding_, ", "), "), ",
154         dtype_, ", ",
155         device_id_,
156         group_count_);
157     // clang-format on
158   }
159 
160   // The purpose of this function is to disable winograd nonfused conv algorithm
161   // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
162   template <typename T>
ShouldIncludeWinogradNonfusedAlgo(se::StreamExecutor * stream_exec)163   bool ShouldIncludeWinogradNonfusedAlgo(
164       se::StreamExecutor* stream_exec) const {
165     auto* dnn_support = stream_exec->AsDnn();
166     if (!dnn_support) {
167       return false;
168     }
169     // Skip this check for cuDNN 7 and newer.
170     auto version = dnn_support->GetVersion();
171     if (version.ok() && version.ValueOrDie().major_version() >= 7) {
172       return true;
173     }
174     return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
175   }
176 
177  protected:
178   using ParameterDataType =
179       std::tuple<int64, int64, SpatialArray, TensorFormat, int64, SpatialArray,
180                  SpatialArray, SpatialArray, SpatialArray, DataType, int, int>;
181 
get_data_as_tuple()182   ParameterDataType get_data_as_tuple() const {
183     return std::make_tuple(batch_, in_depths_, in_, data_format_, out_depths_,
184                            filter_, dilation_, stride_, padding_, dtype_,
185                            device_id_, group_count_);
186   }
187 
188   uint64 hash_code_;
189 
190  private:
191   friend struct ConvParametersPeer;  // For testing purposes.
192 
CheckSpatialArraySize(const SpatialArray & array)193   static const SpatialArray& CheckSpatialArraySize(const SpatialArray& array) {
194     CHECK_LE(array.size(), 3);  // Catch corruptions related to b/124313574.
195     return array;
196   }
197 
198   template <typename T>
ShouldIncludeWinogradNonfusedAlgoPreCudnn7()199   bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
200     int64 total_size = 16 * std::ceil(batch_ / 16.0) *
201                        std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
202                        sizeof(T);
203     int64 threshold = 1LL << 31;
204     if (total_size >= threshold) {
205       return false;
206     } else {
207       return true;
208     }
209   }
210 
211   int64 batch_;
212   int64 in_depths_;
213   int64 out_depths_;
214   SpatialArray in_;
215   TensorFormat data_format_;
216   SpatialArray filter_;
217   SpatialArray dilation_;
218   SpatialArray stride_;
219   SpatialArray padding_;
220   DataType dtype_;
221   int device_id_;
222   int group_count_;
223 };
224 
225 typedef Eigen::GpuDevice GPUDevice;
226 
227 }  // namespace tensorflow
228 
229 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
230 
231 #endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
232