1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
17 #define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
18 
19 #if GOOGLE_CUDA
20 
21 #include <tuple>
22 #include <unordered_map>
23 #include "tensorflow/core/framework/op_kernel.h"
24 #include "tensorflow/core/kernels/gpu_utils.h"
25 #include "tensorflow/core/lib/gtl/inlined_vector.h"
26 #include "tensorflow/core/lib/hash/hash.h"
27 
28 namespace tensorflow {
29 
30 // Get the Dnn workspace limit from the environment variable, which is in MB.
31 // Return the workspace memory limit in bytes. If no value is set, return the
32 // default value.
33 int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
34                            int64 default_value_in_bytes);
35 
36 // A class to provide scratch-space allocator for Stream-Executor Cudnn
37 // callback. TensorFlow is responsible for releasing the temporary buffers after
38 // the kernel finishes.
39 class DnnScratchAllocator : public se::ScratchAllocator {
40  public:
~DnnScratchAllocator()41   virtual ~DnnScratchAllocator() {}
DnnScratchAllocator(int64 memory_limit,OpKernelContext * context)42   DnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
43       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
GetMemoryLimitInBytes(se::Stream * stream)44   int64 GetMemoryLimitInBytes(se::Stream* stream) override {
45     return memory_limit_;
46   }
AllocateBytes(se::Stream * stream,int64 byte_size)47   se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
48       se::Stream* stream, int64 byte_size) override {
49     Tensor temporary_memory;
50     if (byte_size < 0) {
51       return se::port::Status{se::port::error::INVALID_ARGUMENT,
52                               "Requested negative byte size!"};
53     }
54     if (byte_size > memory_limit_) {
55       return se::port::StatusOr<se::DeviceMemory<uint8>>();
56     }
57     AllocationAttributes allocation_attr;
58     allocation_attr.no_retry_on_failure = true;
59     Status allocation_status(context_->allocate_temp(
60         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
61         AllocatorAttributes(), allocation_attr));
62     if (!allocation_status.ok()) {
63       return se::port::StatusOr<se::DeviceMemory<uint8>>();
64     }
65     // Hold the reference of the allocated tensors until the end of the
66     // allocator.
67     allocated_tensors_.push_back(temporary_memory);
68     total_byte_size_ += byte_size;
69     return se::port::StatusOr<se::DeviceMemory<uint8>>(
70         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
71                        temporary_memory.flat<uint8>().size()));
72   }
TotalByteSize()73   int64 TotalByteSize() { return total_byte_size_; }
74 
75  private:
76   int64 memory_limit_;
77   int64 total_byte_size_;
78   OpKernelContext* context_;
79   std::vector<Tensor> allocated_tensors_;
80 };
81 
82 // Encapsulate all the shape information that is used in both forward and
83 // backward conv operations.
84 class ConvParameters {
85  public:
86   using SpatialArray = gtl::InlinedVector<int64, 3>;
ConvParameters(int64 batch,int64 in_depths,const SpatialArray & in,TensorFormat data_format,int64 out_depths,const SpatialArray & filter,const SpatialArray & dilation,const SpatialArray & stride,const SpatialArray & padding,DataType dtype,int device_id)87   ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
88                  TensorFormat data_format, int64 out_depths,
89                  const SpatialArray& filter, const SpatialArray& dilation,
90                  const SpatialArray& stride, const SpatialArray& padding,
91                  DataType dtype, int device_id)
92       : batch_(batch),
93         in_depths_(in_depths),
94         out_depths_(out_depths),
95         in_(CheckSpatialArraySize(in)),
96         data_format_(data_format),
97         filter_(CheckSpatialArraySize(filter)),
98         dilation_(CheckSpatialArraySize(dilation)),
99         stride_(CheckSpatialArraySize(stride)),
100         padding_(CheckSpatialArraySize(padding)),
101         dtype_(dtype),
102         device_id_(device_id) {
103     hash_code_ = batch;
104     hash_code_ = Hash64Combine(hash_code_, in_depths);
105     for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
106     hash_code_ = Hash64Combine(hash_code_, data_format);
107     hash_code_ = Hash64Combine(hash_code_, out_depths);
108     for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
109     for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
110     for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
111     for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
112     hash_code_ = Hash64Combine(hash_code_, dtype);
113     hash_code_ = Hash64Combine(hash_code_, device_id);
114   }
115   bool operator==(const ConvParameters& other) const {
116     return this->get_data_as_tuple() == other.get_data_as_tuple();
117   }
118 
119   bool operator!=(const ConvParameters& other) const {
120     return !(*this == other);
121   }
hash()122   uint64 hash() const { return hash_code_; }
123 
ToString()124   string ToString() const {
125     // clang-format off
126     return strings::StrCat(
127         batch_, ", ", in_depths_, ", ",
128         "(", str_util::Join(in_, ", "), "), ",
129         ::tensorflow::ToString(data_format_), ", ",
130         out_depths_, ", ",
131         "(", str_util::Join(filter_, ", "), "), ",
132         "(", str_util::Join(dilation_, ", "), "), ",
133         "(", str_util::Join(stride_, ", "), "), ",
134         "(", str_util::Join(padding_, ", "), "), ",
135         dtype_, ", ",
136         device_id_);
137     // clang-format on
138   }
139 
140   // The purpose of this function is to disable winograd nonfused conv algorithm
141   // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
142   template <typename T>
ShouldIncludeWinogradNonfusedAlgo(se::StreamExecutor * stream_exec)143   bool ShouldIncludeWinogradNonfusedAlgo(
144       se::StreamExecutor* stream_exec) const {
145     auto* dnn_support = stream_exec->AsDnn();
146     if (!dnn_support) {
147       return false;
148     }
149     // Skip this check for cuDNN 7 and newer.
150     auto version = dnn_support->GetVersion();
151     if (version.ok() && version.ValueOrDie().major_version() >= 7) {
152       return true;
153     }
154     return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
155   }
156 
157  protected:
158   using ParameterDataType =
159       std::tuple<int64, int64, SpatialArray, TensorFormat, int64, SpatialArray,
160                  SpatialArray, SpatialArray, SpatialArray, DataType, int>;
161 
get_data_as_tuple()162   ParameterDataType get_data_as_tuple() const {
163     return std::make_tuple(batch_, in_depths_, in_, data_format_, out_depths_,
164                            filter_, dilation_, stride_, padding_, dtype_,
165                            device_id_);
166   }
167 
168   uint64 hash_code_;
169 
170  private:
171   friend struct ConvParametersPeer;  // For testing purposes.
172 
CheckSpatialArraySize(const SpatialArray & array)173   static const SpatialArray& CheckSpatialArraySize(const SpatialArray& array) {
174     CHECK_LE(array.size(), 3);  // Catch corruptions related to b/124313574.
175     return array;
176   }
177 
178   template <typename T>
ShouldIncludeWinogradNonfusedAlgoPreCudnn7()179   bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
180     int64 total_size = 16 * std::ceil(batch_ / 16.0) *
181                        std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
182                        sizeof(T);
183     int64 threshold = 1LL << 31;
184     if (total_size >= threshold) {
185       return false;
186     } else {
187       return true;
188     }
189   }
190 
191   int64 batch_;
192   int64 in_depths_;
193   int64 out_depths_;
194   SpatialArray in_;
195   TensorFormat data_format_;
196   SpatialArray filter_;
197   SpatialArray dilation_;
198   SpatialArray stride_;
199   SpatialArray padding_;
200   DataType dtype_;
201   int device_id_;
202 };
203 
204 typedef Eigen::GpuDevice GPUDevice;
205 
206 }  // namespace tensorflow
207 
208 #endif  // GOOGLE_CUDA
209 
210 #endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
211