1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/kernels/gpu_utils.h"
17 
18 #if GOOGLE_CUDA
19 
20 #include "google/protobuf/any.pb.h"
21 #include "tensorflow/core/framework/node_def.pb.h"
22 #include "tensorflow/core/framework/tensor.pb.h"
23 #include "tensorflow/core/platform/logger.h"
24 #include "tensorflow/core/protobuf/autotuning.pb.h"
25 #include "tensorflow/core/protobuf/conv_autotuning.pb.h"
26 #include "tensorflow/core/util/proto/proto_utils.h"
27 
28 namespace tensorflow {
29 namespace {
30 
GetCudnnVersion(se::StreamExecutor * stream_executor)31 tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
32   tensorflow::CudnnVersion cudnn_version;
33   if (auto* dnn = stream_executor->AsDnn()) {
34     se::port::StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
35     if (version_or.ok()) {
36       const auto& version = version_or.ValueOrDie();
37       cudnn_version.set_major(version.major_version());
38       cudnn_version.set_minor(version.minor_version());
39       cudnn_version.set_patch(version.patch());
40     }
41   }
42   return cudnn_version;
43 }
44 
GetComputeCapability(se::StreamExecutor * stream_executor)45 tensorflow::ComputeCapability GetComputeCapability(
46     se::StreamExecutor* stream_executor) {
47   tensorflow::ComputeCapability cc;
48   int cc_major, cc_minor;
49   stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
50                                                                   &cc_minor);
51   cc.set_major(cc_major);
52   cc.set_minor(cc_minor);
53   return cc;
54 }
55 
56 }  // namespace
57 
LogConvAutotuneResults(const NodeDef & node,const Tensor & input,const Tensor & filter,const Tensor & output,se::StreamExecutor * stream_exec,absl::Span<const AutotuneResult> results)58 void LogConvAutotuneResults(const NodeDef& node, const Tensor& input,
59                             const Tensor& filter, const Tensor& output,
60                             se::StreamExecutor* stream_exec,
61                             absl::Span<const AutotuneResult> results) {
62   AutotuningLog log;
63   ConvNodeDef instr;
64   *instr.mutable_conv() = node;
65   input.shape().AsProto(instr.mutable_input()->mutable_tensor_shape());
66   instr.mutable_input()->set_dtype(input.dtype());
67   filter.shape().AsProto(instr.mutable_filter()->mutable_tensor_shape());
68   instr.mutable_filter()->set_dtype(filter.dtype());
69   output.shape().AsProto(instr.mutable_output()->mutable_tensor_shape());
70   instr.mutable_output()->set_dtype(output.dtype());
71   log.mutable_instr()->PackFrom(std::move(instr));
72   *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
73   *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
74   for (const auto& result : results) {
75     *log.add_results() = result;
76   }
77   Logger::Singleton()->LogProto(log);
78 }
79 
LogFusedConvAutotuneResults(const NodeDef & node,const Tensor & input,const Tensor & filter,const Tensor & output,const Tensor & bias,const Tensor * side_input,se::StreamExecutor * stream_exec,absl::Span<const AutotuneResult> results)80 void LogFusedConvAutotuneResults(const NodeDef& node, const Tensor& input,
81                                  const Tensor& filter, const Tensor& output,
82                                  const Tensor& bias, const Tensor* side_input,
83                                  se::StreamExecutor* stream_exec,
84                                  absl::Span<const AutotuneResult> results) {
85   AutotuningLog log;
86   ConvNodeDef instr;
87   *instr.mutable_conv() = node;
88   input.shape().AsProto(instr.mutable_input()->mutable_tensor_shape());
89   instr.mutable_input()->set_dtype(input.dtype());
90   filter.shape().AsProto(instr.mutable_filter()->mutable_tensor_shape());
91   instr.mutable_filter()->set_dtype(filter.dtype());
92   output.shape().AsProto(instr.mutable_output()->mutable_tensor_shape());
93   instr.mutable_output()->set_dtype(output.dtype());
94   bias.shape().AsProto(instr.mutable_bias()->mutable_tensor_shape());
95   instr.mutable_bias()->set_dtype(bias.dtype());
96   if (side_input) {
97     side_input->shape().AsProto(
98         instr.mutable_side_input()->mutable_tensor_shape());
99     instr.mutable_side_input()->set_dtype(side_input->dtype());
100   }
101   log.mutable_instr()->PackFrom(std::move(instr));
102   *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
103   *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
104   for (const auto& result : results) {
105     *log.add_results() = result;
106   }
107   Logger::Singleton()->LogProto(log);
108 }
109 
BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,se::dnn::AlgorithmConfig * algo)110 Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
111                               se::dnn::AlgorithmConfig* algo) {
112   // For the "!xhs.has_success()" below, this is because we want successful ones
113   // to order first, therefore they need a smaller key per "min_element".
114   const AutotuneResult* best_result = std::min_element(
115       results.begin(), results.end(),
116       [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
117         return std::make_tuple(
118                    !lhs.has_success(),
119                    proto_utils::FromDurationProto(lhs.success().run_time())) <
120                std::make_tuple(
121                    !rhs.has_success(),
122                    proto_utils::FromDurationProto(rhs.success().run_time()));
123       });
124 
125   const AutotuneResult* best_result_no_scratch = std::min_element(
126       results.begin(), results.end(),
127       [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
128         return std::make_tuple(
129                    !lhs.has_success(), lhs.success().scratch_bytes(),
130                    proto_utils::FromDurationProto(lhs.success().run_time())) <
131                std::make_tuple(
132                    !rhs.has_success(), rhs.success().scratch_bytes(),
133                    proto_utils::FromDurationProto(rhs.success().run_time()));
134       });
135 
136   if (best_result == results.end() || !best_result->has_success()) {
137     return errors::NotFound("No algorithm worked!");
138   }
139   algo->set_algorithm({best_result->conv().algorithm(),
140                        best_result->conv().tensor_ops_enabled()});
141   if (best_result_no_scratch != results.end() &&
142       best_result_no_scratch->has_success() &&
143       best_result_no_scratch->success().scratch_bytes() == 0) {
144     algo->set_algorithm_no_scratch(
145         {best_result_no_scratch->conv().algorithm(),
146          best_result_no_scratch->conv().tensor_ops_enabled()});
147   }
148   return Status::OK();
149 }
150 
151 }  // namespace tensorflow
152 
153 #endif  // GOOGLE_CUDA
154