1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
17 #define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
18
19 #if GOOGLE_CUDA
20
21 #include <unordered_map>
22
23 #include "absl/types/span.h"
24 #include "tensorflow/core/framework/tensor.h"
25 #include "tensorflow/core/lib/core/status.h"
26 #include "tensorflow/core/lib/strings/str_util.h"
27 #include "tensorflow/core/lib/strings/strcat.h"
28 #include "tensorflow/core/lib/strings/stringprintf.h"
29 #include "tensorflow/core/platform/logging.h"
30 #include "tensorflow/core/platform/stream_executor.h"
31
32 namespace tensorflow {
33
34 class NodeDef;
35 class AutotuneResult;
36
37 template <typename T>
AsDeviceMemory(const T * cuda_memory,uint64 size)38 inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
39 se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
40 se::DeviceMemory<T> typed(wrapped);
41 return typed;
42 }
43
44 // A helper class that looks up the best autotuned config from parameters.
45 // Due to the noisy nature of autotune, especially with multiple devices, it
46 // only accepts a config if its margin exceeds a threshold.
47 // For the same shape configs, if a new best config matches the previous best,
48 // they get promoted; otherwise, the winner gets demoted. This process stops
49 // when the winner's score exceeds the threshold.
50 // In a bad case when two configs are very close to each other and flips
51 // back and forth randomly, the expected number of experiments before autotune
52 // settles is O(threshold ^ 2). So we recommend that number of warmup runs
53 // for any benchmarks.
54 template <typename Parameters, typename Config>
55 class AutoTuneMap {
56 public:
Find(const Parameters & params,Config * config)57 bool Find(const Parameters& params, Config* config) const {
58 mutex_lock lock(mu_);
59 auto iter = params_config_map_.find(params);
60 if (iter == params_config_map_.end() ||
61 (iter->second.score < min_score_threshold_ &&
62 iter->second.count <= max_autotune_count_)) {
63 return false;
64 }
65 *config = iter->second.config;
66 return true;
67 }
Insert(const Parameters & params,const Config & config)68 void Insert(const Parameters& params, const Config& config) {
69 mutex_lock lock(mu_);
70 auto iter = params_config_map_.find(params);
71 int new_score = 0;
72 if (iter == params_config_map_.end()) {
73 // Create a new entry if params is new.
74 VLOG(1) << GetActionSummary("creates", params, config);
75 params_config_map_.insert(
76 std::make_pair(params, ValueType{config, 1, 1}));
77 new_score = 1;
78 } else if (iter->second.score < min_score_threshold_ &&
79 iter->second.count <= max_autotune_count_) {
80 DCHECK_GT(iter->second.score, 0);
81 if (iter->second.config != config) {
82 // If it is different from the current winner, demotes the winner.
83 VLOG(1) << GetActionSummary("demotes", params, config);
84 new_score = --iter->second.score;
85 ++iter->second.count;
86 if (new_score <= 0) {
87 VLOG(1) << GetActionSummary("erases", params, config);
88 params_config_map_.erase(iter);
89 }
90 } else {
91 // If it is the same as the current winner, promotes the winner.
92 VLOG(1) << GetActionSummary("promotes", params, config);
93 new_score = ++iter->second.score;
94 ++iter->second.count;
95 }
96 }
97 if (new_score >= min_score_threshold_) {
98 VLOG(1) << GetActionSummary("accepts", params, config);
99 }
100 }
101
102 private:
AutoTuneMap(const string & name)103 AutoTuneMap(const string& name) : name_(name) {
104 min_score_threshold_ = 1;
105 int min_warmup_iterations = 10;
106 const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
107 if (threshold_str != nullptr) {
108 strings::safe_strto32(threshold_str, &min_score_threshold_);
109 }
110 const char* min_warmup_iteration_str =
111 getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
112 if (min_warmup_iteration_str != nullptr) {
113 strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
114 }
115 min_score_threshold_ = std::max(min_score_threshold_, 1);
116 max_autotune_count_ = std::max(
117 5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
118 }
119
120 template <class Group, class Params, class Cfg>
121 friend class AutoTuneSingleton;
122
123 struct Hasher {
operatorHasher124 std::size_t operator()(const Parameters& parameter) const {
125 return parameter.hash();
126 }
127 };
128
GetActionSummary(StringPiece action,const Parameters & params,const Config & config)129 string GetActionSummary(StringPiece action, const Parameters& params,
130 const Config& config) {
131 return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
132 string(action).c_str(), params.ToString().c_str(),
133 config.ToString().c_str());
134 }
135
136 mutable mutex mu_;
137 struct ValueType {
138 Config config;
139 int32 score;
140 int32 count;
141 };
142 std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
143 GUARDED_BY(mu_);
144 string name_;
145 int32 min_score_threshold_;
146 int32 max_autotune_count_;
147
148 TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap);
149 };
150
151 // A Singleton helper that manages the global autotune results by groups.
152 // The caller specified arbitrary Group type that can distinguish between
153 // different autotune results, even if their Parameters and Configs are the
154 // same.
155 template <class Group, typename Parameters, typename Config>
156 class AutoTuneSingleton {
157 public:
158 typedef AutoTuneMap<Parameters, Config> AutoTuneType;
GetInstance()159 static AutoTuneType* GetInstance() {
160 static AutoTuneType* instance = new AutoTuneType(Group::name());
161 return instance;
162 }
163 };
164
// Logs convolution autotune results to customized back-storage.
// `node` is the convolution op's NodeDef; `input`, `filter` and `output` are
// the operand tensors; `results` holds the per-algorithm autotune outcomes
// gathered on `stream_exec`'s device. (Implementation not visible here.)
void LogConvAutotuneResults(const NodeDef& node, const Tensor& input,
                            const Tensor& filter, const Tensor& output,
                            se::StreamExecutor* stream_exec,
                            absl::Span<const AutotuneResult> results);
170
// Logs fused convolution autotune results to customized back-storage.
// Same as LogConvAutotuneResults, with the fused op's `bias` tensor and an
// optional `side_input` (may be null) included in the logged record.
void LogFusedConvAutotuneResults(const NodeDef& node, const Tensor& input,
                                 const Tensor& filter, const Tensor& output,
                                 const Tensor& bias, const Tensor* side_input,
                                 se::StreamExecutor* stream_exec,
                                 absl::Span<const AutotuneResult> results);
177
// Returns the best algorithms for the config: one is the fastest overall, the
// other is the fastest with zero scratch space. Unsuccessful autotuning
// results are allowed and ignored.
Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
                              se::dnn::AlgorithmConfig* algo);
183
184 } // namespace tensorflow
185
186 #endif // GOOGLE_CUDA
187
188 #endif // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
189