/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
#define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_

#if GOOGLE_CUDA

#include <unordered_map>

#include "absl/types/span.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"

namespace tensorflow {

class NodeDef;
class AutotuneResult;

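// Wraps a raw pointer to `size` elements of GPU memory in a typed
// se::DeviceMemory<T>, the form expected by StreamExecutor routines.
//
// A minimal usage sketch (`dev_ptr` and `n` are hypothetical names):
//
//   const float* dev_ptr = ...;  // device buffer holding n floats
//   se::DeviceMemory<float> mem = AsDeviceMemory(dev_ptr, n);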
template <typename T>
inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
  se::DeviceMemory<T> typed(wrapped);
  return typed;
}

// A helper class that looks up the best autotuned config from parameters.
// Due to the noisy nature of autotune, especially with multiple devices, it
// only accepts a config if its margin exceeds a threshold.
// For configs with the same shape, if a new best config matches the previous
// best, it gets promoted; otherwise, the winner gets demoted. This process
// stops once the winner's score exceeds the threshold.
// In the bad case where two configs are very close and the winner flips back
// and forth randomly, the expected number of experiments before autotune
// settles is O(threshold ^ 2), so we recommend at least that many warmup runs
// for any benchmark.
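//
// For illustration only (a hypothetical run with TF_AUTOTUNE_THRESHOLD=2): a
// new config is inserted with score 1, so Find() does not return it yet; one
// more win for the same config promotes it to score 2 and it is accepted,
// while a single loss demotes it to score 0 and the entry is erased, so
// autotuning starts over for those parameters.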
template <typename Parameters, typename Config>
class AutoTuneMap {
 public:
  bool Find(const Parameters& params, Config* config) const {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    if (iter == params_config_map_.end() ||
        (iter->second.score < min_score_threshold_ &&
         iter->second.count <= max_autotune_count_)) {
      return false;
    }
    *config = iter->second.config;
    return true;
  }
  void Insert(const Parameters& params, const Config& config) {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    int new_score = 0;
    if (iter == params_config_map_.end()) {
      // Create a new entry if params is new.
      VLOG(1) << GetActionSummary("creates", params, config);
      params_config_map_.insert(
          std::make_pair(params, ValueType{config, 1, 1}));
      new_score = 1;
    } else if (iter->second.score < min_score_threshold_ &&
               iter->second.count <= max_autotune_count_) {
      DCHECK_GT(iter->second.score, 0);
      if (iter->second.config != config) {
        // If it is different from the current winner, demotes the winner.
        VLOG(1) << GetActionSummary("demotes", params, config);
        new_score = --iter->second.score;
        ++iter->second.count;
        if (new_score <= 0) {
          VLOG(1) << GetActionSummary("erases", params, config);
          params_config_map_.erase(iter);
        }
      } else {
        // If it is the same as the current winner, promotes the winner.
        VLOG(1) << GetActionSummary("promotes", params, config);
        new_score = ++iter->second.score;
        ++iter->second.count;
      }
    }
    if (new_score >= min_score_threshold_) {
      VLOG(1) << GetActionSummary("accepts", params, config);
    }
  }

 private:
  AutoTuneMap(const string& name) : name_(name) {
    min_score_threshold_ = 1;
    int min_warmup_iterations = 10;
    const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
    if (threshold_str != nullptr) {
      strings::safe_strto32(threshold_str, &min_score_threshold_);
    }
    const char* min_warmup_iteration_str =
        getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
    if (min_warmup_iteration_str != nullptr) {
      strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
    }
    min_score_threshold_ = std::max(min_score_threshold_, 1);
    max_autotune_count_ = std::max(
        5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
  }

  template <class Group, class Params, class Cfg>
  friend class AutoTuneSingleton;

  struct Hasher {
    std::size_t operator()(const Parameters& parameter) const {
      return parameter.hash();
    }
  };

  string GetActionSummary(StringPiece action, const Parameters& params,
                          const Config& config) {
    return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
                           string(action).c_str(), params.ToString().c_str(),
                           config.ToString().c_str());
  }

  mutable mutex mu_;
  struct ValueType {
    Config config;
    int32 score;
    int32 count;
  };
  std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
      GUARDED_BY(mu_);
  string name_;
  int32 min_score_threshold_;
  int32 max_autotune_count_;

  TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap);
};

// A Singleton helper that manages the global autotune results by groups.
// The caller specifies an arbitrary Group type that can distinguish between
// different autotune results, even if their Parameters and Configs are the
// same.
template <class Group, typename Parameters, typename Config>
class AutoTuneSingleton {
 public:
  typedef AutoTuneMap<Parameters, Config> AutoTuneType;
  static AutoTuneType* GetInstance() {
    static AutoTuneType* instance = new AutoTuneType(Group::name());
    return instance;
  }
};
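
// A minimal usage sketch. The group, parameter, and variable names below are
// hypothetical placeholders; real kernels define their own (for example, the
// cuDNN convolution kernels key their results on convolution parameters):
//
//   struct ConvAutoTuneGroup {
//     static string name() { return "Conv"; }
//   };
//   using AutoTuneConv = AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
//                                          se::dnn::AlgorithmConfig>;
//
//   se::dnn::AlgorithmConfig algorithm_config;
//   if (!AutoTuneConv::GetInstance()->Find(conv_parameters,
//                                          &algorithm_config)) {
//     // ... time the candidate algorithms and pick the best one ...
//     AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config);
//   }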

// Logs convolution results to customized back-storage.
void LogConvAutotuneResults(const NodeDef& node, const Tensor& input,
                            const Tensor& filter, const Tensor& output,
                            se::StreamExecutor* stream_exec,
                            absl::Span<const AutotuneResult> results);

// Logs fused convolution results to customized back-storage.
void LogFusedConvAutotuneResults(const NodeDef& node, const Tensor& input,
                                 const Tensor& filter, const Tensor& output,
                                 const Tensor& bias, const Tensor* side_input,
                                 se::StreamExecutor* stream_exec,
                                 absl::Span<const AutotuneResult> results);

// Returns the best algorithms for the config: one is the fastest overall, the
// other is the fastest with 0 scratch space. Unsuccessful autotuning results
// are allowed and ignored.
Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
                              se::dnn::AlgorithmConfig* algo);
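
// A minimal usage sketch (assuming `results` was collected by an autotuning
// loop like the one sketched above):
//
//   se::dnn::AlgorithmConfig algorithm_config;
//   TF_RETURN_IF_ERROR(BestCudnnConvAlgorithm(results, &algorithm_config));
//   // algorithm_config now holds the overall fastest algorithm and the
//   // fastest algorithm that requires no scratch space.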

}  // namespace tensorflow

#endif  // GOOGLE_CUDA

#endif  // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_