1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
17 #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
18 
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
22 
23 #include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
24 #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
25 #include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h"
26 #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
27 #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
28 #include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"
29 #include "tensorflow/core/framework/resource_mgr.h"
30 #include "tensorflow/core/lib/core/errors.h"
31 
32 #if GOOGLE_CUDA && GOOGLE_TENSORRT
33 #include "third_party/tensorrt/NvInfer.h"
34 #endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
35 
36 namespace tensorflow {
37 namespace tensorrt {
38 
39 template <class Key, class Value, class HashFunction>
40 class LRUCache {
41  public:
42   typedef Value value_type;
43   typedef Key key_type;
44   typedef HashFunction hasher;
45   typedef typename std::unordered_map<key_type, value_type, hasher> map_type;
46   typedef typename map_type::iterator iterator;
47   typedef typename map_type::const_iterator const_iterator;
48 
LRUCache()49   LRUCache() : capacity_(0) {}
LRUCache(size_t capacity)50   explicit LRUCache(size_t capacity) : capacity_(capacity) {}
51 
capacity()52   size_t capacity() const { return capacity_; }
53 
reserve(size_t capacity)54   void reserve(size_t capacity) {
55     capacity_ = capacity;
56     DiscardOld();
57   }
58 
size()59   size_t size() const { return objects_.size(); }
60 
count(const key_type & key)61   size_t count(const key_type& key) const { return objects_.count(key); }
62 
at(const key_type & key)63   value_type& at(const key_type& key) { return Touch(key); }
64 
begin()65   const_iterator begin() const { return objects_.begin(); }
end()66   const_iterator end() const { return objects_.end(); }
67 
begin()68   iterator begin() { return objects_.begin(); }
end()69   iterator end() { return objects_.end(); }
70 
71   template <typename... Args>
emplace(Args &&...args)72   std::pair<iterator, bool> emplace(Args&&... args) {
73     DiscardOld(1);
74     std::pair<iterator, bool> result =
75         objects_.emplace(std::forward<Args>(args)...);
76     key_type key = result.first->first;
77     if (result.second) {
78       keys_.push_front(key);
79     } else {
80       TouchNoCheck(key);  // The key must exist in this case.
81     }
82     return result;
83   }
84 
85  private:
86   std::unordered_map<key_type, value_type, hasher> objects_;
87   std::list<key_type> keys_;
88   size_t capacity_;
89   value_type not_found_value_;
90 
Touch(const key_type & key)91   value_type& Touch(const key_type& key) {
92     // Check that the key exists, and let it return std::out_of_range error if
93     // not.
94     value_type& value = objects_.at(key);
95     TouchNoCheck(key);
96     return value;
97   }
98 
TouchNoCheck(const key_type & key)99   void TouchNoCheck(const key_type& key) {
100     auto rank = std::find(keys_.begin(), keys_.end(), key);
101     if (rank != keys_.begin()) {
102       keys_.erase(rank);
103       keys_.push_front(key);
104     }
105   }
106 
107   // Creates n free positions in cache
108   void DiscardOld(size_t n = 0) {
109     DCHECK(capacity_ >= n) << "Insufficient capacity in cache (capacity = "
110                            << capacity_ << ", requested " << n << ")";
111     while (objects_.size() > (capacity_ - n)) {
112       key_type discard_key = keys_.back();
113       keys_.pop_back();
114       objects_.erase(discard_key);
115     }
116   }
117 };
118 
119 #if GOOGLE_CUDA && GOOGLE_TENSORRT
120 
121 struct EngineContext {
EngineContextEngineContext122   EngineContext() {}  // Creates an empty context.
EngineContextEngineContext123   EngineContext(TrtUniquePtrType<nvinfer1::ICudaEngine>&& input_cuda_engine,
124                 ExecutionContext&& input_execution_context)
125       : cuda_engine(std::move(input_cuda_engine)) {
126     execution_context.push_back(std::move(input_execution_context));
127   }
EngineContextEngineContext128   EngineContext(TrtUniquePtrType<nvinfer1::ICudaEngine>&& input_cuda_engine,
129                 std::vector<ExecutionContext>&& input_execution_context)
130       : cuda_engine(std::move(input_cuda_engine)),
131         execution_context(std::move(input_execution_context)) {}
132 
133   mutex mu;
134   TrtUniquePtrType<nvinfer1::ICudaEngine> cuda_engine;
135 
GetExecutionContextEngineContext136   Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx)
137       TF_EXCLUSIVE_LOCKS_REQUIRED(mu) {
138     if (idx >= execution_context.size()) {
139       return errors::Internal("Requested engine context with index ", idx,
140                               ", but only ", execution_context.size(),
141                               "contexts are present.");
142     }
143     *exec_ctx = execution_context[idx];
144     return Status::OK();
145   }
146 
147   // In explicit batch mode, we maintain a vector of contexts for each engine,
148   // where each context is created for a specific profile. This is because it is
149   // either not possible or non-trivial to change the profile of a context for
150   // the following reasons:
151   // - In TRT 6 it is not possible to switch a profile after it is set
152   //   https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-601/tensorrt-api/c_api/classnvinfer1_1_1_i_execution_context.html#aba0731b9fbc926c477010df818650b0a
153   // - To switch profiles (from TRT 7), one must first ensure that all inference
154   //   calls in that context are finished. This would require an additional
155   //   synchronization before we call setOptimizationProfile. To avoid this
156   //   extra sync call, we mantain separate execution context for each profile.
157   // IExecutionContext object is not thread safe: only one thread should use it
158   // for inference at a time therefore we need a mutex. More details at
159   // https://docs.nvidia.com/deeplearning/sdk/tensorrt-best-practices/index.html#thread-safety
160   // Additional discussion about execution context management and thread safety
161   // at https://github.com/tensorflow/tensorflow/issues/36959
162   std::vector<ExecutionContext> execution_context TF_GUARDED_BY(mu);
163 };
164 
// Contains the context required to build the calibration data.
//
// The data members below are public and not annotated with a guarding mutex;
// only the private termination state is guarded by `mu_`.
class CalibrationContext {
 public:
  // Defined outside this header. NOTE(review): judging by the name and the
  // private members below, this presumably ends calibration and returns the
  // calibration table -- confirm against the definition.
  string TerminateCalibration();

  // Lookup table for temporary staging areas of input tensors for calibration.
  // NOTE(review): presumably maps an input name to (buffer pointer, buffer
  // size) -- confirm against the code that fills it.
  std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;

  // Temporary staging areas for calibration inputs.
  std::vector<PersistentTensor> device_tensors_;

  // Owned calibrator, builder and engine objects.
  std::unique_ptr<TRTInt8Calibrator> calibrator_;
  TrtUniquePtrType<nvinfer1::IBuilder> builder_;
  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
  // Owned thread object; nothing in this header starts or joins it.
  // TODO(sami): Use threadpool threads!
  std::unique_ptr<std::thread> thr_;

 private:
  // Guards the two members below.
  mutex mu_;
  bool terminated_ TF_GUARDED_BY(mu_) = false;
  std::string calibration_table_ TF_GUARDED_BY(mu_);
};
187 
// Declared here and defined in another translation unit.
// NOTE(review): judging by the name, this is presumably the resource-manager
// container name used for TF-TRT resources -- confirm against the definition.
ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName;
189 
// A ResourceBase-derived resource that groups the TensorRT state declared
// below: the device allocator, the LRU cache of engine contexts keyed by input
// shapes, an optional calibration context and the optimization profiles.
//
// The order of the data members matters: members are destroyed in reverse
// declaration order, and `cache_` must be destroyed before `allocator_`.
class TRTEngineCacheResource : public ResourceBase {
 public:
  // According to the TensorRT API, the logger is considered a singleton by the
  // TensorRT library, and multiple instances of IRuntime and/or IBuilder must
  // all use the same logger. So here we make it a singleton.
  //
  // TODO(laigd): use this logger in all places where conversion happens.
  static Logger& GetLogger();

  // Defined outside this header. NOTE(review): `capacity` is presumably the
  // capacity given to `cache_` -- confirm against the definition.
  TRTEngineCacheResource(OpKernelContext* ctx, size_t capacity);

  ~TRTEngineCacheResource() override;

  // Overrides ResourceBase::DebugString().
  string DebugString() const override;

  // Returns the EngineContext that is compatible with input_shapes.
  // Returns nullptr if no compatible EngineContexts is found in cache.
  EngineContext* GetEngineContext(const std::vector<TensorShape>& input_shapes);

  // Returns the EngineContext that is compatible with profile_id.
  // This function should be only called in explicit batch mode where
  // cache size is expected to be at most one.
  // Returns nullptr if no compatible EngineContexts is found in cache.
  EngineContext* GetEngineContext(const int profile_id);

  // Keep device allocator for TRT.
  std::unique_ptr<TRTBaseAllocator> allocator_;

  // Declare cache after allocator so that it is destroyed before allocator is.
  // Keys are the input shapes (one TensorShape per input); values own the
  // corresponding EngineContext.
  LRUCache<std::vector<TensorShape>, std::unique_ptr<EngineContext>,
           VectorTensorShapeHasher>
      cache_;

  // TODO(hinsu): Use different calibration context for the available shapes and
  // attach it to each item of the cache.
  std::unique_ptr<CalibrationContext> calib_ctx_;

  // This object maintains all the optimization profiles during profile
  // generation and engine build. During runtime the list of profiles is used to
  // look up a matching profile for the input data.
  TrtShapeOptimizationProfile profiles_;
};
232 
233 #endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
234 
235 }  // namespace tensorrt
236 }  // namespace tensorflow
237 
238 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
239