1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
18 
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_hints.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
40 
41 namespace tflite {
42 namespace gpu {
43 namespace cl {
44 
45 struct CLNode {
46   ClOperation cl_operation;
47   std::vector<ValueId> inputs;
48   std::vector<ValueId> outputs;
49 
50   // Mostly for debug purposes.
51   std::string name;
52 
53   CLNode() = default;
54 
55   CLNode(CLNode&& node) = default;
56   CLNode& operator=(CLNode&& node) = default;
57   CLNode(const CLNode&) = delete;
58   CLNode& operator=(const CLNode&) = delete;
59 };
60 
61 class InferenceContext {
62  public:
63   struct CreateInferenceInfo {
64     CalculationsPrecision precision;
65     TensorStorageType storage_type;
66     ModelHints hints;
67   };
68 
69   absl::Status InitFromGraph(const CreateInferenceInfo& create_info,
70                              const GraphFloat32& graph, Environment* env,
71                              std::vector<uint8_t>* serialized_model = nullptr);
72 
73   // Applies OpenCL-specific transformations to the graph before the
74   // initialization. These transformations are either impossible or useless in
75   // other backends.
76   absl::Status InitFromGraphWithTransforms(
77       const CreateInferenceInfo& create_info, GraphFloat32* graph,
78       Environment* env, std::vector<uint8_t>* serialized_model = nullptr);
79 
80   absl::Status AddToQueue(CLCommandQueue* queue);
81   absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
82   // for profiling and memory statistics
83   uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const;
84 
85   absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
86                               CLCommandQueue* queue);
87 
88   // It will work only with input/output tensor ids. For all other ids we don't
89   // have any guarantees.
90   Tensor* GetTensor(ValueId id);
91 
92   absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
93                                TensorFloat32* result);
94 
GetInputIds()95   const std::vector<ValueId>& GetInputIds() const { return input_ids_; }
GetOutputIds()96   const std::vector<ValueId>& GetOutputIds() const { return output_ids_; }
97 
GetInputRefs()98   const std::vector<int64_t>& GetInputRefs() const { return in_refs_; }
GetOutputRefs()99   const std::vector<int64_t>& GetOutputRefs() const { return out_refs_; }
100 
101   absl::Status RestoreDeserialized(
102       const absl::Span<const uint8_t> serialized_model, Environment* env);
103 
104  private:
105   enum class TensorMemoryType { kStrongShape, kBuffer, kVariable, kConst };
106 
107   friend flatbuffers::Offset<data::InferenceContext> Encode(
108       const InferenceContext& inference,
109       flatbuffers::FlatBufferBuilder* builder);
110   friend absl::Status Decode(const data::InferenceContext* fb_inference,
111                              InferenceContext* inference);
112 
113   void CopyInAndOutIds(const GraphFloat32& graph);
114   absl::Status ConvertOperations(const GpuInfo& gpu_info,
115                                  const GraphFloat32& graph, ModelHints hints);
116   void CreateLinks();
117   void ReserveGraphTensors(const CreateInferenceInfo& create_info,
118                            const GpuInfo& gpu_info, const GraphFloat32& graph);
119   absl::Status Merge();
120   absl::Status AllocateMemory(CLContext* context);
121 
122   absl::Status AllocateMemoryForConstTensors(CLContext* context);
123 
124   absl::Status AllocateMemoryForVariableTensors(CLContext* context);
125 
126   absl::Status AllocateMemoryForBuffers(CLContext* context);
127 
128   absl::Status AllocateMemoryForStrongShapes(CLContext* context);
129 
130   // utility function
131   void GetUsages(const std::function<bool(ValueId)>& functor,
132                  std::map<ValueId, int2>* usages);
133 
134   TensorMemoryType GetTensorMemoryType(ValueId id);
135 
136   void BindMemoryToOperations();
137   absl::Status Compile(const CreationContext& creation_context);
138   absl::Status Tune(TuningType tuning_type, const GpuInfo& gpu_info,
139                     ProfilingCommandQueue* profiling_queue);
140   absl::Status UpdateParams();
141 
142   void ReleaseCPURepresentation();
143 
144   // performance hacks
145   bool need_flush_ = false;
146 
147   bool flush_periodically_ = false;
148   int flush_period_ = 1;
149 
150   // In order to reduce memory leak on Mali a pipeline needs to be synchronized
151   // with CPU to prevent growing internal global OpenCL kernel pool. One trick
152   // is to enqueue an event from a previous run. Most of the time is should
153   // already be executed on GPU and should not stall the pipeline.
154   bool need_manual_release_ = false;
155   CLEvent prev_enqueue_start_point_;
156 
157   CalculationsPrecision precision_;
158   TensorStorageType storage_type_;
159 
160   // Directly mapped nodes from graph, but some of them "inactive" due
161   //  to fusion (inactive = fused).
162   // Memory is allocated only once, in ConvertOperations, and is not modified
163   //  anywhere.
164   std::vector<CLNode> nodes_;
165 
166   struct DummyTensor {
167     BHWC shape;
168     TensorDescriptor descriptor;
169 
170     bool operator==(const DummyTensor& b) const {
171       return shape == b.shape && descriptor == b.descriptor;
172     }
173   };
174 
175   class TensorReserver {
176    public:
TensorReserver()177     TensorReserver() : next_(0) {}
Add(const DummyTensor & dummy)178     ValueId Add(const DummyTensor& dummy) {
179       reservations_[next_] = dummy;
180       return next_++;
181     }
Add(ValueId id,const DummyTensor & dummy)182     void Add(ValueId id, const DummyTensor& dummy) {
183       reservations_[id] = dummy;
184     }
SetNext(ValueId id)185     void SetNext(ValueId id) { next_ = id; }
Get(ValueId id)186     DummyTensor Get(ValueId id) { return reservations_[id]; }
187 
GetTensorDescs()188     std::vector<std::pair<ValueId, TensorDescriptor>> GetTensorDescs() const {
189       std::vector<std::pair<ValueId, TensorDescriptor>> result;
190       for (auto& v : reservations_) {
191         TensorDescriptor desc = v.second.descriptor;
192         desc.shape.b = v.second.shape.b;
193         desc.shape.h = v.second.shape.h;
194         desc.shape.w = v.second.shape.w;
195         desc.shape.d = 1;
196         desc.shape.c = v.second.shape.c;
197         result.push_back({v.first, desc});
198       }
199       return result;
200     }
201 
Add(const std::vector<std::pair<ValueId,TensorDescriptor>> & tensors)202     void Add(const std::vector<std::pair<ValueId, TensorDescriptor>>& tensors) {
203       for (auto& v : tensors) {
204         DummyTensor dummy;
205         dummy.descriptor = v.second;
206         dummy.shape.b = v.second.shape.b;
207         dummy.shape.h = v.second.shape.h;
208         dummy.shape.w = v.second.shape.w;
209         dummy.shape.c = v.second.shape.c;
210         Add(v.first, dummy);
211       }
212     }
213 
214    private:
215     absl::flat_hash_map<ValueId, DummyTensor> reservations_;
216     ValueId next_;
217   };
218   TensorReserver tensor_reserver_;
219 
220   absl::flat_hash_map<ValueId, TensorDescriptor> const_tensors_descs_;
221   std::map<ValueId, Tensor> const_tensors_;
222 
223   std::map<ValueId, Tensor> variable_tensors_;
224   std::vector<Buffer> shared_buffers_;
225   std::vector<Tensor>
226       shared_buffer_tensors_;  // use references to memory from shared_buffers_
227   std::map<ValueId, int> graph_ids_to_shared_buffer_tensors_;
228 
229   std::map<ValueId, Tensor> strong_shape_tensors_;
230   std::map<ValueId, ValueId> graph_ids_to_strong_shape_tensors_;
231 
232   std::vector<ValueId> input_ids_;
233   std::map<ValueId, ValueId> variable_ids_and_refs_;
234   std::vector<ValueId> output_ids_;
235 
236   // for serialization
237   std::vector<int64_t> in_refs_;
238   std::vector<int64_t> out_refs_;
239 };
240 
// Runs OpenCL-specific transforms on the graph.
242 absl::Status RunGraphTransforms(GraphFloat32* graph);
243 
244 }  // namespace cl
245 }  // namespace gpu
246 }  // namespace tflite
247 
248 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
249