1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/delegate.h"
17 
#include <cstdint>
#include <memory>
#include <new>
#include <string>
#include <thread>  // NOLINT(build/c++11)
#include <vector>
22 
23 #include "absl/container/flat_hash_map.h"
24 #include "absl/memory/memory.h"
25 #include "absl/types/span.h"
26 #include "tensorflow/lite/builtin_ops.h"
27 #include "tensorflow/lite/c/common.h"
28 #include "tensorflow/lite/delegates/gpu/api.h"
29 #include "tensorflow/lite/delegates/gpu/cl/api.h"
30 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
31 #include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
32 #include "tensorflow/lite/delegates/gpu/common/model.h"
33 #include "tensorflow/lite/delegates/gpu/common/model_builder.h"
34 #include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
35 #include "tensorflow/lite/delegates/gpu/common/quantization_util.h"
36 #include "tensorflow/lite/delegates/gpu/common/status.h"
37 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
38 #include "tensorflow/lite/minimal_logging.h"
39 
40 #ifndef CL_DELEGATE_NO_GL
41 #include "tensorflow/lite/delegates/gpu/gl/api2.h"
42 #endif
43 
44 namespace tflite {
45 namespace gpu {
46 namespace {
47 
ToPriority(int32_t priority)48 InferencePriority ToPriority(int32_t priority) {
49   switch (priority) {
50     case TFLITE_GPU_INFERENCE_PRIORITY_AUTO:
51       return InferencePriority::AUTO;
52     case TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION:
53       return InferencePriority::MAX_PRECISION;
54     case TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY:
55       return InferencePriority::MIN_LATENCY;
56     case TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE:
57       return InferencePriority::MIN_MEMORY_USAGE;
58   }
59   return InferencePriority::UNKNOWN;
60 }
61 
ToUsage(int32_t usage)62 InferenceUsage ToUsage(int32_t usage) {
63   switch (usage) {
64     case TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER:
65       return InferenceUsage::FAST_SINGLE_ANSWER;
66     case TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED:
67       return InferenceUsage::SUSTAINED_SPEED;
68   }
69   return InferenceUsage::UNKNOWN;
70 }
71 
// Forward declaration: the Delegate class stores this function in the
// `.Prepare` slot of its TfLiteDelegate member, ahead of the definition that
// appears later in this file.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
74 
75 class Delegate {
76  public:
Delegate(const TfLiteGpuDelegateOptionsV2 * options)77   explicit Delegate(const TfLiteGpuDelegateOptionsV2* options)
78       : num_delegate_kernels_(0) {
79     options_ = options ? *options : TfLiteGpuDelegateOptionsV2Default();
80     if (options_.max_delegated_partitions <= 0) {
81       options_.max_delegated_partitions = 1;
82     }
83   }
84 
tflite_delegate()85   TfLiteDelegate* tflite_delegate() { return &delegate_; }
options() const86   const TfLiteGpuDelegateOptionsV2& options() const { return options_; }
87 
IsQuantOpsAllowed() const88   bool IsQuantOpsAllowed() const {
89     return options_.experimental_flags &
90            TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
91   }
MaxDelegatedPartitions() const92   int MaxDelegatedPartitions() const {
93     return options_.max_delegated_partitions;
94   }
num_delegate_kernels() const95   int num_delegate_kernels() const { return num_delegate_kernels_; }
96 
97  private:
98   TfLiteDelegate delegate_ = {
99       .data_ = reinterpret_cast<void*>(this),
100       .Prepare = DelegatePrepare,
101       .CopyFromBufferHandle = nullptr,
102       .CopyToBufferHandle = nullptr,
103       .FreeBufferHandle = nullptr,
104       .flags = kTfLiteDelegateFlagsNone,
105   };
106 
107   TfLiteGpuDelegateOptionsV2 options_;
108   int num_delegate_kernels_ = 0;
109 
110   friend class DelegateKernel;
111 };
112 
113 // Represent the execution of a subset of nodes on GPU.
114 class DelegateKernel {
115  public:
DelegateKernel(Delegate * delegate)116   explicit DelegateKernel(Delegate* delegate) : delegate_(delegate) {
117     ++delegate_->num_delegate_kernels_;
118   }
~DelegateKernel()119   ~DelegateKernel() { --delegate_->num_delegate_kernels_; }
120 
Prepare(TfLiteContext * context,const TfLiteDelegateParams * delegate_params)121   absl::Status Prepare(TfLiteContext* context,
122                        const TfLiteDelegateParams* delegate_params) {
123     thread_id_prepare_ = std::this_thread::get_id();
124 
125     // Extract TFLite delegate execution plan from the context and convert it
126     // into GraphFloat32.
127     GraphFloat32 graph;
128     std::vector<uint32_t> input_refs;
129     std::vector<uint32_t> output_refs;
130     RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph,
131                                     &input_refs, &output_refs));
132 
133     std::unique_ptr<InferenceBuilder> builder;
134     bool graph_is_destroyed;
135     const int experimental_flags = delegate_->options().experimental_flags;
136     if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
137       RETURN_IF_ERROR(
138           InitializeOpenClApi(&graph, &builder, &graph_is_destroyed));
139     } else if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
140       RETURN_IF_ERROR(InitializeOpenGlApi(&graph, &builder));
141     } else {
142       // By default, we try CL first & fall back to GL if that fails.
143       absl::Status status =
144           InitializeOpenClApi(&graph, &builder, &graph_is_destroyed);
145       if (!status.ok()) {
146         TF_LITE_KERNEL_LOG(context, std::string(status.message()).c_str());
147         TF_LITE_KERNEL_LOG(context, "Falling back to OpenGL");
148 
149         // Graph needs to be re-created because it is moved above.
150         GraphFloat32 graph2;
151         if (graph_is_destroyed) {
152           RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph2,
153                                           &input_refs, &output_refs));
154         }
155         RETURN_IF_ERROR(InitializeOpenGlApi(
156             graph_is_destroyed ? &graph2 : &graph, &builder));
157       }
158     }
159 
160     // At this point tflite didn't allocate tensors yet, therefore, collect
161     // indices and set all input and output tensors from tflite later.
162     input_indices_.reserve(input_refs.size());
163     for (uint32_t tensor_index : input_refs) {
164       const int64_t object_index = input_indices_.size();
165       input_indices_.push_back(tensor_index);
166       RETURN_IF_ERROR(
167           builder->SetInputObjectDef(object_index, GetObjectDef(tensor_index)));
168     }
169     output_indices_.reserve(output_refs.size());
170     for (uint32_t tensor_index : output_refs) {
171       const int64_t object_index = output_indices_.size();
172       output_indices_.push_back(tensor_index);
173       RETURN_IF_ERROR(builder->SetOutputObjectDef(object_index,
174                                                   GetObjectDef(tensor_index)));
175     }
176 
177     return builder->Build(&runner_);
178   }
179 
180   // This directs the runtime to allocate memory for input/output temporary
181   // tensors that require dequantization/quantization.
GetRequiredTemporaries(TfLiteContext * context,TfLiteNode * node,TfLiteIntArray ** temporaries_array_ptr)182   absl::Status GetRequiredTemporaries(TfLiteContext* context, TfLiteNode* node,
183                                       TfLiteIntArray** temporaries_array_ptr) {
184     if (quant_conversion_map_.empty()) return absl::OkStatus();
185 
186     std::vector<int> temporary_tensors;
187     for (auto index : input_indices_) {
188       if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
189         temporary_tensors.push_back(index);
190       }
191     }
192     for (auto index : output_indices_) {
193       if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
194         temporary_tensors.push_back(index);
195       }
196     }
197     *temporaries_array_ptr = TfLiteIntArrayCreate(temporary_tensors.size());
198     for (int i = 0; i < temporary_tensors.size(); ++i) {
199       (*temporaries_array_ptr)->data[i] = temporary_tensors[i];
200     }
201     return absl::OkStatus();
202   }
203 
Invoke(TfLiteContext * context)204   absl::Status Invoke(TfLiteContext* context) {
205     if (thread_id_prepare_ != std::this_thread::get_id()) {
206       TFLITE_LOG(tflite::TFLITE_LOG_WARNING,
207                  "GpuDelegate invoke thread != prepare thread");
208       if (enforce_same_thread_) {
209         return absl::FailedPreconditionError(
210             "GpuDelegate must run on the same thread where it was "
211             "initialized.");
212       }
213     }
214 
215     const bool is_dequant_required = !quant_conversion_map_.empty();
216     if (is_dequant_required) {
217       RETURN_IF_ERROR(
218           DequantizeInputs(context, input_indices_, quant_conversion_map_));
219     }
220     RETURN_IF_ERROR(SetInputsAndOutputs(context));
221     RETURN_IF_ERROR(runner_->Run());
222     if (is_dequant_required) {
223       RETURN_IF_ERROR(
224           QuantizeOutputs(context, output_indices_, quant_conversion_map_));
225     }
226     return absl::OkStatus();
227   }
228 
229  private:
SetInputsAndOutputs(TfLiteContext * context)230   absl::Status SetInputsAndOutputs(TfLiteContext* context) {
231     for (int i = 0; i < input_indices_.size(); ++i) {
232       RETURN_IF_ERROR(runner_->SetInputObject(
233           i, GetTensorObject(input_indices_[i], context)));
234     }
235     for (int i = 0; i < output_indices_.size(); ++i) {
236       RETURN_IF_ERROR(runner_->SetOutputObject(
237           i, GetTensorObject(output_indices_[i], context)));
238     }
239     return absl::OkStatus();
240   }
241 
GetObjectDef(int index) const242   ObjectDef GetObjectDef(int index) const {
243     ObjectDef default_object_def;
244     default_object_def.data_type = DataType::FLOAT32;
245     default_object_def.data_layout = DataLayout::BHWC;
246     default_object_def.object_type = ObjectType::CPU_MEMORY;
247     default_object_def.user_provided = true;
248     return default_object_def;
249   }
250 
GetTensorObject(int index,TfLiteContext * context) const251   TensorObject GetTensorObject(int index, TfLiteContext* context) const {
252     auto& tensor = context->tensors[index];
253     return MakeCpuMemory(absl::MakeSpan(tensor.data.raw, tensor.bytes));
254   }
255 
256  private:
InitializeGraph(TfLiteContext * context,const TfLiteDelegateParams * delegate_params,GraphFloat32 * graph,std::vector<uint32_t> * input_refs,std::vector<uint32_t> * output_refs)257   absl::Status InitializeGraph(TfLiteContext* context,
258                                const TfLiteDelegateParams* delegate_params,
259                                GraphFloat32* graph,
260                                std::vector<uint32_t>* input_refs,
261                                std::vector<uint32_t>* output_refs) {
262     quant_conversion_map_.clear();
263     if (delegate_->IsQuantOpsAllowed()) {
264       RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph,
265                                       &quant_conversion_map_));
266     } else {
267       RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph));
268     }
269 
270     input_refs->clear();
271     output_refs->clear();
272     const auto inputs = graph->inputs();
273     input_refs->reserve(inputs.size());
274     for (const auto& input : inputs) {
275       input_refs->push_back(input->tensor.ref);
276     }
277     const auto outputs = graph->outputs();
278     output_refs->reserve(outputs.size());
279     for (const auto& output : outputs) {
280       output_refs->push_back(output->tensor.ref);
281     }
282 
283     return absl::OkStatus();
284   }
285 
InitializeOpenClApi(GraphFloat32 * graph,std::unique_ptr<InferenceBuilder> * builder,bool * graph_is_destroyed)286   absl::Status InitializeOpenClApi(GraphFloat32* graph,
287                                    std::unique_ptr<InferenceBuilder>* builder,
288                                    bool* graph_is_destroyed) {
289     *graph_is_destroyed = false;
290     cl::InferenceEnvironmentOptions env_options;
291     cl::InferenceEnvironmentProperties properties;
292     RETURN_IF_ERROR(cl::NewInferenceEnvironment(env_options, &cl_environment_,
293                                                 &properties));
294     auto delegate_options = delegate_->options();
295     cl::InferenceOptions options;
296     // If is_precision_loss_allowed == -1, then just use priorities instead
297     // of paying attention to is_precision_loss_allowed value.
298     if (delegate_options.is_precision_loss_allowed == -1) {
299       options.priority1 = ToPriority(delegate_options.inference_priority1);
300       options.priority2 = ToPriority(delegate_options.inference_priority2);
301       options.priority3 = ToPriority(delegate_options.inference_priority3);
302     } else {
303       // Users set is_precision_loss_allowed explicitly, thus use it explicitly.
304       if (delegate_options.is_precision_loss_allowed == 0) {
305         options.priority1 = InferencePriority::MAX_PRECISION;
306       } else {
307         options.priority1 = InferencePriority::MIN_LATENCY;
308       }
309     }
310     options.usage = ToUsage(delegate_options.inference_preference);
311     *graph_is_destroyed = true;
312     RETURN_IF_ERROR(cl_environment_->NewInferenceBuilder(
313         options, std::move(*graph), builder));
314     TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
315                          "Initialized OpenCL-based API.");
316     return absl::OkStatus();
317   }
318 
InitializeOpenGlApi(GraphFloat32 * graph,std::unique_ptr<InferenceBuilder> * builder)319   absl::Status InitializeOpenGlApi(GraphFloat32* graph,
320                                    std::unique_ptr<InferenceBuilder>* builder) {
321 #ifndef CL_DELEGATE_NO_GL
322     gl::InferenceEnvironmentOptions env_options;
323     gl::InferenceEnvironmentProperties properties;
324     RETURN_IF_ERROR(
325         NewInferenceEnvironment(env_options, &gl_environment_, &properties));
326     auto delegate_options = delegate_->options();
327     gl::InferenceOptions options;
328     options.usage = ToUsage(delegate_options.inference_preference);
329     options.priority1 = ToPriority(delegate_options.inference_priority1);
330     options.priority2 = ToPriority(delegate_options.inference_priority2);
331     options.priority3 = ToPriority(delegate_options.inference_priority3);
332     RETURN_IF_ERROR(gl_environment_->NewInferenceBuilder(std::move(*graph),
333                                                          options, builder));
334     enforce_same_thread_ = true;
335     TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
336                          "Initialized OpenGL-based API.");
337 #endif
338     return absl::OkStatus();
339   }
340 
341   // The Delegate instance that's shared across all DelegateKernel instances.
342   Delegate* const delegate_;  // doesn't own the memory.
343   std::unique_ptr<cl::InferenceEnvironment> cl_environment_;
344 #ifndef CL_DELEGATE_NO_GL
345   std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
346 #endif
347   std::unique_ptr<InferenceRunner> runner_;
348   std::vector<int64_t> input_indices_;
349   std::vector<int64_t> output_indices_;
350   // Whenever quantized inference is enabled, this maps the tensor index of each
351   // originally quantized (8-bit) tensor to its float version added in
352   // model_builder - and vice versa.
353   absl::flat_hash_map<int, int> quant_conversion_map_;
354   std::thread::id thread_id_prepare_;  // thread id used for Prapare()
355   bool enforce_same_thread_ = false;   // flag to enforce same thread for Invoke
356 };
357 
GetDelegateKernel(TfLiteNode * node)358 inline DelegateKernel* GetDelegateKernel(TfLiteNode* node) {
359   return reinterpret_cast<DelegateKernel*>(node->user_data);
360 }
361 
GetDelegate(TfLiteDelegate * delegate)362 inline Delegate* GetDelegate(TfLiteDelegate* delegate) {
363   return reinterpret_cast<Delegate*>(delegate->data_);
364 }
365 
DelegatePrepare(TfLiteContext * context,TfLiteDelegate * delegate)366 TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
367   const TfLiteRegistration kRegistration = {
368       // .init
369       [](TfLiteContext* context, const char* buffer, size_t) -> void* {
370         const auto* params =
371             reinterpret_cast<const TfLiteDelegateParams*>(buffer);
372         auto* gpu_delegate = GetDelegate(params->delegate);
373         // Everything below should happen in prepare function call, but TFLite
374         // for whatever reason forbids that.
375         auto gpu_delegate_kernel =
376             absl::make_unique<DelegateKernel>(gpu_delegate);
377         const auto status = gpu_delegate_kernel->Prepare(context, params);
378         if (!status.ok()) {
379           TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Init: %s",
380                              std::string(status.message()).c_str());
381           return nullptr;
382         }
383         return gpu_delegate_kernel.release();
384       },
385       // .free
386       [](TfLiteContext*, void* buffer) -> void {
387         delete reinterpret_cast<DelegateKernel*>(buffer);
388       },
389       // .prepare
390       [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
391         if (!node->user_data) {
392           TF_LITE_KERNEL_LOG(
393               context,
394               "TfLiteGpuDelegate Prepare: delegate is not initialized");
395           return kTfLiteError;
396         }
397         auto* gpu_delegate_kernel = GetDelegateKernel(node);
398         const auto status = gpu_delegate_kernel->GetRequiredTemporaries(
399             context, node, &node->temporaries);
400         if (!status.ok()) {
401           TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Prepare: %s",
402                              std::string(status.message()).c_str());
403           return kTfLiteError;
404         }
405         // TODO(akulik): tflite tensors are not allocated here either. It would
406         // be good to set inputs and outputs only once here instead of setting
407         // them every time in .invoke.
408         return kTfLiteOk;
409       },
410       // .invoke
411       [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
412         const auto status = GetDelegateKernel(node)->Invoke(context);
413         if (!status.ok()) {
414           TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Invoke: %s",
415                              std::string(status.message()).c_str());
416           return kTfLiteError;
417         }
418         return kTfLiteOk;
419       },
420       nullptr,                // .profiling_string
421       0,                      // .builtin_code
422       "TfLiteGpuDelegateV2",  // .custom_name
423       1,                      // .version
424   };
425 
426   auto* gpu_delegate = GetDelegate(delegate);
427   TfLiteIntArray* ops_to_replace =
428       GetOpsToReplace(context, gpu_delegate->IsQuantOpsAllowed(),
429                       gpu_delegate->MaxDelegatedPartitions());
430   const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
431       context, kRegistration, ops_to_replace, delegate);
432   TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Created %d GPU delegate kernels.",
433                   gpu_delegate->num_delegate_kernels());
434   TfLiteIntArrayFree(ops_to_replace);
435   return status;
436 }
437 
438 }  // namespace
439 }  // namespace gpu
440 }  // namespace tflite
441 
TfLiteGpuDelegateOptionsV2Default()442 TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
443   TfLiteGpuDelegateOptionsV2 options = {
444       // set it to -1 to detect whether it was later adjusted.
445       .is_precision_loss_allowed = -1,
446       .inference_preference =
447           TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER,
448       .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
449       .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
450       .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
451       .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
452       .max_delegated_partitions = 1,
453   };
454   return options;
455 }
456 
TfLiteGpuDelegateV2Create(const TfLiteGpuDelegateOptionsV2 * options)457 TfLiteDelegate* TfLiteGpuDelegateV2Create(
458     const TfLiteGpuDelegateOptionsV2* options) {
459   auto* gpu_delegate = new tflite::gpu::Delegate(options);
460   TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
461                        "Created TensorFlow Lite delegate for GPU.");
462   return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
463 }
464 
TfLiteGpuDelegateV2Delete(TfLiteDelegate * delegate)465 void TfLiteGpuDelegateV2Delete(TfLiteDelegate* delegate) {
466   delete tflite::gpu::GetDelegate(delegate);
467 }
468