1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
17 
18 #include <algorithm>
19 #include <cmath>
20 #include <cstdint>
21 #include <map>
22 #include <memory>
23 #include <string>
24 #include <vector>
25 
26 #include "absl/container/flat_hash_map.h"
27 #include "absl/container/flat_hash_set.h"
28 #include "tensorflow/lite/delegates/gpu/cl/buffer.h"
29 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
30 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
31 #include "tensorflow/lite/delegates/gpu/common/memory_management.h"
32 #include "tensorflow/lite/delegates/gpu/common/model.h"
33 #include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
34 #include "tensorflow/lite/delegates/gpu/common/operations.h"
35 #include "tensorflow/lite/delegates/gpu/common/precision.h"
36 #include "tensorflow/lite/delegates/gpu/common/selectors/operation_selector.h"
37 #include "tensorflow/lite/delegates/gpu/common/selectors/special_selector.h"
38 #include "tensorflow/lite/delegates/gpu/common/shape.h"
39 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
40 #include "tensorflow/lite/delegates/gpu/common/task/storage_type_util.h"
41 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
42 #include "tensorflow/lite/delegates/gpu/common/transformations/add_bias.h"
43 #include "tensorflow/lite/delegates/gpu/common/transformations/global_pooling_to_reduce_op.h"
44 #include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h"
45 #include "tensorflow/lite/delegates/gpu/common/types.h"
46 #include "tensorflow/lite/delegates/gpu/common/util.h"
47 
48 namespace tflite {
49 namespace gpu {
50 namespace cl {
51 
52 namespace {
IsReady(const absl::flat_hash_set<ValueId> & ready_tensors,const CLNode & node)53 bool IsReady(const absl::flat_hash_set<ValueId>& ready_tensors,
54              const CLNode& node) {
55   for (const ValueId in_id : node.inputs) {
56     if (ready_tensors.find(in_id) == ready_tensors.end()) {
57       return false;
58     }
59   }
60   return true;
61 }
62 
GetCLNodeTensors(const CLNode & node)63 std::vector<std::pair<ValueId, TensorDescriptor>> GetCLNodeTensors(
64     const CLNode& node) {
65   std::vector<std::pair<ValueId, TensorDescriptor>> result;
66   result.reserve(node.inputs.size() + node.outputs.size());
67   const OperationDef op_def = node.cl_operation.GetDefinition();
68   for (int j = 0; j < node.inputs.size(); ++j) {
69     result.push_back({node.inputs[j], op_def.src_tensors[j]});
70   }
71   for (int j = 0; j < node.outputs.size(); ++j) {
72     result.push_back({node.outputs[j], op_def.dst_tensors[j]});
73   }
74 
75   return result;
76 }
77 
MergeCLNodes(CLNode * src,CLNode * dst)78 absl::Status MergeCLNodes(CLNode* src, CLNode* dst) {
79   for (int j = 1; j < src->inputs.size(); ++j) {
80     dst->inputs.push_back(src->inputs[j]);
81   }
82   dst->outputs[0] = src->outputs[0];
83   dst->name += " linked : " + src->name;
84   return dst->cl_operation.AddOperation(&src->cl_operation);
85 }
86 
AddUsage(ValueId id,int task_index,std::map<ValueId,int2> * usage_records)87 void AddUsage(ValueId id, int task_index,
88               std::map<ValueId, int2>* usage_records) {
89   auto it = usage_records->find(id);
90   if (it == usage_records->end()) {
91     (*usage_records)[id].x = task_index;
92     (*usage_records)[id].y = task_index;
93   } else {
94     (*usage_records)[id].y = task_index;
95   }
96 }
97 
98 // returns true if actual memory for this storage type will be allocated with
99 // clCreateBuffer.
IsBufferBased(const TensorStorageType & type)100 bool IsBufferBased(const TensorStorageType& type) {
101   return type == TensorStorageType::BUFFER ||
102          type == TensorStorageType::IMAGE_BUFFER;
103 }
104 
105 // Generic add is add that have several runtime inputs and they are not
106 // broadcasted, i.e. pointwise add for N tensors where N > 1.
IsGenericAdd(const Node & node,const std::vector<Value * > & inputs,const std::vector<Value * > & outputs)107 bool IsGenericAdd(const Node& node, const std::vector<Value*>& inputs,
108                   const std::vector<Value*>& outputs) {
109   if (inputs.size() == 1) {
110     return false;
111   }
112   const OperationType op_type = OperationTypeFromString(node.operation.type);
113   if (op_type != OperationType::ADD) {
114     return false;
115   }
116 
117   const auto dst_shape = outputs[0]->tensor.shape;
118   for (int i = 0; i < inputs.size(); ++i) {
119     const auto src_shape = inputs[i]->tensor.shape;
120     if (dst_shape.b != src_shape.b && src_shape.b == 1) {
121       return false;
122     }
123     if (dst_shape.h != src_shape.h && src_shape.h == 1) {
124       return false;
125     }
126     if (dst_shape.w != src_shape.w && src_shape.w == 1) {
127       return false;
128     }
129     if (dst_shape.c != src_shape.c && src_shape.c == 1) {
130       return false;
131     }
132   }
133   return true;
134 }
135 
136 }  // namespace
137 
InitFromGraph(const CreateInferenceInfo & create_info,const GraphFloat32 & graph,Environment * env,std::vector<uint8_t> * serialized_model)138 absl::Status InferenceContext::InitFromGraph(
139     const CreateInferenceInfo& create_info, const GraphFloat32& graph,
140     Environment* env, std::vector<uint8_t>* serialized_model) {
141   CreationContext creation_context;
142   creation_context.device = env->GetDevicePtr();
143   creation_context.context = &env->context();
144   creation_context.queue = env->queue();
145   creation_context.cache = env->program_cache();
146 
147   ReserveGraphTensors(create_info, creation_context.GetGpuInfo(), graph);
148   precision_ = create_info.precision;
149   storage_type_ = create_info.storage_type;
150   if (env->device().GetInfo().IsMali()) {
151     need_flush_ = true;
152     need_manual_release_ = true;
153 
154     flush_periodically_ = true;
155     flush_period_ = 24;
156   }
157   if (env->device().GetInfo().IsPowerVR()) {
158     need_flush_ = true;
159   }
160   CopyInAndOutIds(graph);
161   RETURN_IF_ERROR(ConvertOperations(creation_context.GetGpuInfo(), graph,
162                                     create_info.hints));
163   RETURN_IF_ERROR(Merge());
164   RETURN_IF_ERROR(AllocateMemory(creation_context.context));
165   BindMemoryToOperations();
166   RETURN_IF_ERROR(Compile(creation_context));
167   RETURN_IF_ERROR(UpdateParams());
168 
169   TuningType tuning_type = TuningType::kExhaustive;
170   if (create_info.hints.Check(ModelHints::kFastTuning)) {
171     tuning_type = TuningType::kFast;
172   }
173   if (env->device().GetInfo().IsMali()) {
174     const MaliInfo& info = env->device().GetInfo().mali_info;
175     if (info.IsMaliT6xx()) {
176       // Mali T628 hangs forever in clFinish when used profiling queue
177       // TuningType::FAST does not use profiling queue.
178       tuning_type = TuningType::kFast;
179     }
180   }
181   RETURN_IF_ERROR(
182       Tune(tuning_type, env->device().GetInfo(), env->profiling_queue()));
183 
184   if (serialized_model) {
185     for (auto& node : nodes_) {
186       node.cl_operation.MoveObjectRefsFromCLToGeneric();
187       node.cl_operation.SyncScalarValues();
188     }
189     flatbuffers::FlatBufferBuilder builder;
190     auto encoded_fb = Encode(*this, &builder);
191     data::FinishInferenceContextBuffer(builder, encoded_fb);
192     serialized_model->resize(builder.GetSize());
193     std::memcpy(serialized_model->data(), builder.GetBufferPointer(),
194                 builder.GetSize());
195     for (auto& node : nodes_) {
196       node.cl_operation.MoveObjectRefsFromGenericToCL();
197     }
198   }
199   ReleaseCPURepresentation();
200   return absl::OkStatus();
201 }
202 
// Rebuilds the context from a flatbuffer in `serialized_model`.
//
// The buffer is verified first; a buffer that fails verification yields
// DataLossError. After decoding, memory is allocated and bound, every node is
// compiled through its deserialized path, params are updated and the CPU-side
// representation is released.
absl::Status InferenceContext::RestoreDeserialized(
    const absl::Span<const uint8_t> serialized_model, Environment* env) {
  const uint8_t* const raw_bytes = serialized_model.data();
  flatbuffers::Verifier verifier(raw_bytes, serialized_model.size());
  const bool buffer_is_valid = data::VerifyInferenceContextBuffer(verifier);
  if (!buffer_is_valid) {
    return absl::DataLossError("Deserialization failed.");
  }
  RETURN_IF_ERROR(Decode(data::GetInferenceContext(raw_bytes), this));

  CreationContext cl_creation_context;
  cl_creation_context.device = env->GetDevicePtr();
  cl_creation_context.context = &env->context();
  cl_creation_context.queue = env->queue();
  cl_creation_context.cache = env->program_cache();

  RETURN_IF_ERROR(AllocateMemory(cl_creation_context.context));
  BindMemoryToOperations();
  for (size_t node_index = 0; node_index < nodes_.size(); ++node_index) {
    RETURN_IF_ERROR(nodes_[node_index].cl_operation.CompileDeserialized(
        cl_creation_context));
  }
  RETURN_IF_ERROR(UpdateParams());
  ReleaseCPURepresentation();
  return absl::OkStatus();
}
228 
InitFromGraphWithTransforms(const CreateInferenceInfo & create_info,GraphFloat32 * graph,Environment * env,std::vector<uint8_t> * serialized_model)229 absl::Status InferenceContext::InitFromGraphWithTransforms(
230     const CreateInferenceInfo& create_info, GraphFloat32* graph,
231     Environment* env, std::vector<uint8_t>* serialized_model) {
232   RETURN_IF_ERROR(RunGraphTransforms(graph));
233   RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env, serialized_model));
234   return absl::OkStatus();
235 }
236 
CopyInAndOutIds(const GraphFloat32 & graph)237 void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) {
238   const auto inputs = graph.inputs();
239   for (const auto& input : inputs) {
240     input_ids_.push_back(input->id);
241   }
242 
243   const auto variable_inputs = graph.variable_inputs();
244   for (const auto& variable_input : variable_inputs) {
245     variable_ids_and_refs_[variable_input->id] = variable_input->tensor.ref;
246   }
247 
248   const auto outputs = graph.outputs();
249   for (const auto& output : outputs) {
250     output_ids_.push_back(output->id);
251   }
252 
253   in_refs_.resize(inputs.size());
254   out_refs_.resize(outputs.size());
255   for (int i = 0; i < inputs.size(); ++i) {
256     in_refs_[i] = inputs[i]->tensor.ref;
257   }
258   for (int i = 0; i < outputs.size(); ++i) {
259     out_refs_[i] = outputs[i]->tensor.ref;
260   }
261 }
262 
ReserveGraphTensors(const CreateInferenceInfo & create_info,const GpuInfo & gpu_info,const GraphFloat32 & graph)263 void InferenceContext::ReserveGraphTensors(
264     const CreateInferenceInfo& create_info, const GpuInfo& gpu_info,
265     const GraphFloat32& graph) {
266   ValueId max_id = 0;
267   auto tensors = graph.values();
268   auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
269   for (auto& t : tensors) {
270     TensorStorageType storage_type = create_info.storage_type;
271     const auto shape = graph.GetValue(t->id)->tensor.shape;
272     Layout layout = shape.b == 1 ? Layout::HWC : Layout::BHWC;
273     if (graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id)) {
274       if (shape.c < 4 &&
275           CanCreateTensorWithShape(
276               gpu_info, shape,
277               TensorDescriptor{data_type, TensorStorageType::SINGLE_TEXTURE_2D,
278                                layout})) {
279         storage_type = TensorStorageType::SINGLE_TEXTURE_2D;
280       }
281     }
282     storage_type =
283         SelectBestStorageType(gpu_info, shape, storage_type, data_type, layout);
284     tensor_reserver_.Add(
285         t->id, {shape, TensorDescriptor{data_type, storage_type, layout}});
286     max_id = std::max(max_id, t->id);
287   }
288   tensor_reserver_.SetNext(max_id + 1);
289 }
290 
ConvertOperations(const GpuInfo & gpu_info,const GraphFloat32 & graph,ModelHints hints)291 absl::Status InferenceContext::ConvertOperations(const GpuInfo& gpu_info,
292                                                  const GraphFloat32& graph,
293                                                  ModelHints hints) {
294   std::map<ValueId, TensorDescriptor> tensor_descriptors;
295   const auto values = graph.values();
296   for (auto value : values) {
297     tensor_descriptors[value->id] = tensor_reserver_.Get(value->id).descriptor;
298   }
299   std::set<NodeId> consumed_nodes;
300   std::vector<Node*> graph_nodes = graph.nodes();
301   std::map<ValueId, int>
302       tensor_usages;  // keeps latest index of operation that updated tensor
303   for (const auto& input_id : input_ids_) {
304     tensor_usages[input_id] = -1;  // so as inputs "updated" before operation 0,
305                                    // we will mark them with -1
306   }
307   for (int i = 0; i < graph_nodes.size(); ++i) {
308     const Node& node = *graph_nodes[i];
309     if (consumed_nodes.find(node.id) != consumed_nodes.end()) {
310       continue;
311     }
312     auto op_type = OperationTypeFromString(node.operation.type);
313     if (op_type == OperationType::CONSTANT) {
314       auto attr =
315           absl::any_cast<ConstTensorAttributes>(node.operation.attributes);
316       auto outputs = graph.FindOutputs(node.id);
317       const_tensors_descs_[outputs[0]->id] =
318           tensor_reserver_.Get(outputs[0]->id).descriptor;
319       const_tensors_descs_[outputs[0]->id].UploadData(attr.tensor);
320       continue;
321     }
322     std::string op_name = node.operation.type + " " + std::to_string(node.id);
323     GPUOperationsSubgraph gpu_subgraph;
324     if (hints.Check(ModelHints::kAllowSpecialKernels) &&
325         GPUSubgraphFromGraph(gpu_info, precision_, graph, node.id,
326                              tensor_descriptors, &consumed_nodes, &gpu_subgraph,
327                              &op_name)
328             .ok()) {
329       // Mapping of subgraph (set of nodes) to GPU operations. Should happen
330       // before straigtforward mapping.
331     } else {
332       // Straigtforward mapping of one graph node to GPU operations.
333       auto inputs = graph.FindInputs(node.id);
334       auto outputs = graph.FindOutputs(node.id);
335       // Reordering of input ids and updating of temporary tensors_usage struct.
336       // This stage is necessary because we are building OperationDef that rely
337       // on order of input ids. But we also should have input id on first
338       // position that potentially can be "linking" tensor and as result
339       // eliminated(unused) We apply it only for ADD operation, because of ADD
340       // associativity and ADD can be linked. In current approach "linking"
341       // tensor can be only latest written tensor(during linear order of
342       // execution) among input tensors.
343       if (IsGenericAdd(node, inputs, outputs)) {
344         int latest_written_tensor_index = 0;
345         int last_usage = tensor_usages[inputs[0]->id];
346         for (int j = 1; j < inputs.size(); ++j) {
347           if (tensor_usages[inputs[j]->id] > last_usage) {
348             last_usage = tensor_usages[inputs[j]->id];
349             latest_written_tensor_index = j;
350           }
351         }
352         std::swap(inputs[0], inputs[latest_written_tensor_index]);
353       }
354       consumed_nodes.insert(node.id);
355       OperationDef op_def;
356       op_def.precision = precision_;
357       for (int j = 0; j < inputs.size(); ++j) {
358         op_def.src_tensors.push_back(
359             tensor_reserver_.Get(inputs[j]->id).descriptor);
360       }
361       for (int j = 0; j < outputs.size(); ++j) {
362         op_def.dst_tensors.push_back(
363             tensor_reserver_.Get(outputs[j]->id).descriptor);
364       }
365       RETURN_IF_ERROR(GPUOperationFromNode(gpu_info, op_def, hints, inputs,
366                                            outputs, node, &gpu_subgraph));
367     }
368     absl::flat_hash_map<int, ValueId> mapping_to_global_ids;
369     for (int j = 0; j < gpu_subgraph.new_tensors.size(); ++j) {
370       const auto& t = gpu_subgraph.new_tensors[j];
371       auto global_id = tensor_reserver_.Add({t.first, t.second});
372       mapping_to_global_ids[j] = global_id;
373     }
374     for (auto& gpu_op : gpu_subgraph.operations) {
375       CLNode cl_node;
376       cl_node.cl_operation.Init(std::move(gpu_op.operation));
377       cl_node.inputs.resize(gpu_op.input_ids.size());
378       for (int j = 0; j < gpu_op.input_ids.size(); ++j) {
379         int id = gpu_op.input_ids[j];
380         if (id >= 0) {
381           cl_node.inputs[j] = id;
382         } else {
383           cl_node.inputs[j] = mapping_to_global_ids[-(id + 1)];
384         }
385       }
386       cl_node.outputs.resize(gpu_op.output_ids.size());
387       for (int j = 0; j < gpu_op.output_ids.size(); ++j) {
388         int id = gpu_op.output_ids[j];
389         if (id >= 0) {
390           cl_node.outputs[j] = id;
391           tensor_usages[id] = i;
392         } else {
393           cl_node.outputs[j] = mapping_to_global_ids[-(id + 1)];
394         }
395       }
396       cl_node.name = op_name;
397       nodes_.push_back(std::move(cl_node));
398     }
399   }
400 
401   return absl::OkStatus();
402 }
403 
Merge()404 absl::Status InferenceContext::Merge() {
405   absl::flat_hash_set<ValueId> ready_tensors;
406   for (const auto& input_id : input_ids_) {
407     ready_tensors.insert(input_id);
408   }
409   for (int i = 0; i < nodes_.size(); ++i) {
410     auto& node = nodes_[i];
411     for (const auto& out_id : node.outputs) {
412       ready_tensors.insert(out_id);
413     }
414     if (node.outputs.size() != 1) {
415       continue;
416     }
417     std::vector<int> next_nodes;
418     int link_index = 0;
419     for (int j = i + 1; j < nodes_.size(); ++j) {
420       for (int k = 0; k < nodes_[j].inputs.size(); ++k) {
421         if (nodes_[j].inputs[k] == node.outputs[0]) {
422           next_nodes.push_back(j);
423           link_index = k;
424         }
425       }
426     }
427     if (next_nodes.size() != 1 || link_index != 0) {
428       continue;
429     }
430     auto& linkable_node = nodes_[next_nodes[0]];
431     if (!linkable_node.cl_operation.GetGpuOperation().IsLinkable() ||
432         linkable_node.outputs.size() != 1 ||
433         !IsReady(ready_tensors, linkable_node)) {
434       continue;
435     }
436     const auto& original_dst_def =
437         node.cl_operation.GetDefinition().dst_tensors[0];
438     const auto& link_dst_def =
439         linkable_node.cl_operation.GetDefinition().dst_tensors[0];
440     if (original_dst_def != link_dst_def) {
441       continue;
442     }
443     RETURN_IF_ERROR(MergeCLNodes(&linkable_node, &node));
444     nodes_.erase(nodes_.begin() + next_nodes[0]);
445     i -= 1;
446   }
447   return absl::OkStatus();
448 }
449 
GetUsages(const std::function<bool (ValueId)> & functor,std::map<ValueId,int2> * usages)450 void InferenceContext::GetUsages(const std::function<bool(ValueId)>& functor,
451                                  std::map<ValueId, int2>* usages) {
452   for (ValueId in_id : input_ids_) {
453     if (functor(in_id)) {
454       AddUsage(in_id, 0, usages);
455     }
456   }
457   for (int op_index = 0; op_index < nodes_.size(); ++op_index) {
458     auto tensors = GetCLNodeTensors(nodes_[op_index]);
459     for (auto& tensor : tensors) {
460       if (functor(tensor.first)) {
461         AddUsage(tensor.first, op_index, usages);
462       }
463     }
464   }
465   for (ValueId out_id : output_ids_) {
466     if (functor(out_id)) {
467       AddUsage(out_id, nodes_.size(), usages);
468     }
469   }
470 }
471 
GetTensorMemoryType(ValueId id)472 InferenceContext::TensorMemoryType InferenceContext::GetTensorMemoryType(
473     ValueId id) {
474   if (const_tensors_.find(id) != const_tensors_.end()) {
475     return TensorMemoryType::kConst;
476   } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
477     return TensorMemoryType::kVariable;
478   } else if (IsBufferBased(tensor_reserver_.Get(id).descriptor.storage_type)) {
479     return TensorMemoryType::kBuffer;
480   } else {
481     return TensorMemoryType::kStrongShape;
482   }
483 }
484 
AllocateMemory(CLContext * context)485 absl::Status InferenceContext::AllocateMemory(CLContext* context) {
486   RETURN_IF_ERROR(AllocateMemoryForConstTensors(context));
487   RETURN_IF_ERROR(AllocateMemoryForVariableTensors(context));
488   RETURN_IF_ERROR(AllocateMemoryForBuffers(context));
489   RETURN_IF_ERROR(AllocateMemoryForStrongShapes(context));
490   return absl::OkStatus();
491 }
492 
AllocateMemoryForConstTensors(CLContext * context)493 absl::Status InferenceContext::AllocateMemoryForConstTensors(
494     CLContext* context) {
495   for (auto& description : const_tensors_descs_) {
496     RETURN_IF_ERROR(const_tensors_[description.first].CreateFromDescriptor(
497         description.second, context));
498   }
499   return absl::OkStatus();
500 }
501 
AllocateMemoryForVariableTensors(CLContext * context)502 absl::Status InferenceContext::AllocateMemoryForVariableTensors(
503     CLContext* context) {
504   std::map<ValueId, int> ref_value_to_tensor_index;
505 
506   for (auto value_and_ref_value : variable_ids_and_refs_) {
507     if (ref_value_to_tensor_index.find(value_and_ref_value.second) ==
508         ref_value_to_tensor_index.end()) {
509       const auto& t = tensor_reserver_.Get(value_and_ref_value.first);
510       const auto& shape = t.shape;
511       const auto& descriptor = t.descriptor;
512 
513       RETURN_IF_ERROR(
514           CreateTensor(*context, shape, descriptor,
515                        &variable_tensors_[value_and_ref_value.second]));
516     }
517   }
518   return absl::OkStatus();
519 }
520 
AllocateMemoryForBuffers(CLContext * context)521 absl::Status InferenceContext::AllocateMemoryForBuffers(CLContext* context) {
522   std::map<ValueId, int2> buffer_usages;
523   GetUsages(
524       [this](ValueId id) {
525         return GetTensorMemoryType(id) == TensorMemoryType::kBuffer;
526       },
527       &buffer_usages);
528 
529   std::vector<TensorUsageRecord<size_t>> buffer_usage_records;
530   for (auto& usage : buffer_usages) {
531     const auto& t = tensor_reserver_.Get(usage.first);
532     const auto& shape = t.shape;
533     const auto& descriptor = t.descriptor;
534     const size_t element_size =
535         descriptor.data_type == DataType::FLOAT32 ? 4 : 2;
536     const size_t buffer_size =
537         shape.b * shape.w * shape.h * AlignByN(shape.c, 4) * element_size;
538     graph_ids_to_shared_buffer_tensors_[usage.first] =
539         buffer_usage_records.size();
540     buffer_usage_records.push_back({buffer_size,
541                                     static_cast<TaskId>(usage.second.x),
542                                     static_cast<TaskId>(usage.second.y)});
543   }
544 
545   ObjectsAssignment<size_t> buffer_assignment;
546   RETURN_IF_ERROR(AssignObjectsToTensors(
547       buffer_usage_records, MemoryStrategy::GREEDY_BEST, &buffer_assignment));
548 
549   shared_buffers_.resize(buffer_assignment.object_sizes.size());
550   for (int i = 0; i < buffer_assignment.object_sizes.size(); ++i) {
551     RETURN_IF_ERROR(CreateReadWriteBuffer(buffer_assignment.object_sizes[i],
552                                           context, &shared_buffers_[i]));
553   }
554 
555   std::vector<bool> created_tensors(buffer_usage_records.size(), false);
556   shared_buffer_tensors_.resize(buffer_usage_records.size());
557   for (auto& node : nodes_) {
558     auto tensors = GetCLNodeTensors(node);
559     for (auto& t : tensors) {
560       if (GetTensorMemoryType(t.first) != TensorMemoryType::kBuffer) continue;
561       const int tensor_index = graph_ids_to_shared_buffer_tensors_[t.first];
562       if (created_tensors[tensor_index]) continue;
563       const auto& shape = tensor_reserver_.Get(t.first).shape;
564       const int buffer_index = buffer_assignment.object_ids[tensor_index];
565       RETURN_IF_ERROR(CreateSharedTensor(
566           *context, shared_buffers_[buffer_index].GetMemoryPtr(), shape,
567           t.second, &shared_buffer_tensors_[tensor_index]));
568       created_tensors[tensor_index] = true;
569     }
570   }
571   return absl::OkStatus();
572 }
573 
AllocateMemoryForStrongShapes(CLContext * context)574 absl::Status InferenceContext::AllocateMemoryForStrongShapes(
575     CLContext* context) {
576   std::map<ValueId, int2> usages;
577   GetUsages(
578       [this](ValueId id) {
579         return GetTensorMemoryType(id) == TensorMemoryType::kStrongShape;
580       },
581       &usages);
582 
583   std::vector<TensorUsageRecord<DummyTensor>> usage_records;
584   std::map<ValueId, ValueId> remap_from_graph_ids;
585   for (auto& usage : usages) {
586     remap_from_graph_ids[usage.first] = usage_records.size();
587     usage_records.push_back({tensor_reserver_.Get(usage.first),
588                              static_cast<TaskId>(usage.second.x),
589                              static_cast<TaskId>(usage.second.y)});
590   }
591 
592   ObjectsAssignment<DummyTensor> assignment;
593   RETURN_IF_ERROR(AssignObjectsToTensors(
594       usage_records, MemoryStrategy::EQUALITY, &assignment));
595 
596   for (auto& node : nodes_) {
597     auto tensors = GetCLNodeTensors(node);
598     for (auto& t : tensors) {
599       if (GetTensorMemoryType(t.first) != TensorMemoryType::kStrongShape) {
600         continue;
601       }
602       const auto& shape = tensor_reserver_.Get(t.first).shape;
603       const auto id = assignment.object_ids[remap_from_graph_ids[t.first]];
604       graph_ids_to_strong_shape_tensors_[t.first] = id;
605       const auto& it = strong_shape_tensors_.find(id);
606       if (it == strong_shape_tensors_.end()) {
607         RETURN_IF_ERROR(CreateTensor(*context, shape, t.second,
608                                      &strong_shape_tensors_[id]));
609       }
610     }
611   }
612   return absl::OkStatus();
613 }
614 
BindMemoryToOperations()615 void InferenceContext::BindMemoryToOperations() {
616   for (auto& node : nodes_) {
617     for (int i = 0; i < node.inputs.size(); ++i) {
618       node.cl_operation.GetGpuOperation().SetSrc(GetTensor(node.inputs[i]), i);
619     }
620     for (int i = 0; i < node.outputs.size(); ++i) {
621       node.cl_operation.GetGpuOperation().SetDst(GetTensor(node.outputs[i]), i);
622     }
623   }
624 }
625 
Compile(const CreationContext & creation_context)626 absl::Status InferenceContext::Compile(
627     const CreationContext& creation_context) {
628   for (auto& node : nodes_) {
629     RETURN_IF_ERROR(node.cl_operation.Compile(creation_context));
630   }
631   return absl::OkStatus();
632 }
633 
Tune(TuningType tuning_type,const GpuInfo & gpu_info,ProfilingCommandQueue * profiling_queue)634 absl::Status InferenceContext::Tune(TuningType tuning_type,
635                                     const GpuInfo& gpu_info,
636                                     ProfilingCommandQueue* profiling_queue) {
637   for (auto& node : nodes_) {
638     RETURN_IF_ERROR(
639         node.cl_operation.Tune(tuning_type, gpu_info, profiling_queue));
640   }
641   return absl::OkStatus();
642 }
643 
UpdateParams()644 absl::Status InferenceContext::UpdateParams() {
645   for (auto& node : nodes_) {
646     RETURN_IF_ERROR(node.cl_operation.UpdateParams());
647   }
648   return absl::OkStatus();
649 }
650 
AddToQueue(CLCommandQueue * queue)651 absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
652   if (need_manual_release_) {
653     if (prev_enqueue_start_point_.is_valid()) {
654       prev_enqueue_start_point_.Wait();
655     }
656     RETURN_IF_ERROR(queue->EnqueueEvent(&prev_enqueue_start_point_));
657   }
658   int counter = 0;
659   for (auto& node : nodes_) {
660     RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue));
661     counter++;
662     if (flush_periodically_ && counter % flush_period_ == 0) {
663       clFlush(queue->queue());
664     }
665   }
666   if (need_flush_) {
667     clFlush(queue->queue());
668   }
669   return absl::OkStatus();
670 }
671 
Profile(ProfilingCommandQueue * queue,ProfilingInfo * result)672 absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
673                                        ProfilingInfo* result) {
674   queue->ResetMeasurements();
675   for (auto& node : nodes_) {
676     queue->SetEventsLabel(node.name);
677     RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue));
678   }
679   RETURN_IF_ERROR(queue->WaitForCompletion());
680   *result = queue->GetProfilingInfo();
681   return absl::OkStatus();
682 }
683 
GetSizeOfMemoryAllocatedForIntermediateTensors() const684 uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors()
685     const {
686   uint64_t total_memory = 0;
687   for (const auto& t : strong_shape_tensors_) {
688     total_memory += t.second.GetMemorySizeInBytes();
689   }
690   for (const auto& b : shared_buffers_) {
691     total_memory += b.GetMemorySizeInBytes();
692   }
693   for (const auto& t : variable_tensors_) {
694     total_memory += t.second.GetMemorySizeInBytes();
695   }
696 
697   return total_memory;
698 }
699 
GetTensor(ValueId id)700 Tensor* InferenceContext::GetTensor(ValueId id) {
701   if (const_tensors_.find(id) != const_tensors_.end()) {
702     return &const_tensors_[id];
703   } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
704     return &variable_tensors_[variable_ids_and_refs_[id]];
705   } else if (graph_ids_to_shared_buffer_tensors_.find(id) !=
706              graph_ids_to_shared_buffer_tensors_.end()) {
707     return &shared_buffer_tensors_[graph_ids_to_shared_buffer_tensors_[id]];
708   } else {
709     return &strong_shape_tensors_[graph_ids_to_strong_shape_tensors_[id]];
710   }
711 }
712 
SetInputTensor(ValueId id,const TensorFloat32 & tensor,CLCommandQueue * queue)713 absl::Status InferenceContext::SetInputTensor(ValueId id,
714                                               const TensorFloat32& tensor,
715                                               CLCommandQueue* queue) {
716   return GetTensor(id)->WriteData(queue, tensor);
717 }
718 
GetOutputTensor(ValueId id,CLCommandQueue * queue,TensorFloat32 * result)719 absl::Status InferenceContext::GetOutputTensor(ValueId id,
720                                                CLCommandQueue* queue,
721                                                TensorFloat32* result) {
722   const auto& gpu_tensor = *GetTensor(id);
723   const auto dst_shape = BHWC(gpu_tensor.Batch(), gpu_tensor.Height(),
724                               gpu_tensor.Width(), gpu_tensor.Channels());
725   result->id = id;
726   result->shape = dst_shape;
727   result->data.resize(dst_shape.DimensionsProduct());
728   return gpu_tensor.ReadData(queue, result);
729 }
730 
ReleaseCPURepresentation()731 void InferenceContext::ReleaseCPURepresentation() {
732   for (auto& node : nodes_) {
733     node.cl_operation.GetGpuOperation().args_.ReleaseCPURepresentation();
734   }
735   const_tensors_descs_.clear();
736 }
737 
RunGraphTransforms(GraphFloat32 * graph)738 absl::Status RunGraphTransforms(GraphFloat32* graph) {
739   auto merge_padding_transform = NewMergePaddingWithAdd();
740   auto add_bias_transform = NewAddBias();
741   auto pooling_to_reduce_op = NewGlobalPoolingToReduceOp();
742   ModelTransformer transformer(graph, /*reporter=*/nullptr);
743   if (!transformer.Apply("add_bias", add_bias_transform.get())) {
744     return absl::InternalError("Invalid add_bias transform");
745   }
746   if (!transformer.Apply("merge_padding", merge_padding_transform.get())) {
747     return absl::InternalError("Invalid merge_padding transform");
748   }
749   if (!transformer.Apply("global pooling to mean",
750                          pooling_to_reduce_op.get())) {
751     return absl::InternalError("Invalid global pooling to mean transform");
752   }
753   return absl::OkStatus();
754 }
755 
756 }  // namespace cl
757 }  // namespace gpu
758 }  // namespace tflite
759