1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
16 #define TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
17 
18 #include <memory>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "tensorflow/lite/c/common.h"
24 #include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
25 #include "tensorflow/lite/nnapi/nnapi_implementation.h"
26 
// Opaque NNAPI memory handle type; this header only ever uses it through
// pointers, so a forward declaration is sufficient.
struct ANeuralNetworksMemory;
28 
29 namespace tflite {
30 
// Forward declaration of the delegate kernel class. This header refers to it
// only through pointers, so the full definition is not required here.
namespace delegate {
namespace nnapi {
class NNAPIDelegateKernel;
}  // namespace nnapi
}  // namespace delegate

// Makes NNAPIDelegateKernel usable without qualification in the enclosing
// namespace; StatefulNnApiDelegate::Data refers to it that way.
using tflite::delegate::nnapi::NNAPIDelegateKernel;
38 
39 // TFliteDelegate to interface with NNAPI.
40 class StatefulNnApiDelegate : public TfLiteDelegate {
41  public:
42   // Encapsulates all options that are specific to NNAPI delegate.
43   struct Options {
44     // Preferred Power/perf trade-off. For more details please see
45     // ANeuralNetworksCompilation_setPreference documentation in :
46     // https://developer.android.com/ndk/reference/group/neural-networks.html
47     enum ExecutionPreference {
48       kUndefined = -1,
49       kLowPower = 0,
50       kFastSingleAnswer = 1,
51       kSustainedSpeed = 2,
52     };
53 
54     // Preferred Power/perf trade-off.
55     ExecutionPreference execution_preference = kUndefined;
56 
57     // Selected NNAPI accelerator with nul-terminated name.
58     // Default to nullptr, which implies the NNAPI default behavior: NNAPI
59     // runtime is allowed to use all available accelerators. If the selected
60     // accelerator cannot be found, NNAPI will not be used.
61     // It is the caller's responsibility to ensure the string is valid for the
62     // duration of the Options object lifetime.
63     const char* accelerator_name = nullptr;
64 
65     // The nul-terminated cache dir for NNAPI model.
66     // Default to nullptr, which implies the NNAPI will not try caching the
67     // compilation.
68     const char* cache_dir = nullptr;
69 
70     // The unique nul-terminated token string for NNAPI model.
71     // Default to nullptr, which implies the NNAPI will not try caching the
72     // compilation. It is the caller's responsibility to ensure there is no
73     // clash of the tokens.
74     // NOTE: when using compilation caching, it is not recommended to use the
75     // same delegate instance for multiple models.
76     const char* model_token = nullptr;
77 
78     // Whether to disallow NNAPI CPU usage. Only effective on Android 10 and
79     // above. The NNAPI CPU typically performs less well than built-in TfLite
80     // kernels, but allowing CPU allows partial acceleration of models. If this
81     // is set to true, NNAPI is only used if the whole model is accelerated.
82     bool disallow_nnapi_cpu = true;
83 
84     // Specifies the max number of partitions to delegate. A value <= 0 means
85     // no limit.
86     // If the delegation of the full set of supported nodes would generate a
87     // number of partition greater than this parameter, only
88     // <max_number_delegated_partitions> of them will be actually accelerated.
89     // The selection is currently done sorting partitions in decreasing order
90     // of number of nodes and selecting them until the limit is reached.
91     int max_number_delegated_partitions = 3;
92 
93     // allow fp32 compuation to be run in fp16.
94     bool allow_fp16 = false;
95 
96     // Specifies the relative priority for executions of the model.
97     // Available values are {ANEURALNETWORKS_PRIORITY_LOW,
98     // ANEURALNETWORKS_PRIORITY_MEDIUM, ANEURALNETWORKS_PRIORITY_HIGH,
99     // ANEURALNETWORKS_PRIORITY_DEFAULT}.
100     int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT;
101 
102     // Specifies the maximum expected duration in nanosecond for compiling the
103     // model. If the device is not able to complete the compilation within the
104     // specified duration, the compilation may be aborted. If set to 0, the
105     // timeout duration is considered infinite.
106     uint64_t max_compilation_timeout_duration_ns = 0;
107 
108     // Specifies the maximum expected duration in nanosecond for executing the
109     // model. If the device is not able to complete the execution within the
110     // specified duration, the execution may be aborted. If set to 0, the
111     // timeout duration is considered infinite.
112     uint64_t max_execution_timeout_duration_ns = 0;
113 
114     // Specifies the maximum expected duration in nanosecond for WHILE loops in
115     // the execution. If a WHILE loop condition model does not output false
116     // within the specified duration, the execution will be aborted. If set to
117     // 0, the default timeout for loops will be used.
118     uint64_t max_execution_loop_timeout_duration_ns = 0;
119 
120     // Whether to allow dynamic dimension sizes without re-compilation.
121     // A tensor of with dynamic dimension must have a valid dim_signature
122     // defined.
123     // Only supported in NNAPI 1.1 and newer versions.
124     // WARNING: Setting this flag to true may result in model being rejected by
125     // accelerator. This should only be enabled if the target device supports
126     // dynamic dimensions of the model.
127     bool allow_dynamic_dimensions = false;
128   };
129 
130   // Uses default options.
131   StatefulNnApiDelegate();
132 
133   // The ownership of the NnApi instance is left to the caller of the
134   // StatefulNnApiDelegate constructor; the caller must ensure that the lifetime
135   // of the NnApi instance exceeds the lifetime of the StatefulNnApiDelegate.
136   explicit StatefulNnApiDelegate(const NnApi* nnapi);
137 
138   // The constructor that accepts options from user.
139   // This makes a copy of any data that it needs from Options, so
140   // the caller can safely deallocate any storage pointed to by
141   // the 'const char *' members of Options immediately after calling this.
142   explicit StatefulNnApiDelegate(Options options);
143 
144   // Constructor that accepts both an NnApi instance and options.
145   // The ownership of the NnApi instance is left to the caller of the
146   // StatefulNnApiDelegate constructor; the caller must ensure that the lifetime
147   // of the NnApi instance exceeds the lifetime of the StatefulNnApiDelegate.
148   // This constructor makes a copy of any data that it needs from Options, so
149   // the caller can safely deallocate any storage pointed to by
150   // the 'const char *' members of Options immediately after calling this.
151   StatefulNnApiDelegate(const NnApi* nnapi, Options options);
152 
153   ~StatefulNnApiDelegate() = default;
154 
155   // Returns the delegate options.
156   // The lifetime of the storage pointed to by the 'const char *' members of the
157   // returned Options object is the same as the lifetime of the supplied
158   // TfLiteDelegate instance.
159   static const Options GetOptions(TfLiteDelegate* delegate);
160 
161   // Callback function which copies data from ANeuralNetworksMemory to host
162   // tensor CPU buffer. It is the users responsibility to implement these
163   // callbacks for the specific types of shared memory they intend to use.
164   // WARNING: This is an experimental interface that is subject to change.
165   typedef TfLiteStatus (*CopyToHostTensorFnPtr)(TfLiteTensor* tensor,
166                                                 ANeuralNetworksMemory* memory,
167                                                 size_t memory_offset,
168                                                 size_t byte_size,
169                                                 void* callback_context);
170 
171   // Encapsulates all fields related to memory registration for internal
172   // bookkeeping only.
173   struct MemoryRegistration {
174     ANeuralNetworksMemory* memory;
175     CopyToHostTensorFnPtr callback;
176     void* callback_context;
177   };
178 
179   // Register the ANeuralNetworksMemory handle with the delegate. A
180   // TfLiteBufferHandle will be returned to be used with
181   // Interpreter::SetBufferHandle. The callback_context will be passed to the
182   // callback function when invoked.
183   // Note: the returned TfLiteBufferHandle can only be used with a single
184   // Interpreter instance. However, the caller can register the same memory
185   // multiple times to get different handles to use with difference Interpreter
186   // instances
187   // WARNING: This is an experimental interface that is subject to change.
188   TfLiteBufferHandle RegisterNnapiMemory(ANeuralNetworksMemory* memory,
189                                          CopyToHostTensorFnPtr callback,
190                                          void* callback_context);
191 
192   // Returns the vector of known ANeuralNetworksMemory handles.
193   // Note: this function is not intended to be called by developers.
194   // WARNING: This is an experimental interface that is subject to change.
195   static const std::vector<MemoryRegistration>& GetTensorMemoryMap(
196       TfLiteDelegate* delegate);
197 
198   // Returns the int value of the ResultCode returned by the latest
199   // failed call to NNAPI, if any. Zero only in case of NO failed calls since
200   // the construction of this instance of StatefulNnApiDelegate.
201   // The error code is reset when the delegate is re-initialized
202   // (i.e. when calling interpreter.ModifyGraphWithDelegate(delegate)).
203   int GetNnApiErrno() const;
204 
205  private:
206   // Encapsulates all delegate data.
207   struct Data {
208     // Pointer to NNAPI implementation to be used by this delegate as
209     // set when building the StatefulNnApiDelegate instance.
210     // Will generally be the NnApiInstance() singleton but can be overridden
211     // for testing or for users needing to wrap or stub parts of NNAPI.
212     // The ownership of the nnapi instance is left to the caller of
213     // the StatefulNnApiDelegate constructor.
214     const NnApi* nnapi;
215     // Preferred Power/perf trade-off.
216     Options::ExecutionPreference execution_preference;
217     // Selected NNAPI accelerator name.
218     std::string accelerator_name;
219     // The cache dir for NNAPI model.
220     std::string cache_dir;
221     // The unique token string for NNAPI model.
222     std::string model_token;
223     // Whether to disallow NNAPI CPU.
224     bool disallow_nnapi_cpu;
225     // Tensor to ANeuralNetworksMemory mapping.
226     std::vector<MemoryRegistration> tensor_memory_map;
227     // Contains a non zero value if any NNAPI method call
228     // operation returned a non zero result code.
229     int nnapi_errno = ANEURALNETWORKS_NO_ERROR;
230     // Cache of kernels already built in StatefulNnApiDelegate::DoPrepare
231     // when trying to understand if all nodes are supported by the target
232     // accelerators.
233     // The key is the index of the first node in the partition.
234     // Couldn't use unique_ptr because of problems building on gcc
235     std::unordered_map<int, NNAPIDelegateKernel*> delegate_state_cache;
236     // Maximum number of NNAPI partition to delegate. Zero or negative means
237     // no limit. Copied from StatefulNnApiDelegate::Options
238     int max_number_delegated_partitions;
239     // allow fp32 computation to be run in fp16.
240     bool allow_fp16;
241     // Specifies the relative priority for executions of the model.
242     int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT;
243     // Specifies the maximum expected duration in nanosecond for compiling the
244     // model.
245     uint64_t max_compilation_timeout_duration_ns = 0;
246     // Specifies the maximum expected duration in nanosecond for executing the
247     // model.
248     uint64_t max_execution_timeout_duration_ns = 0;
249     // Specifies the maximum expected duration in nanosecond for WHILE loops in
250     // the execution
251     uint64_t max_execution_loop_timeout_duration_ns = 0;
252     // Whether to allow dynamic dimension sizes without re-compilation.
253     bool allow_dynamic_dimensions = false;
254 
255     explicit Data(const NnApi* nnapi);
256     ~Data();
257 
258     // Caches an initialised NNAPIDelegateKernel.
259     void CacheDelegateKernel(const TfLiteDelegateParams* delegate_params,
260                              NNAPIDelegateKernel* delegate_state);
261     // Returns a cached NNAPIDelegateKernel if available and removes it
262     // from the cache transferring the ownership to the caller.
263     NNAPIDelegateKernel* MaybeGetCachedDelegateKernel(
264         const TfLiteDelegateParams* delegate_params);
265   };
266 
267   // Implements TfLiteDelegate::Prepare. Please refer to TFLiteDelegate
268   // documentation for more info.
269   static TfLiteStatus DoPrepare(TfLiteContext* context,
270                                 TfLiteDelegate* delegate);
271 
272   // Copy the data from delegate buffer handle into raw memory of the given
273   // 'tensor'. The delegate is allowed to allocate the raw
274   // bytes as long as it follows the rules for kTfLiteDynamic tensors.
275   static TfLiteStatus DoCopyFromBufferHandle(TfLiteContext* context,
276                                              TfLiteDelegate* delegate,
277                                              TfLiteBufferHandle buffer_handle,
278                                              TfLiteTensor* tensor);
279 
280   // Copy the data from raw memory of the given 'tensor' to delegate buffer
281   // handle. Currently this function is not supported, and calling the function
282   // will result in an error.
283   static TfLiteStatus DoCopyToBufferHandle(TfLiteContext* context,
284                                            TfLiteDelegate* delegate,
285                                            TfLiteBufferHandle buffer_handle,
286                                            TfLiteTensor* tensor);
287 
288   // Free the Delegate Buffer Handle. Note: This only frees the handle, but
289   // this doesn't release the underlying resource (e.g. textures). The
290   // resources are either owned by application layer or the delegate.
291   static void DoFreeBufferHandle(TfLiteContext* context,
292                                  TfLiteDelegate* delegate,
293                                  TfLiteBufferHandle* handle);
294 
295   // Returns the nodes that can be delegated via NNAPI to the accelerator
296   // specified in the delegate options and information about the way the
297   // graph will be partitioned if the supported nodes will be delegated.
298   // Partition information is composed by the number of partitions and
299   // the delegate parameters associated to each partition.
300   // The method also caches in delegate->data the NNApiDelegateKernel instances
301   // that have been created during the device evaluation.
302   // All arguments are expected to be non-null.
303   static TfLiteStatus GetNodesSupportedByAccelerator(
304       TfLiteContext* context, TfLiteDelegate* delegate, const NnApi* nnapi,
305       const std::vector<int>& supported_nodes,
306       std::vector<int>* device_supported_nodes, int* num_partitions,
307       TfLiteDelegateParams** params_array, int* nnapi_errno);
308 
309   // Alters the given array of nodes_to_delegate to limit the number of NNAPI
310   // owned partition to be less or equal than num_partitions. If num_partitions
311   // is less or equal to zero the input is left unaltered.
312   // The nodes_to_delegate array is expected to contain at element 0 the number
313   // of nodes to delegate and in remaining elements the set of nodes
314   // that would be delegated to NNAPI if this function wouldn't be
315   // called. It will be altered storing in the first element the count of
316   // nodes to actually delegate and in the remainder of the array the indexes.
317   // The params_array params might be altered during the functions execution.
318   static TfLiteStatus LimitDelegatedPartitions(
319       int max_partitions,
320       std::vector<TfLiteDelegateParams> partition_params_array,
321       std::vector<int>* nodes_to_delegate);
322 
323   // Delegate data presented through TfLiteDelegate::data_.
324   Data delegate_data_;
325 };
326 
// DEPRECATED: Please use StatefulNnApiDelegate class instead.
//
// Returns a singleton delegate that can be used to use the NN API.
// e.g.
//   TfLiteDelegate* delegate = NnApiDelegate();
//   interpreter->ModifyGraphWithDelegate(delegate);
// NnApiDelegate() returns a singleton, so you should not free this
// pointer or worry about its lifetime.
TfLiteDelegate* NnApiDelegate();
336 
337 }  // namespace tflite
338 
339 #endif  // TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
340