/**
 * Copyright 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "run_tflite.h"

#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/kernels/register.h"

#include <android/log.h>
#include <dlfcn.h>
#include <sys/time.h>
#include <cassert>
#include <cstdio>
#include <cstring>

#define LOG_TAG "NN_BENCHMARK"

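// Logs a fatal message and aborts via assert(); used for unrecoverable
// benchmark setup errors.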
#define FATAL(fmt, ...)                                                   \
  do {                                                                    \
    __android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__);  \
    assert(false);                                                        \
  } while (0)

namespace {

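// Returns the current wall-clock time in microseconds; used to measure
// inference latency.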
long long currentTimeInUsec() {
  timeval tv;
  gettimeofday(&tv, NULL);
  return ((tv.tv_sec * 1000000LL) + tv.tv_usec);
}

// Workaround for build systems that make it difficult to pick the correct NDK
// API level: the NDK tracing methods are loaded dynamically from
// libandroid.so.
typedef void* (*fp_ATrace_beginSection)(const char* sectionName);
typedef void* (*fp_ATrace_endSection)();
struct TraceFunc {
  fp_ATrace_beginSection ATrace_beginSection;
  fp_ATrace_endSection ATrace_endSection;
};
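// Resolves ATrace_beginSection/ATrace_endSection from libandroid.so at
// runtime; aborts if the library cannot be opened.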
TraceFunc setupTraceFunc() {
  void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == nullptr) {
    FATAL("unable to open libandroid.so");
  }
  return {
      reinterpret_cast<fp_ATrace_beginSection>(
          dlsym(lib, "ATrace_beginSection")),
      reinterpret_cast<fp_ATrace_endSection>(dlsym(lib, "ATrace_endSection"))};
}
static TraceFunc kTraceFunc{setupTraceFunc()};

}  // namespace

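// Builds a BenchmarkModel from a .tflite file; returns nullptr if
// initialization fails.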
BenchmarkModel* BenchmarkModel::create(const char* modelfile, bool use_nnapi,
                                       bool enable_intermediate_tensors_dump,
                                       const char* nnapi_device_name) {
  BenchmarkModel* model = new BenchmarkModel();
  if (!model->init(modelfile, use_nnapi, enable_intermediate_tensors_dump,
                   nnapi_device_name)) {
    delete model;
    return nullptr;
  }
  return model;
}

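// Loads the model, creates the TFLite interpreter and, when requested,
// registers every intermediate tensor as a model output and/or applies the
// NNAPI delegate.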
bool BenchmarkModel::init(const char* modelfile, bool use_nnapi,
                          bool enable_intermediate_tensors_dump,
                          const char* nnapi_device_name) {
  __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "BenchmarkModel %s",
                      modelfile);

  // Memory-map the model. NOTE: the model must outlive the interpreter
  // context.
  mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
  if (!mTfliteModel) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
                        modelfile);
    return false;
  }

  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
  if (!mTfliteInterpreter) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to create TFlite interpreter");
    return false;
  }
  if (enable_intermediate_tensors_dump) {
    // Make the output of every op a model output so that each intermediate
    // tensor can be fetched even when running with delegates.
    outputs.clear();
    for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
      auto node_outputs =
          mTfliteInterpreter->node_and_registration(node)->first.outputs;
      outputs.insert(outputs.end(), node_outputs->data,
                     node_outputs->data + node_outputs->size);
    }
    mTfliteInterpreter->SetOutputs(outputs);
  }

  // Allow Fp16 precision for all models.
  mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);

  if (use_nnapi) {
    if (nnapi_device_name != nullptr) {
      __android_log_print(ANDROID_LOG_INFO, LOG_TAG,
                          "Running NNAPI on device %s", nnapi_device_name);
    }
    tflite::StatefulNnApiDelegate::Options nnapi_options;
    nnapi_options.accelerator_name = nnapi_device_name;
    mTfliteNnapiDelegate =
        std::make_unique<tflite::StatefulNnApiDelegate>(nnapi_options);
    if (mTfliteInterpreter->ModifyGraphWithDelegate(
            mTfliteNnapiDelegate.get()) != kTfLiteOk) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Failed to initialize NNAPI Delegate");
      return false;
    }
  }
  return true;
}

BenchmarkModel::BenchmarkModel() {}
BenchmarkModel::~BenchmarkModel() {}

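// Copies raw input data into the model's single input tensor (float32 or
// uint8).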
bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
  int input = mTfliteInterpreter->inputs()[0];
  auto* input_tensor = mTfliteInterpreter->tensor(input);

  switch (input_tensor->type) {
    case kTfLiteFloat32:
    case kTfLiteUInt8: {
      void* raw = input_tensor->data.raw;
      memcpy(raw, dataPtr, length);
      break;
    }
    default:
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Input tensor type not supported");
      return false;
  }
  return true;
}
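
// Appends the raw bytes of output tensor `output_index` to the result's
// inferenceOutputs entry.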
void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
                                         int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  auto& sink = result->inferenceOutputs[output_index];
  sink.insert(sink.end(), output_tensor->data.uint8,
              output_tensor->data.uint8 + output_tensor->bytes);
}

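// Compares an output tensor against golden data and stores the mean squared
// error and the maximum single-element error in the result.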
void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
                                    InferenceResult* result, int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  if (output_tensor->bytes != length) {
    FATAL("Wrong size of output tensor, expected %zu, is %zu",
          output_tensor->bytes, length);
  }

  size_t elements_count = 0;
  float err_sum = 0.0;
  float max_error = 0.0;
  switch (output_tensor->type) {
    case kTfLiteUInt8: {
      uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
      elements_count = output_tensor->bytes;
      for (size_t i = 0; i < output_tensor->bytes; ++i) {
        float err = ((float)output_raw[i]) - ((float)expected_data[i]);
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    case kTfLiteFloat32: {
      const float* expected = reinterpret_cast<const float*>(expected_data);
      float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
      elements_count = output_tensor->bytes / sizeof(float);
      for (size_t i = 0; i < elements_count; ++i) {
        float err = output_raw[i] - expected[i];
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    default:
      FATAL("Output tensor type %d not supported", output_tensor->type);
  }
  result->meanSquareErrors[output_index] = err_sum / elements_count;
  result->maxSingleErrors[output_index] = max_error;
}

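// Resizes the input tensor to the given shape and reallocates the
// interpreter's tensors.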
bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
  // The benchmark expects only a single input tensor, hardcoded at index 0.
  int input = mTfliteInterpreter->inputs()[0];
  mTfliteInterpreter->ResizeInputTensor(input, shape);
  if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to allocate tensors!");
    return false;
  }
  return true;
}

bool BenchmarkModel::runInference() {
  auto status = mTfliteInterpreter->Invoke();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to invoke: %d!",
                        (int)status);
    return false;
  }
  return true;
}

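// Resets the interpreter's variable (state) tensors so each sequence starts
// from a clean state.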
bool BenchmarkModel::resetStates() {
  auto status = mTfliteInterpreter->ResetVariableTensors();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to reset variable tensors: %d!", (int)status);
    return false;
  }
  return true;
}

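// Runs up to seqInferencesMaxCount input/output sequences, timing each
// inference, optionally comparing against golden outputs and saving the raw
// outputs, and stops early once the accumulated inference time exceeds
// `timeout` seconds.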
bool BenchmarkModel::benchmark(
    const std::vector<InferenceInOutSequence>& inOutData,
    int seqInferencesMaxCount, float timeout, int flags,
    std::vector<InferenceResult>* results) {
  if (inOutData.empty()) {
    __android_log_print(ANDROID_LOG_WARN, LOG_TAG,
                        "Input/output vector is empty");
    return true;
  }

  float inferenceTotal = 0.0;
  for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
       ++seqInferenceIndex) {
    resetStates();

    const int inputOutputSequenceIndex = seqInferenceIndex % inOutData.size();
    const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];

      // For NNAPI systrace usage documentation, see
      // frameworks/ml/nn/common/include/Tracing.h.
      kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
      kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
      if (data.input) {
        setInput(data.input, data.input_size);
      } else {
        int input = mTfliteInterpreter->inputs()[0];
        auto* input_tensor = mTfliteInterpreter->tensor(input);
        if (!data.createInput((uint8_t*)input_tensor->data.raw,
                              input_tensor->bytes)) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Input creation %d failed", i);
          return false;
        }
      }
      kTraceFunc.ATrace_endSection();
      long long startTime = currentTimeInUsec();
      const bool success = runInference();
      kTraceFunc.ATrace_endSection();
      long long endTime = currentTimeInUsec();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      float inferenceTime =
          static_cast<float>(endTime - startTime) / 1000000.0f;
      size_t outputsCount = mTfliteInterpreter->outputs().size();
      InferenceResult result{
          inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
      result.meanSquareErrors.resize(outputsCount);
      result.maxSingleErrors.resize(outputsCount);
      result.inferenceOutputs.resize(outputsCount);

      if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
        if (outputsCount != data.outputs.size()) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Golden/actual outputs (%zu/%zu) count mismatch",
                              data.outputs.size(), outputsCount);
          return false;
        }
        for (int j = 0; j < outputsCount; ++j) {
          getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result, j);
        }
      }

      if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
        for (int j = 0; j < outputsCount; ++j) {
          saveInferenceOutput(&result, j);
        }
      }
      results->push_back(result);
      inferenceTotal += inferenceTime;
    }

    // Stop once the accumulated inference time exceeds the requested timeout.
    if (timeout > 0.001 && inferenceTotal > timeout) {
      return true;
    }
  }
  return true;
}

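// Runs each input/output sequence once and dumps every intermediate tensor
// (registered as a model output in init()) to a separate file under `path`.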
bool BenchmarkModel::dumpAllLayers(
    const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
  if (inOutData.empty()) {
    FATAL("Input/output vector is empty");
  }

  for (int seqInferenceIndex = 0; seqInferenceIndex < inOutData.size();
       ++seqInferenceIndex) {
    resetStates();

    const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];
      setInput(data.input, data.input_size);
      const bool success = runInference();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      // The dumped tensors follow the order in `outputs`, which is not sorted
      // by tensor index.
      for (int tensor_order = 0; tensor_order < outputs.size();
           ++tensor_order) {
        int tensor_index = outputs[tensor_order];
        auto* output_tensor = mTfliteInterpreter->tensor(tensor_index);
        if (output_tensor->data.raw == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "output_tensor->data.raw == nullptr at index %d",
                              tensor_index);
          continue;
        }
        char fullpath[1024];
        snprintf(fullpath, 1024, "%s/dump_%.3d_seq_%.3d_order_%.3d_tensor_%.3d",
                 path, seqInferenceIndex, i, tensor_order, tensor_index);
        FILE* f = fopen(fullpath, "wb");
        if (f == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Failed to open %s for writing", fullpath);
          continue;
        }
        fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
        fclose(f);
      }
    }
  }
  return true;
}
