1 /**
2  * Copyright 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include "run_tflite.h"

#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/kernels/register.h"

#include <android/log.h>
#include <dlfcn.h>
#include <sys/time.h>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <utility>
#include <vector>
26 
// Tag passed to every __android_log_print() call in this file.
#define LOG_TAG "NN_BENCHMARK"

// Logs a printf-style message at ANDROID_LOG_FATAL priority under LOG_TAG and
// then evaluates assert(false).
// NOTE(review): assert() expands to nothing when NDEBUG is defined, so in such
// builds execution continues after FATAL(). Call sites must not rely on this
// macro never returning.
#define FATAL(fmt, ...)                                                  \
  do {                                                                   \
    __android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__); \
    assert(false);                                                       \
  } while (0)
34 
35 namespace {
36 
// Returns the time reported by gettimeofday() as a single microsecond count
// (seconds * 1'000'000 + microseconds).
long long currentTimeInUsec() {
  timeval tv;
  gettimeofday(&tv, nullptr);
  // Widen to long long *before* multiplying. tv_sec and a `1000000L` literal
  // can both be 32-bit longs (e.g. on 32-bit ABIs), in which case the product
  // of a present-day epoch second count and 1'000'000 overflows.
  return static_cast<long long>(tv.tv_sec) * 1000000LL +
         static_cast<long long>(tv.tv_usec);
}
42 
// Workaround for build systems that make it difficult to pick the correct NDK
// API level: the NDK tracing functions are loaded dynamically from
// libandroid.so instead of being linked directly.
//
// The pointer types below follow the NDK declarations in <android/trace.h>,
// where both ATrace_beginSection and ATrace_endSection return void. Declaring
// them with a different return type and calling through that pointer would be
// undefined behaviour.
using fp_ATrace_beginSection = void (*)(const char* sectionName);
using fp_ATrace_endSection = void (*)();

// Bundle of the dynamically resolved tracing entry points.
struct TraceFunc {
  // Opens a named trace section.
  fp_ATrace_beginSection ATrace_beginSection;
  // Closes the most recently opened trace section.
  fp_ATrace_endSection ATrace_endSection;
};
setupTraceFunc()51 TraceFunc setupTraceFunc() {
52   void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
53   if (lib == nullptr) {
54     FATAL("unable to open libandroid.so");
55   }
56   return {
57       reinterpret_cast<fp_ATrace_beginSection>(
58           dlsym(lib, "ATrace_beginSection")),
59       reinterpret_cast<fp_ATrace_endSection>(dlsym(lib, "ATrace_endSection"))};
60 }
61 static TraceFunc kTraceFunc{setupTraceFunc()};
62 
63 }  // namespace
64 
create(const char * modelfile,bool use_nnapi,bool enable_intermediate_tensors_dump,const char * nnapi_device_name)65 BenchmarkModel* BenchmarkModel::create(const char* modelfile, bool use_nnapi,
66                                        bool enable_intermediate_tensors_dump,
67                                        const char* nnapi_device_name) {
68     BenchmarkModel* model = new BenchmarkModel();
69     if (!model->init(modelfile, use_nnapi, enable_intermediate_tensors_dump,
70                      nnapi_device_name)) {
71       delete model;
72       return nullptr;
73     }
74     return model;
75 }
76 
init(const char * modelfile,bool use_nnapi,bool enable_intermediate_tensors_dump,const char * nnapi_device_name)77 bool BenchmarkModel::init(const char* modelfile, bool use_nnapi,
78                           bool enable_intermediate_tensors_dump,
79                           const char* nnapi_device_name) {
80   __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "BenchmarkModel %s",
81                       modelfile);
82 
83   // Memory map the model. NOTE this needs lifetime greater than or equal
84   // to interpreter context.
85   mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
86   if (!mTfliteModel) {
87     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
88                         modelfile);
89     return false;
90   }
91 
92   tflite::ops::builtin::BuiltinOpResolver resolver;
93   tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
94   if (!mTfliteInterpreter) {
95     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
96                         "Failed to create TFlite interpreter");
97     return false;
98   }
99 
100   if (enable_intermediate_tensors_dump) {
101     // Make output of every op a model output. This way we will be able to
102     // fetch each intermediate tensor when running with delegates.
103     std::vector<int> outputs;
104     for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
105       auto node_outputs =
106           mTfliteInterpreter->node_and_registration(node)->first.outputs;
107       outputs.insert(outputs.end(), node_outputs->data,
108                      node_outputs->data + node_outputs->size);
109     }
110     mTfliteInterpreter->SetOutputs(outputs);
111   }
112 
113   // Allow Fp16 precision for all models
114   mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);
115 
116   if (use_nnapi) {
117     if (nnapi_device_name != nullptr) {
118       __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "Running NNAPI on device %s",
119                           nnapi_device_name);
120     }
121     if (mTfliteInterpreter->ModifyGraphWithDelegate(
122             tflite::NnApiDelegate(nnapi_device_name)) != kTfLiteOk) {
123       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
124                           "Failed to initialize NNAPI Delegate");
125       return false;
126     }
127   }
128   return true;
129 }
130 
BenchmarkModel()131 BenchmarkModel::BenchmarkModel() {}
~BenchmarkModel()132 BenchmarkModel::~BenchmarkModel() {}
133 
setInput(const uint8_t * dataPtr,size_t length)134 bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
135   int input = mTfliteInterpreter->inputs()[0];
136   auto* input_tensor = mTfliteInterpreter->tensor(input);
137 
138   switch (input_tensor->type) {
139     case kTfLiteFloat32:
140     case kTfLiteUInt8: {
141       void* raw = input_tensor->data.raw;
142       memcpy(raw, dataPtr, length);
143       break;
144     }
145     default:
146       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
147                           "Input tensor type not supported");
148       return false;
149   }
150   return true;
151 }
saveInferenceOutput(InferenceResult * result,int output_index)152 void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
153                                          int output_index) {
154   int output = mTfliteInterpreter->outputs()[output_index];
155   auto* output_tensor = mTfliteInterpreter->tensor(output);
156   auto& sink = result->inferenceOutputs[output_index];
157   sink.insert(sink.end(), output_tensor->data.uint8,
158               output_tensor->data.uint8 + output_tensor->bytes);
159 }
160 
getOutputError(const uint8_t * expected_data,size_t length,InferenceResult * result,int output_index)161 void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
162                                     InferenceResult* result, int output_index) {
163   int output = mTfliteInterpreter->outputs()[output_index];
164   auto* output_tensor = mTfliteInterpreter->tensor(output);
165   if (output_tensor->bytes != length) {
166     FATAL("Wrong size of output tensor, expected %zu, is %zu",
167           output_tensor->bytes, length);
168   }
169 
170   size_t elements_count = 0;
171   float err_sum = 0.0;
172   float max_error = 0.0;
173   switch (output_tensor->type) {
174     case kTfLiteUInt8: {
175       uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
176       elements_count = output_tensor->bytes;
177       for (size_t i = 0; i < output_tensor->bytes; ++i) {
178         float err = ((float)output_raw[i]) - ((float)expected_data[i]);
179         if (err > max_error) max_error = err;
180         err_sum += err * err;
181       }
182       break;
183     }
184     case kTfLiteFloat32: {
185       const float* expected = reinterpret_cast<const float*>(expected_data);
186       float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
187       elements_count = output_tensor->bytes / sizeof(float);
188       for (size_t i = 0; i < output_tensor->bytes / sizeof(float); ++i) {
189         float err = output_raw[i] - expected[i];
190         if (err > max_error) max_error = err;
191         err_sum += err * err;
192       }
193       break;
194     }
195     default:
196       FATAL("Output sensor type %d not supported", output_tensor->type);
197   }
198   result->meanSquareErrors[output_index] = err_sum / elements_count;
199   result->maxSingleErrors[output_index] = max_error;
200 }
201 
resizeInputTensors(std::vector<int> shape)202 bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
203   // The benchmark only expects single input tensor, hardcoded as 0.
204   int input = mTfliteInterpreter->inputs()[0];
205   mTfliteInterpreter->ResizeInputTensor(input, shape);
206   if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
207     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
208                         "Failed to allocate tensors!");
209     return false;
210   }
211   return true;
212 }
213 
runInference()214 bool BenchmarkModel::runInference() {
215   auto status = mTfliteInterpreter->Invoke();
216   if (status != kTfLiteOk) {
217     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to invoke: %d!",
218                         (int)status);
219     return false;
220   }
221   return true;
222 }
223 
resetStates()224 bool BenchmarkModel::resetStates() {
225   auto status = mTfliteInterpreter->ResetVariableTensors();
226   if (status != kTfLiteOk) {
227     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
228                         "Failed to reset variable tensors: %d!", (int)status);
229     return false;
230   }
231   return true;
232 }
233 
benchmark(const std::vector<InferenceInOutSequence> & inOutData,int seqInferencesMaxCount,float timeout,int flags,std::vector<InferenceResult> * results)234 bool BenchmarkModel::benchmark(
235     const std::vector<InferenceInOutSequence>& inOutData,
236     int seqInferencesMaxCount, float timeout, int flags,
237     std::vector<InferenceResult>* results) {
238   if (inOutData.empty()) {
239     FATAL("Input/output vector is empty");
240   }
241 
242   float inferenceTotal = 0.0;
243   for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
244        ++seqInferenceIndex) {
245     resetStates();
246 
247     const int inputOutputSequenceIndex = seqInferenceIndex % inOutData.size();
248     const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
249     for (int i = 0; i < seq.size(); ++i) {
250       const InferenceInOut& data = seq[i];
251 
252       // For NNAPI systrace usage documentation, see
253       // frameworks/ml/nn/common/include/Tracing.h.
254       kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
255       kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
256       if (data.input) {
257         setInput(data.input, data.input_size);
258       } else {
259         int input = mTfliteInterpreter->inputs()[0];
260         auto* input_tensor = mTfliteInterpreter->tensor(input);
261         if (!data.createInput((uint8_t*)input_tensor->data.raw,
262                               input_tensor->bytes)) {
263           __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
264                               "Input creation %d failed", i);
265           return false;
266         }
267       }
268       kTraceFunc.ATrace_endSection();
269       long long startTime = currentTimeInUsec();
270       const bool success = runInference();
271       kTraceFunc.ATrace_endSection();
272       long long endTime = currentTimeInUsec();
273       if (!success) {
274         __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
275                             i);
276         return false;
277       }
278 
279       float inferenceTime =
280           static_cast<float>(endTime - startTime) / 1000000.0f;
281       size_t outputsCount = mTfliteInterpreter->outputs().size();
282       InferenceResult result{
283           inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
284       result.meanSquareErrors.resize(outputsCount);
285       result.maxSingleErrors.resize(outputsCount);
286       result.inferenceOutputs.resize(outputsCount);
287 
288       if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
289         if (outputsCount != data.outputs.size()) {
290           __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
291                               "Golden/actual outputs (%zu/%zu) count mismatch",
292                               data.outputs.size(), outputsCount);
293           return false;
294         }
295         for (int j = 0; j < outputsCount; ++j) {
296           getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result, j);
297         }
298       }
299 
300       if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
301         for (int j = 0; j < outputsCount; ++j) {
302           saveInferenceOutput(&result, j);
303         }
304       }
305       results->push_back(result);
306       inferenceTotal += inferenceTime;
307     }
308 
309     // Timeout?
310     if (timeout > 0.001 && inferenceTotal > timeout) {
311       return true;
312     }
313   }
314   return true;
315 }
316 
dumpAllLayers(const char * path,const std::vector<InferenceInOutSequence> & inOutData)317 bool BenchmarkModel::dumpAllLayers(
318     const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
319   if (inOutData.empty()) {
320     FATAL("Input/output vector is empty");
321   }
322 
323   for (int seqInferenceIndex = 0; seqInferenceIndex < inOutData.size();
324        ++seqInferenceIndex) {
325     resetStates();
326 
327     const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
328     for (int i = 0; i < seq.size(); ++i) {
329       const InferenceInOut& data = seq[i];
330       setInput(data.input, data.input_size);
331       const bool success = runInference();
332       if (!success) {
333         __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
334                             i);
335         return false;
336       }
337 
338       for (int tensor = 0; tensor < mTfliteInterpreter->tensors_size();
339            ++tensor) {
340         auto* output_tensor = mTfliteInterpreter->tensor(tensor);
341         if (output_tensor->data.raw == nullptr) {
342           continue;
343         }
344         char fullpath[1024];
345         snprintf(fullpath, 1024, "%s/dump_%.3d_seq_%.3d_tensor_%.3d", path,
346                  seqInferenceIndex, i, tensor);
347         FILE* f = fopen(fullpath, "wb");
348         fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
349         fclose(f);
350       }
351     }
352   }
353   return true;
354 }
355