/**
 * Copyright 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "run_tflite.h"

#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/kernels/register.h"

#include <android/log.h>
#include <dlfcn.h>
#include <sys/time.h>
#include <cassert>  // assert() used by the FATAL macro below
#include <cstdio>
#include <cstring>  // memcpy() used by setInput()

#define LOG_TAG "NN_BENCHMARK"

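// Logs at FATAL priority and then aborts via assert(). Note that assert() is
// compiled out when NDEBUG is defined, so the abort only happens in builds
// with assertions enabled; the fatal log line is emitted either way.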
#define FATAL(fmt, ...)                                                  \
  do {                                                                   \
    __android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__); \
    assert(false);                                                       \
  } while (0)

namespace {

long long currentTimeInUsec() {
  timeval tv;
  gettimeofday(&tv, NULL);
  // Multiply as long long so the microsecond conversion cannot overflow on
  // 32-bit builds.
  return ((tv.tv_sec * 1000000LL) + tv.tv_usec);
}

// Workaround for build systems that make it difficult to pick the correct NDK
// API level. NDK tracing methods are dynamically loaded from libandroid.so.
typedef void* (*fp_ATrace_beginSection)(const char* sectionName);
typedef void* (*fp_ATrace_endSection)();
struct TraceFunc {
  fp_ATrace_beginSection ATrace_beginSection;
  fp_ATrace_endSection ATrace_endSection;
};
TraceFunc setupTraceFunc() {
  void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == nullptr) {
    FATAL("unable to open libandroid.so");
  }
  return {
      reinterpret_cast<fp_ATrace_beginSection>(
          dlsym(lib, "ATrace_beginSection")),
      reinterpret_cast<fp_ATrace_endSection>(dlsym(lib, "ATrace_endSection"))};
}
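// Resolved once at static-initialization time; the resolved functions are used
// below to bracket each inference with systrace sections.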
static TraceFunc kTraceFunc{setupTraceFunc()};

}  // namespace

BenchmarkModel* BenchmarkModel::create(const char* modelfile, bool use_nnapi,
                                       bool enable_intermediate_tensors_dump,
                                       const char* nnapi_device_name) {
    BenchmarkModel* model = new BenchmarkModel();
    if (!model->init(modelfile, use_nnapi, enable_intermediate_tensors_dump,
                     nnapi_device_name)) {
      delete model;
      return nullptr;
    }
    return model;
}
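// Illustrative use (path and flags are hypothetical): a caller typically does
//   BenchmarkModel* model = BenchmarkModel::create(
//       "/data/local/tmp/model.tflite", /*use_nnapi=*/true,
//       /*enable_intermediate_tensors_dump=*/false,
//       /*nnapi_device_name=*/nullptr);
// and deletes the model once the benchmark run completes.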

bool BenchmarkModel::init(const char* modelfile, bool use_nnapi,
                          bool enable_intermediate_tensors_dump,
                          const char* nnapi_device_name) {
  __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "BenchmarkModel %s",
                      modelfile);

  // Memory-map the model. NOTE: the model buffer must outlive the
  // interpreter that is built from it.
  mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
  if (!mTfliteModel) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
                        modelfile);
    return false;
  }

  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
  if (!mTfliteInterpreter) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to create TFlite interpreter");
    return false;
  }

  if (enable_intermediate_tensors_dump) {
    // Make the output of every op a model output. This way we will be able to
    // fetch each intermediate tensor even when running with delegates.
    outputs.clear();
    for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
      auto node_outputs =
          mTfliteInterpreter->node_and_registration(node)->first.outputs;
      outputs.insert(outputs.end(), node_outputs->data,
                     node_outputs->data + node_outputs->size);
    }
    mTfliteInterpreter->SetOutputs(outputs);
  }

  // Allow relaxed FP16 computation for FP32 operations in all models.
  mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);

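  // When NNAPI is requested, hand the graph to the stateful NNAPI delegate.
  // An explicit accelerator name pins execution to that device; otherwise the
  // NNAPI runtime chooses the device itself.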
  if (use_nnapi) {
    if (nnapi_device_name != nullptr) {
      __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "Running NNAPI on device %s",
                          nnapi_device_name);
    }
    tflite::StatefulNnApiDelegate::Options nnapi_options;
    nnapi_options.accelerator_name = nnapi_device_name;
    mTfliteNnapiDelegate = std::make_unique<tflite::StatefulNnApiDelegate>(nnapi_options);
    if (mTfliteInterpreter->ModifyGraphWithDelegate(mTfliteNnapiDelegate.get()) != kTfLiteOk) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Failed to initialize NNAPI Delegate");
      return false;
    }
  }
  return true;
}

BenchmarkModel::BenchmarkModel() {}
BenchmarkModel::~BenchmarkModel() {}

bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
  int input = mTfliteInterpreter->inputs()[0];
  auto* input_tensor = mTfliteInterpreter->tensor(input);

  switch (input_tensor->type) {
    case kTfLiteFloat32:
    case kTfLiteUInt8: {
      void* raw = input_tensor->data.raw;
      memcpy(raw, dataPtr, length);
      break;
    }
    default:
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Input tensor type not supported");
      return false;
  }
  return true;
}
void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
                                         int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  auto& sink = result->inferenceOutputs[output_index];
  sink.insert(sink.end(), output_tensor->data.uint8,
              output_tensor->data.uint8 + output_tensor->bytes);
}

void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
                                    InferenceResult* result, int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  if (output_tensor->bytes != length) {
    FATAL("Wrong size of output tensor, expected %zu, is %zu",
          length, output_tensor->bytes);
  }

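  // Accumulate the sum of squared errors (for the mean-square-error metric)
  // and track the largest single-element error across the output tensor.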
  size_t elements_count = 0;
  float err_sum = 0.0;
  float max_error = 0.0;
  switch (output_tensor->type) {
    case kTfLiteUInt8: {
      uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
      elements_count = output_tensor->bytes;
      for (size_t i = 0; i < output_tensor->bytes; ++i) {
        float err = ((float)output_raw[i]) - ((float)expected_data[i]);
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    case kTfLiteFloat32: {
      const float* expected = reinterpret_cast<const float*>(expected_data);
      float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
      elements_count = output_tensor->bytes / sizeof(float);
      for (size_t i = 0; i < output_tensor->bytes / sizeof(float); ++i) {
        float err = output_raw[i] - expected[i];
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    default:
      FATAL("Output tensor type %d not supported", output_tensor->type);
  }
  result->meanSquareErrors[output_index] = err_sum / elements_count;
  result->maxSingleErrors[output_index] = max_error;
}

bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
  // The benchmark expects only a single input tensor, hardcoded at index 0.
  int input = mTfliteInterpreter->inputs()[0];
  mTfliteInterpreter->ResizeInputTensor(input, shape);
  if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to allocate tensors!");
    return false;
  }
  return true;
}
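// Illustrative call (the shape is hypothetical): for a 224x224 RGB
// image-classification model one might use
//   model->resizeInputTensors({1, 224, 224, 3});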

bool BenchmarkModel::runInference() {
  auto status = mTfliteInterpreter->Invoke();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to invoke: %d!",
                        (int)status);
    return false;
  }
  return true;
}

bool BenchmarkModel::resetStates() {
  auto status = mTfliteInterpreter->ResetVariableTensors();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to reset variable tensors: %d!", (int)status);
    return false;
  }
  return true;
}

bool BenchmarkModel::benchmark(
    const std::vector<InferenceInOutSequence>& inOutData,
    int seqInferencesMaxCount, float timeout, int flags,
    std::vector<InferenceResult>* results) {
  if (inOutData.empty()) {
    __android_log_print(ANDROID_LOG_WARN, LOG_TAG,
                        "Input/output vector is empty");
    return true;
  }

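  // Cycle through the provided input/output sequences, resetting variable
  // tensors before each sequence and timing each inference in microseconds;
  // stop early once the accumulated inference time exceeds the timeout.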
  float inferenceTotal = 0.0;
  for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
       ++seqInferenceIndex) {
    resetStates();

    const int inputOutputSequenceIndex = seqInferenceIndex % inOutData.size();
    const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];

      // For NNAPI systrace usage documentation, see
      // frameworks/ml/nn/common/include/Tracing.h.
      kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
      kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
      if (data.input) {
        setInput(data.input, data.input_size);
      } else {
        int input = mTfliteInterpreter->inputs()[0];
        auto* input_tensor = mTfliteInterpreter->tensor(input);
        if (!data.createInput((uint8_t*)input_tensor->data.raw,
                              input_tensor->bytes)) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Input creation %d failed", i);
          return false;
        }
      }
      kTraceFunc.ATrace_endSection();
      long long startTime = currentTimeInUsec();
      const bool success = runInference();
      kTraceFunc.ATrace_endSection();
      long long endTime = currentTimeInUsec();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      float inferenceTime =
          static_cast<float>(endTime - startTime) / 1000000.0f;
      size_t outputsCount = mTfliteInterpreter->outputs().size();
      InferenceResult result{
          inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
      result.meanSquareErrors.resize(outputsCount);
      result.maxSingleErrors.resize(outputsCount);
      result.inferenceOutputs.resize(outputsCount);

      if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
        if (outputsCount != data.outputs.size()) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Golden/actual outputs (%zu/%zu) count mismatch",
                              data.outputs.size(), outputsCount);
          return false;
        }
        for (int j = 0; j < outputsCount; ++j) {
          getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result, j);
        }
      }

      if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
        for (int j = 0; j < outputsCount; ++j) {
          saveInferenceOutput(&result, j);
        }
      }
      results->push_back(result);
      inferenceTotal += inferenceTime;
    }

    // Timeout?
    if (timeout > 0.001 && inferenceTotal > timeout) {
      return true;
    }
  }
  return true;
}

bool BenchmarkModel::dumpAllLayers(
    const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
  if (inOutData.empty()) {
    FATAL("Input/output vector is empty");
  }

  for (int seqInferenceIndex = 0; seqInferenceIndex < inOutData.size();
       ++seqInferenceIndex) {
    resetStates();

    const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];
      setInput(data.input, data.input_size);
      const bool success = runInference();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      // The tensors in `outputs` are listed in node-execution order, not
      // sorted by tensor index.
      for (int tensor_order = 0; tensor_order < outputs.size(); ++tensor_order) {
        int tensor_index = outputs[tensor_order];
        auto* output_tensor = mTfliteInterpreter->tensor(tensor_index);
        if (output_tensor->data.raw == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                      "output_tensor->data.raw == nullptr at index %d ", tensor_index);
          continue;
        }
        char fullpath[1024];
        snprintf(fullpath, 1024, "%s/dump_%.3d_seq_%.3d_order_%.3d_tensor_%.3d", path,
                 seqInferenceIndex, i, tensor_order, tensor_index);
        FILE* f = fopen(fullpath, "wb");
        if (f == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Failed to open %s for writing", fullpath);
          continue;
        }
        fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
        fclose(f);
      }
    }
  }
  return true;
}