/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "Utils.h"

#include <algorithm>
#include <functional>
#include <map>
#include <queue>
#include <unordered_set>
#include <utility>
#include <vector>

using ::android::hardware::neuralnetworks::V1_0::implementation::ExecutionCallback;
using ::android::hardware::neuralnetworks::V1_0::implementation::PreparedModelCallback;

namespace android {
namespace nn {

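// Compiles the given model on the given device, returning the prepared model
// through *preparedModel. device must not be nullptr: a nullptr device
// denotes the CPU path, which never goes through driver compilation.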
static int compile(std::shared_ptr<Device> device, const ModelBuilder* model,
                   int32_t executionPreference, sp<IPreparedModel>* preparedModel) {
    nnAssert(device != nullptr);  // nullptr indicates CPU
    // Compilation logic copied from ExecutionBuilder::startComputeOnDevice().
    Model hidlModel;
    model->setHidlModel(&hidlModel);

    sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
    Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
        hidlModel, static_cast<ExecutionPreference>(executionPreference), preparedModelCallback);
    if (!prepareLaunchStatus.isOk()) {
        LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed due to transport error: "
                   << prepareLaunchStatus.description();
        return ANEURALNETWORKS_OP_FAILED;
    }
    if (prepareLaunchStatus != ErrorStatus::NONE) {
        LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed with error: "
                   << toString(static_cast<ErrorStatus>(prepareLaunchStatus));
        return ANEURALNETWORKS_OP_FAILED;
    }

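    // prepareModel() is asynchronous: prepareLaunchStatus only covers the
    // launch of the call. Block until the driver reports completion through
    // the callback, then check the actual compilation result.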
    preparedModelCallback->wait();
    ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus();
    *preparedModel = preparedModelCallback->getPreparedModel();
    if (prepareReturnStatus != ErrorStatus::NONE || *preparedModel == nullptr) {
        LOG(ERROR) << "ExecutionPlan compilation on " << device->getName() << " failed:"
                   << " prepareReturnStatus=" << toString(prepareReturnStatus)
                   << ", preparedModel=" << preparedModel->get();
        return ANEURALNETWORKS_OP_FAILED;
    }
    return ANEURALNETWORKS_NO_ERROR;
}

typedef std::function<void(uint32_t)> OperationReadyCallback;

// This class tracks whether we know the value of an operand as operations
// are processed.
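// Conceptually this is Kahn's topological-sort algorithm: each operation
// carries a count of inputs whose values are not yet known, and becomes
// "ready" once that count reaches zero.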
class OperandTracker {
public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away, and calls cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. Now that the
    // operation's outputs are known, other operations may become ready to
    // run; calls cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) :
        mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                lifetime == OperandLifeTime::MODEL_OUTPUT) {
                count++;
                mOperandToOperations.insert(
                        std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}
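// Typical use, as in ModelBuilder::partitionTheWork() below: construct the
// tracker with a callback that enqueues ready operations, then call
// markProcessed() as each operation is consumed so that its successors
// become ready in turn.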

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
                             std::shared_ptr<Device> device)
        : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device) {}

// Adds an operand if it has not been added already.
// Sets *toOperandIndex to the operand's index in the submodel.
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                              const ModelBuilder& fromModel, OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(fromOperandIndex);
    if (i != mOperandMap.end()) {
        nnAssert(kind == INPUT);
        *toOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *toOperandIndex = mSubModel.operandCount();
    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));

    // Add the operand to the submodel.
    const Operand& operand = fromModel.getOperand(fromOperandIndex);
    ANeuralNetworksOperandType type = {
        .type = static_cast<int32_t>(operand.type),
        .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
        .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
        .scale = operand.scale,
        .zeroPoint = operand.zeroPoint
    };
    int n = mSubModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Set its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
            n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
            n = mSubModel.setOperandValueFromMemory(*toOperandIndex, memory,
                                                    operand.location.offset,
                                                    operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE:  // handled similarly to MODEL_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
            }
            break;
        case OperandLifeTime::MODEL_INPUT:
            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        case OperandLifeTime::MODEL_OUTPUT:  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsSubModelInputs.push_back(
                        std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            }
            break;
        default:
            nnAssert(false);
            break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
    const Operation& operation = fromModel.getOperation(operationIndex);

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order.  Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);

    auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
                                          std::vector<uint32_t>& localOperands,
                                          OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
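            // ~0U is a sentinel "invalid index" value; a successful call to
            // addOperand() below overwrites it.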
            uint32_t localOperand = ~0U;
            int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
            if (n != ANEURALNETWORKS_NO_ERROR)
                return n;
            localOperands[i] = localOperand;
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    int n;
    if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
        (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }

    return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                  outputCount, outputs.data());
}

void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
    for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
    }
    for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
    }
}

void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
    for (const auto& step : mSteps) {
        for (const auto& input : step->getTempsAsSubModelInputs()) {
            const uint32_t fromModelIndex = input.first;
            const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
            nnAssert(it != mTemporaryToDefiningStep.end());
            const uint32_t stepIndex = it->second;
            nnAssert(stepIndex < mSteps.size());
            mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
        }
    }
}

void ExecutionStep::logSubModel() const {
    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;

    auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
        if (!toLog.empty()) {
            toLog += ", ";
        }
        toLog += "(";
        toLog += std::to_string(e.first);
        toLog += "->";
        toLog += std::to_string(e.second);
        toLog += ")";
    };

    auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
        std::string toLog;
        for (const auto& e : map) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };
    auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
        std::string toLog;
        for (const auto& e : set) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };

    logRemapVector("model inputs", mModelInputs);
    logRemapVector("model outputs", mModelOutputs);
    logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
    logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
    logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
}

static void convertModelInputsOrOutputs(
        // IN: mModel{Inputs|Outputs}
        const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
        // IN: fromModel->{input|output}Count()
        uint32_t                              fromModelInputOrOutputCount,
        // IN: fromModel->get{Input|Output}OperandIndex
        std::function<uint32_t(uint32_t)>     fromModelGetInputOrOutputOperandIndex,
        // OUT: for v : mModel{Inputs|Outputs} : v.second
        std::vector<uint32_t>*                inputsOrOutputs,
        // OUT: submodel input-or-output index to original model input-or-output index
        std::vector<uint32_t>*                inputOrOutputIndexSubModelToFromModel) {
    std::map<uint32_t, uint32_t> fromModelIndexMap;  // operand index to input-or-output index
    for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
        fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
    }
    for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
        inputsOrOutputs->push_back(myInputOrOutput.second);
        const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
        inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
    }
}

int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
                                  int32_t executionPreference) {
    if (VLOG_IS_ON(COMPILATION)) {
        logSubModel();
    }

    mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());

    // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
    // Output order: mModelOutputs, mTempsAsSubModelOutputs
    //
    // ExecutionPlan::next() depends on these orderings.

    std::vector<uint32_t> inputs;
    convertModelInputsOrOutputs(mModelInputs,
                                fromModel->inputCount(),
                                [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
                                &inputs,
                                &mInputIndexSubModelToFromModel);
    for (const auto& subModelInput : mTempsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }
    for (const auto& subModelInput : mOutputsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }

    std::vector<uint32_t> outputs;
    convertModelInputsOrOutputs(mModelOutputs,
                                fromModel->outputCount(),
                                [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
                                &outputs,
                                &mOutputIndexSubModelToFromModel);
    for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
        outputs.push_back(subModelOutput.second);
        const Operand& operand = mSubModel.getOperand(subModelOutput.second);
        for (uint32_t dimension : operand.dimensions) {
            if (dimension == 0) {
                *hasOutputOfUnknownSize = true;
                VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
                                  << " of original graph) has unknown size: "
                                  << toString(operand);
                break;
            }
        }
    }

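    // Freeze the submodel: declare its inputs and outputs, then finish() it
    // just as if it had been built directly through the API.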
    {
        int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0],
                                                   outputs.size(), &outputs[0]);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
        n = mSubModel.finish();
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
    }

    {
        // Compute mOutputsAsSubModelInputsIndexToFromModel.

        std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
        for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
            fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
        }

        for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
            const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
            const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
            if (it == fromModelOperandIndexToOutputIndex.end()) {
                LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
                           << " in main model output operand list";
                return ANEURALNETWORKS_BAD_STATE;
            }
            mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
        }
    }

    // TODO: Move compilation elsewhere?

    if (mDevice == nullptr) {
        return ANEURALNETWORKS_NO_ERROR;
    }

    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation";
    return compile(mDevice, &mSubModel, executionPreference, &mPreparedSubModel);
}

void ExecutionStep::dump() const {
    Model model;
    mSubModel.setHidlModel(&model);
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "ExecutionStep#" << mIndex
                          << " for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
        logModelToInfo(model);
    }
}

int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel,
                                        int32_t executionPreference) {
    findTempsAsSubModelOutputs();
    for (const auto& step : mSteps) {
        int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize,
                                     executionPreference);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
            return n;
        }
    }
    if (mHasSubModelOutputOfUnknownSize) {
        VLOG(COMPILATION)
                << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel,
                                      int32_t executionPreference) {
    if (mDevice == nullptr) {
        mSuccessfulFinish = true;
        return ANEURALNETWORKS_NO_ERROR;
    }

    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    const int n = compile(mDevice, mModel, executionPreference, &mPreparedModel);
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) {
    nnAssert(mBody != nullptr);
    return mBody->finish(fromModel, executionPreference);
}

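// A Controller walks the plan step by step on behalf of a single execution.
// It owns the shared Memory region used to pass temporaries between steps.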
ExecutionPlan::Controller::Controller(
    const ExecutionPlan* plan,
    const ExecutionBuilder* executionBuilder,
    std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
    uint32_t totalSizeOfTemporaries) :
        mPlan(plan), mExecutionBuilder(executionBuilder),
        mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) {
    if (totalSizeOfTemporaries) {
        if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
            mNextStepIndex = kBadStepIndex;
        }
    }
}

std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
    const ExecutionBuilder* executionBuilder) const {
    nnAssert((mState == EMPTY) == (mBody == nullptr));
    if (mBody && !mBody->mSuccessfulFinish) {
        VLOG(EXECUTION) << "ExecutionPlan::makeController -- unsuccessful finish";
        return std::shared_ptr<Controller>(nullptr);
    }

    // Create the layout for a Memory object big enough to hold
    // every TEMPORARY in the original model that is live across
    // partition boundaries.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
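    // Lay the temporaries out end to end, padding for alignment as needed,
    // and record each one's byte offset keyed by its operand index in the
    // original model.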
    uint32_t totalSizeOfTemporaries = 0;
    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
    if (mState == COMPOUND) {
        const ModelBuilder* fromModel = executionBuilder->getModel();
        for (const auto& step : compound()->mSteps) {
            for (const auto& output : step->getTempsAsSubModelOutputs()) {
                const uint32_t fromModelOperandIndex = output.first;
                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                if (subModelInputsAndOutputs == nullptr) {
                    subModelInputsAndOutputs =
                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
                }
                const uint32_t size = sizeOfData(fromModelOperand);
                totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                subModelInputsAndOutputs->insert(
                        std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                totalSizeOfTemporaries += size;
            }
        }
        if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
            for (const auto& io : *subModelInputsAndOutputs) {
                VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first
                                << ", offset = " << io.second;
            }
        }
    }

    return std::shared_ptr<Controller>(new Controller(this, executionBuilder,
                                                      subModelInputsAndOutputs,
                                                      totalSizeOfTemporaries));
}


// TODO: Find a better way to provide this functionality.
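// Backs up to the most recent step so the caller can retry it, typically on
// the CPU after a driver failed to execute the step: decrement mNextStepIndex
// and let next() rebuild the executor for that step.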
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == 0) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    --controller->mNextStepIndex;
    return next(controller, executor);
}

int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::next("
                    << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        nnAssert(controller->mNextStepIndex == 0);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            *executor = std::make_shared<StepExecutor>(
                controller->mExecutionBuilder,
                simpleBody->mModel,
                (simpleBody->mDevice == nullptr ? nullptr : simpleBody->mDevice->getInterface()),
                simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        nnAssert(controller->mNextStepIndex == 1);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    auto compoundBody = compound();

    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
    // Output order: model outputs, temps as submodel outputs
    //
    // ExecutionStep::finishSubModel() establishes these orderings.

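    // Because of those orderings, submodel output indexes
    // [0, mModelOutputs.size()) are real model outputs and the temporaries
    // follow them; likewise for inputs. The index arithmetic below depends
    // on this.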
    const auto step = compoundBody->mSteps[controller->mNextStepIndex];
    *executor = std::make_shared<StepExecutor>(
        controller->mExecutionBuilder,
        step->getSubModel(),
        (step->getDevice() == nullptr ? nullptr : step->getDevice()->getInterface()),
        step->getPreparedSubModel());
    step->mapInputsAndOutputs(*executor);
    if (controller->mSubModelInputsAndOutputs != nullptr) {
        {
            // Tell executor about temps as submodel outputs.

            const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
            const auto& subModelOutputs = step->getTempsAsSubModelOutputs();

            uint32_t idx = 0;
            for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                    controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setOutputFromTemporaryMemory(
                    firstSubModelOutputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
        {
            // Tell executor about temps as submodel inputs.

            const size_t firstSubModelInputIndex = step->getModelInputs().size();
            const auto& subModelInputs = step->getTempsAsSubModelInputs();

            uint32_t idx = 0;
            for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                    controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setInputFromTemporaryMemory(
                    firstSubModelInputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
    }
    {
        // Tell executor about outputs as submodel inputs.

        const size_t firstOutputsAsSubModelInputIndex =
                step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
        const auto& outputsAsSubModelInputsIndexToFromModel =
                step->getOutputsAsSubModelInputsIndexToFromModel();
        for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
            uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
            (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
        }
    }

    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}

std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
    nnAssert(mState != SIMPLE);
    if (mState == EMPTY) {
        mBody = new CompoundBody();
        mState = COMPOUND;
    }
    auto& steps = compound()->mSteps;
    auto step = std::make_shared<ExecutionStep>(this, steps.size(), device);
    steps.push_back(step);
    return step;
}

void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    nnAssert(mState == EMPTY);
    mBody = new SimpleBody(device, model);
    mState = SIMPLE;
}

void ExecutionPlan::dump() const {
    if (mBody) {
        mBody->dump();
    } else {
        VLOG(COMPILATION) << "EMPTY";
    }
}

ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            nnAssert(!"unexpected state");
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    nnAssert(mState == SIMPLE);
    return static_cast<const SimpleBody*>(mBody)->mDevice;
}

const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}

bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
    return mBody->hasSubModelOutputsOfUnknownSize();
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.
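    // In outline: pick the best device for each operation, then repeatedly
    // drain the per-device queues of ready operations, grouping each batch
    // for one device into a single ExecutionStep.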

    const size_t nonCpuDeviceCount = devices.size();
    // deviceCount is the number of HAL devices + 1, where the +1 is for the
    // CPU. The CPU has no entry in devices[].
    const size_t deviceCount = nonCpuDeviceCount + 1;
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                      << ", operationCount = " << operationCount;

    // If we only have the CPU, or if the graph has no operations, no need to try to partition.
    if (nonCpuDeviceCount == 0 || operationCount == 0) {
        // Make sure no op is an OEM operation.
        for (auto& op : mOperations) {
            if (op.type == OperationType::OEM_OPERATION) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        }
        plan->becomeSingleStep(nullptr /* CPU */, this);
        return plan->finish(this, preference);
    }

    // Figure out where each operation will best execute.
    // Each element of the vector is an index into the devices vector, with
    // devices.size() representing the CPU.
    std::vector<int> bestDeviceForOperation(operationCount);
    int status = findBestDeviceForEachOperation(preference, devices, deviceCount,
                                                &bestDeviceForOperation);
    if (status != ANEURALNETWORKS_NO_ERROR) {
        return status;
    }

    // If one device will run all the operations, we don't need to split the work.
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        const bool cpu = (size_t(bestDeviceIndex) == deviceCount - 1);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = "
                          << (cpu ? "CPU" : devices[bestDeviceIndex]->getName());
        plan->becomeSingleStep(cpu ? nullptr : devices[bestDeviceIndex], this);
        return plan->finish(this, preference);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;
        }
        // nullptr represents the CPU.
        std::shared_ptr<Device> device =
                static_cast<size_t>(deviceIndex) < nonCpuDeviceCount
                        ? devices[deviceIndex] : nullptr;

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(device);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            int n = step->addOperation(operationIndex, *this);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                return n;
            }
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this, preference);
    if (VLOG_IS_ON(COMPILATION)) {
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}

PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
                                                 uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch (operandType) {
        case OperandType::FLOAT32:
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return device->getRelaxedFloat32toFloat16Performance();
            } else {
                return device->getFloat32Performance();
            }
        case OperandType::INT32:
        case OperandType::UINT32:
        case OperandType::TENSOR_INT32:
        case OperandType::TENSOR_QUANT8_ASYMM:
            // For OEM types, the real selection will be made by which devices
            // can run the operation.
        case OperandType::OEM:
        case OperandType::TENSOR_OEM_BYTE:
            return device->getQuantized8Performance();
        default:
            nnAssert(false);
            return device->getQuantized8Performance();
    }
}

namespace {
// This class determines whether a given device can execute a given operation.
class CanDo {
public:
    CanDo() {}

    void initialize(const ModelBuilder* model, std::shared_ptr<Device> device) {
        Model hidlModel;
        model->setHidlModel(&hidlModel);
        device->getSupportedOperations(hidlModel, &mSupportsOperationByIndex);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

private:
    hidl_vec<bool> mSupportsOperationByIndex;
};
}  // anonymous namespace

int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference,
        const std::vector<std::shared_ptr<Device>>& devices,
        const size_t deviceCount,
        std::vector<int>* bestDeviceForOperation) const {

    // Note that deviceCount includes CPU, which has no entry in devices[].
    const size_t nonCpuDeviceCount = deviceCount - 1;

    std::vector<CanDo> canDo(nonCpuDeviceCount);
    for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(this, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        // Find which non-CPU device gives the best performance for this operation.
        int bestChoice = -1;
        float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
        for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
            const auto& device = devices[deviceIndex];
            if (canDo[deviceIndex].check(operationIndex)) {
                const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                const float perfVal =
                            (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
                                                                            : perf.execTime);
                if (bestChoice < 0 || perfVal < bestPerfVal) {
                    bestChoice = deviceIndex;
                    bestPerfVal = perfVal;
                }
            } else {
                // Somewhat noisy logging, but this is the only place where a
                // user of NNAPI can get feedback on why an operation was not
                // run on a specific device.
                // Logs O(operationCount * nonCpuDeviceCount) times, but
                // typically nonCpuDeviceCount is very small.
                VLOG(COMPILATION) << "Device " << device->getName()
                                  << " can't do operation "
                                  << toString(getOperation(operationIndex).type);
            }
        }
        // If it's the OEM op, we'd better have a device able to do it.
        if (mOperations[operationIndex].type == OperationType::OEM_OPERATION) {
            if (bestChoice < 0) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        } else {
            // If no driver has been found, or if the best driver is not better than the CPU,
            // prefer the CPU. Since performance is a ratio relative to CPU performance,
            // by definition the performance of the CPU is 1.0.
            if (bestChoice < 0 || bestPerfVal >= 1.0) {
                bestChoice = nonCpuDeviceCount;  // The ID of the CPU.
            }
        }

        (*bestDeviceForOperation)[operationIndex] = bestChoice;
        VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                          << toString(getOperation(operationIndex).type)
                          << ") = "
                          << (*bestDeviceForOperation)[operationIndex];
    }
    return ANEURALNETWORKS_NO_ERROR;
}

}  // namespace nn
}  // namespace android