/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "Utils.h"

#include <functional>
#include <map>
#include <queue>
#include <unordered_set>
#include <utility>
#include <vector>

using ::android::hardware::neuralnetworks::V1_0::implementation::ExecutionCallback;
using ::android::hardware::neuralnetworks::V1_0::implementation::PreparedModelCallback;

namespace android {
namespace nn {

static int compile(std::shared_ptr<Device> device, const ModelBuilder* model,
                   int32_t executionPreference, sp<IPreparedModel>* preparedModel) {
    nnAssert(device != nullptr);  // a nullptr device would mean the CPU, which needs no compilation
    // Compilation logic copied from ExecutionBuilder::startComputeOnDevice().
    Model hidlModel;
    model->setHidlModel(&hidlModel);

    sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
    Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
        hidlModel, static_cast<ExecutionPreference>(executionPreference), preparedModelCallback);
    if (!prepareLaunchStatus.isOk()) {
        LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed due to transport error: "
                   << prepareLaunchStatus.description();
        return ANEURALNETWORKS_OP_FAILED;
    }
    if (prepareLaunchStatus != ErrorStatus::NONE) {
        LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed with error: "
                   << toString(static_cast<ErrorStatus>(prepareLaunchStatus));
        return ANEURALNETWORKS_OP_FAILED;
    }

    preparedModelCallback->wait();
    ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus();
    *preparedModel = preparedModelCallback->getPreparedModel();
    if (prepareReturnStatus != ErrorStatus::NONE || *preparedModel == nullptr) {
        LOG(ERROR) << "ExecutionPlan compilation on " << device->getName() << " failed:"
                   << " prepareReturnStatus=" << toString(prepareReturnStatus)
                   << ", preparedModel=" << preparedModel->get();
        return ANEURALNETWORKS_OP_FAILED;
    }
    return ANEURALNETWORKS_NO_ERROR;
}

typedef std::function<void(uint32_t)> OperationReadyCallback;

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away, and calls cb for each of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. Because the
    // outputs of that operation are now known, new operations may become
    // ready to run; calls cb for each of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
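
// Illustrative sketch only (not part of the runtime): for a hypothetical
// three-operation chain in which op0 feeds op1 and op1 feeds op2 via
// TEMPORARY operands, the tracker behaves as follows:
//
//     OperandTracker tracker(model, cb);  // cb fires for op0 only; op1 and
//                                         // op2 each still have one unknown
//                                         // input
//     tracker.markProcessed(0, cb);       // op1's unknown-input count drops
//                                         // to 0, so cb fires for op1
//     tracker.markProcessed(1, cb);       // likewise, cb fires for op2
//
// partitionTheWork() below drives the tracker exactly this way, with a cb
// that enqueues each ready operation on its best device's queue.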

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) :
        mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                lifetime == OperandLifeTime::MODEL_OUTPUT) {
                count++;
                mOperandToOperations.insert(
                        std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
                             std::shared_ptr<Device> device)
        : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device) {}

// Adds an operand if it has not been added already.
// Sets the index in the submodel for the corresponding operand.
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                              const ModelBuilder& fromModel, OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(fromOperandIndex);
    if (i != mOperandMap.end()) {
        nnAssert(kind == INPUT);
        *toOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *toOperandIndex = mSubModel.operandCount();
    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));

    // Add the operand to the submodel.
    const Operand& operand = fromModel.getOperand(fromOperandIndex);
    ANeuralNetworksOperandType type = {
        .type = static_cast<int32_t>(operand.type),
        .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
        .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
        .scale = operand.scale,
        .zeroPoint = operand.zeroPoint
    };
    int n = mSubModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
            n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
            n = mSubModel.setOperandValueFromMemory(*toOperandIndex, memory,
                                                    operand.location.offset,
                                                    operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE:  // handled similarly to MODEL_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output. It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
            }
            break;
        case OperandLifeTime::MODEL_INPUT:
            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        case OperandLifeTime::MODEL_OUTPUT:  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsSubModelInputs.push_back(
                        std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            }
            break;
        default:
            nnAssert(false);
            break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}
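
// Illustrative sketch only (not part of the runtime): suppose a TEMPORARY
// operand t of the original model is written by an operation assigned to
// step 0 and read by an operation assigned to step 1. When step 0 adds t,
// kind == OUTPUT, so recordTemporaryDef(t, 0) is called. When step 1 adds t,
// kind == INPUT, so t lands in step 1's mTempsAsSubModelInputs. Later,
// findTempsAsSubModelOutputs() pairs the two, recording t as a submodel
// output of step 0 so its value can be passed across the partition boundary
// through shared memory (see makeController()).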

int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
    const Operation& operation = fromModel.getOperation(operationIndex);

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order. Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);

    auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
                                          std::vector<uint32_t>& localOperands,
                                          OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            uint32_t localOperand = ~0U;
            int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
            if (n != ANEURALNETWORKS_NO_ERROR)
                return n;
            localOperands[i] = localOperand;
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    int n;
    if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
        (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }

    return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                  outputCount, outputs.data());
}
268
mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const269 void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
270 for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
271 stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
272 }
273 for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
274 stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
275 }
276 }
277
findTempsAsSubModelOutputs()278 void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
279 for (const auto& step : mSteps) {
280 for (const auto& input : step->getTempsAsSubModelInputs()) {
281 const uint32_t fromModelIndex = input.first;
282 const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
283 nnAssert(it != mTemporaryToDefiningStep.end());
284 const uint32_t stepIndex = it->second;
285 nnAssert(stepIndex < mSteps.size());
286 mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
287 }
288 }
289 }
290
logSubModel() const291 void ExecutionStep::logSubModel() const {
292 VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;
293
294 auto logRemapEntry = [](std::string &toLog, const std::pair<uint32_t, uint32_t>& e) {
295 if (!toLog.empty()) {
296 toLog += ", ";
297 }
298 toLog += "(";
299 toLog += std::to_string(e.first);
300 toLog += "->";
301 toLog += std::to_string(e.second);
302 toLog += ")";
303 };
304
305 auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
306 std::string toLog;
307 for (const auto& e : map) {
308 logRemapEntry(toLog, e);
309 }
310 VLOG(COMPILATION) << name << ": " << toLog;
311 };
312 auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
313 std::string toLog;
314 for (const auto& e : set) {
315 logRemapEntry(toLog, e);
316 }
317 VLOG(COMPILATION) << name << ": " << toLog;
318 };
319
320 logRemapVector("model inputs", mModelInputs);
321 logRemapVector("model outputs", mModelOutputs);
322 logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
323 logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
324 logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
325 }
326
convertModelInputsOrOutputs(const ExecutionStep::RemapVectorType & myModelInputsOrOutputs,uint32_t fromModelInputOrOutputCount,std::function<uint32_t (uint32_t)> fromModelGetInputOrOutputOperandIndex,std::vector<uint32_t> * inputsOrOutputs,std::vector<uint32_t> * inputOrOutputIndexSubModelToFromModel)327 static void convertModelInputsOrOutputs(
328 // IN: mModel{Inputs|Outputs}
329 const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
330 // IN: fromModel->{input|output}Count()
331 uint32_t fromModelInputOrOutputCount,
332 // IN: fromModel->get{Input|Output}OperandIndex
333 std::function<uint32_t(uint32_t)> fromModelGetInputOrOutputOperandIndex,
334 // OUT: for v : mModel{Inputs|Outputs} : v.second
335 std::vector<uint32_t>* inputsOrOutputs,
336 // OUT: submodel input-or-output index to original model input-or-output index
337 std::vector<uint32_t>* inputOrOutputIndexSubModelToFromModel) {
338 std::map<uint32_t, uint32_t> fromModelIndexMap; // operand index to input-or-output index
339 for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
340 fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
341 }
342 for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
343 inputsOrOutputs->push_back(myInputOrOutput.second);
344 const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
345 inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
346 }
347 }
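
// Illustrative sketch only (not part of the runtime): suppose the original
// model declares inputs at operand indexes {4, 7} (input 0 is operand 4,
// input 1 is operand 7), and a step's mModelInputs is {(7, 1), (4, 3)} --
// i.e., original operand 7 became submodel operand 1 and original operand 4
// became submodel operand 3. The function above then produces
//
//     *inputsOrOutputs                       == {1, 3}  // submodel operands
//     *inputOrOutputIndexSubModelToFromModel == {1, 0}  // original I/O index
//
// so submodel input 0 corresponds to original input 1, and submodel input 1
// corresponds to original input 0.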

int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
                                  int32_t executionPreference) {
    if (VLOG_IS_ON(COMPILATION)) {
        logSubModel();
    }

    mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());

    // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
    // Output order: mModelOutputs, mTempsAsSubModelOutputs
    //
    // ExecutionPlan::next() depends on these orderings.

    std::vector<uint32_t> inputs;
    convertModelInputsOrOutputs(mModelInputs,
                                fromModel->inputCount(),
                                [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
                                &inputs,
                                &mInputIndexSubModelToFromModel);
    for (const auto& subModelInput : mTempsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }
    for (const auto& subModelInput : mOutputsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }

    std::vector<uint32_t> outputs;
    convertModelInputsOrOutputs(mModelOutputs,
                                fromModel->outputCount(),
                                [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
                                &outputs,
                                &mOutputIndexSubModelToFromModel);
    for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
        outputs.push_back(subModelOutput.second);
        const Operand& operand = mSubModel.getOperand(subModelOutput.second);
        for (uint32_t dimension : operand.dimensions) {
            if (dimension == 0) {
                *hasOutputOfUnknownSize = true;
                VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
                                  << " of original graph) has unknown size: "
                                  << toString(operand);
                break;
            }
        }
    }

    {
        int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0],
                                                   outputs.size(), &outputs[0]);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
        n = mSubModel.finish();
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
    }

    {
        // Compute mOutputsAsSubModelInputsIndexToFromModel.

        std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
        for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
            fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
        }

        for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
            const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
            const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
            if (it == fromModelOperandIndexToOutputIndex.end()) {
                LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
                           << " in main model output operand list";
                return ANEURALNETWORKS_BAD_STATE;
            }
            mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
        }
    }

    // TODO: Move compilation elsewhere?

    if (mDevice == nullptr) {
        return ANEURALNETWORKS_NO_ERROR;
    }

    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation";
    return compile(mDevice, &mSubModel, executionPreference, &mPreparedSubModel);
}

void ExecutionStep::dump() const {
    Model model;
    mSubModel.setHidlModel(&model);
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "ExecutionStep#" << mIndex
                          << " for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
        logModelToInfo(model);
    }
}

int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel,
                                        int32_t executionPreference) {
    findTempsAsSubModelOutputs();
    for (const auto& step : mSteps) {
        int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize,
                                     executionPreference);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
            return n;
        }
    }
    if (mHasSubModelOutputOfUnknownSize) {
        VLOG(COMPILATION)
                << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel,
                                      int32_t executionPreference) {
    if (mDevice == nullptr) {
        mSuccessfulFinish = true;
        return ANEURALNETWORKS_NO_ERROR;
    }

    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    const int n = compile(mDevice, mModel, executionPreference, &mPreparedModel);
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) {
    nnAssert(mBody != nullptr);
    return mBody->finish(fromModel, executionPreference);
}

ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan,
        const ExecutionBuilder* executionBuilder,
        std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
        uint32_t totalSizeOfTemporaries) :
        mPlan(plan), mExecutionBuilder(executionBuilder),
        mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) {
    if (totalSizeOfTemporaries) {
        if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
            mNextStepIndex = kBadStepIndex;
        }
    }
}

std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        const ExecutionBuilder* executionBuilder) const {
    nnAssert((mState == EMPTY) == (mBody == nullptr));
    if (mBody && !mBody->mSuccessfulFinish) {
        VLOG(EXECUTION) << "ExecutionPlan::makeController -- unsuccessful finish";
        return std::shared_ptr<Controller>(nullptr);
    }

    // Create the layout for a Memory object big enough to hold
    // every TEMPORARY in the original model that is live across
    // partition boundaries.
    //
    // TODO: Rethink this approach for managing temporaries. Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage. We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively. This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime. Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
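    // Illustrative sketch only (not part of the runtime), assuming purely
    // for this example that alignBytesNeeded() pads offsets to a 4-byte
    // boundary: temporaries of 6 and 8 bytes would be laid out as
    //
    //     temp A: size 6 -> padding 0, offset 0, totalSizeOfTemporaries = 6
    //     temp B: size 8 -> padding 2, offset 8, totalSizeOfTemporaries = 16
    //
    // yielding one 16-byte Memory object shared by all steps. The actual
    // padding is whatever alignBytesNeeded() computes for the operand size.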
    uint32_t totalSizeOfTemporaries = 0;
    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
    if (mState == COMPOUND) {
        const ModelBuilder* fromModel = executionBuilder->getModel();
        for (const auto& step : compound()->mSteps) {
            for (const auto& output : step->getTempsAsSubModelOutputs()) {
                const uint32_t fromModelOperandIndex = output.first;
                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                if (subModelInputsAndOutputs == nullptr) {
                    subModelInputsAndOutputs =
                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
                }
                const uint32_t size = sizeOfData(fromModelOperand);
                totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                subModelInputsAndOutputs->insert(
                        std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                totalSizeOfTemporaries += size;
            }
        }
        if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
            for (const auto& io : *subModelInputsAndOutputs) {
                VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first
                                << ", offset = " << io.second;
            }
        }
    }

    return std::shared_ptr<Controller>(new Controller(this, executionBuilder,
                                                      subModelInputsAndOutputs,
                                                      totalSizeOfTemporaries));
}


// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == 0) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    --controller->mNextStepIndex;
    return next(controller, executor);
}

int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::next("
                    << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        nnAssert(controller->mNextStepIndex == 0);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            *executor = std::make_shared<StepExecutor>(
                controller->mExecutionBuilder,
                simpleBody->mModel,
                (simpleBody->mDevice == nullptr ? nullptr : simpleBody->mDevice->getInterface()),
                simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        nnAssert(controller->mNextStepIndex == 1);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    auto compoundBody = compound();

    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
    // Output order: model outputs, temps as submodel outputs
    //
    // ExecutionStep::finishSubModel() establishes these orderings.

    const auto step = compoundBody->mSteps[controller->mNextStepIndex];
    *executor = std::make_shared<StepExecutor>(
        controller->mExecutionBuilder,
        step->getSubModel(),
        (step->getDevice() == nullptr ? nullptr : step->getDevice()->getInterface()),
        step->getPreparedSubModel());
    step->mapInputsAndOutputs(*executor);
    if (controller->mSubModelInputsAndOutputs != nullptr) {
        {
            // Tell executor about temps as submodel outputs.

            const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
            const auto& subModelOutputs = step->getTempsAsSubModelOutputs();

            uint32_t idx = 0;
            for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setOutputFromTemporaryMemory(
                        firstSubModelOutputIndex + idx,
                        &controller->mTemporaries,
                        offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
        {
            // Tell executor about temps as submodel inputs.

            const size_t firstSubModelInputIndex = step->getModelInputs().size();
            const auto& subModelInputs = step->getTempsAsSubModelInputs();

            uint32_t idx = 0;
            for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setInputFromTemporaryMemory(
                        firstSubModelInputIndex + idx,
                        &controller->mTemporaries,
                        offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
    }
    {
        // Tell executor about outputs as submodel inputs.

        const size_t firstOutputsAsSubModelInputIndex =
                step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
        const auto& outputsAsSubModelInputsIndexToFromModel =
                step->getOutputsAsSubModelInputsIndexToFromModel();
        for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
            uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
            (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
        }
    }

    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
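
// Illustrative sketch only (not part of the runtime) of how a caller is
// expected to drive a Controller; ExecutionBuilder does essentially this:
//
//     std::shared_ptr<ExecutionPlan::Controller> controller =
//             plan->makeController(executionBuilder);
//     while (true) {
//         std::shared_ptr<StepExecutor> executor;
//         int n = plan->next(controller, &executor);
//         if (n != ANEURALNETWORKS_NO_ERROR) break;  // might try fallback()
//         if (executor == nullptr) break;            // the plan is complete
//         // ... execute *executor ...
//     }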

std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
    nnAssert(mState != SIMPLE);
    if (mState == EMPTY) {
        mBody = new CompoundBody();
        mState = COMPOUND;
    }
    auto& steps = compound()->mSteps;
    auto step = std::make_shared<ExecutionStep>(this, steps.size(), device);
    steps.push_back(step);
    return step;
}

void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    nnAssert(mState == EMPTY);
    mBody = new SimpleBody(device, model);
    mState = SIMPLE;
}

void ExecutionPlan::dump() const {
    if (mBody) {
        mBody->dump();
    } else {
        VLOG(COMPILATION) << "EMPTY";
    }
}

ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            nnAssert(!"unexpected state");
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    nnAssert(mState == SIMPLE);
    return static_cast<const SimpleBody*>(mBody)->mDevice;
}

const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}

bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
    return mBody->hasSubModelOutputsOfUnknownSize();
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    const size_t nonCpuDeviceCount = devices.size();
    // deviceCount is the number of HAL devices plus one for the CPU, which
    // has no entry in devices[].
    const size_t deviceCount = nonCpuDeviceCount + 1;
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                      << ", operationCount = " << operationCount;

    // If we only have the CPU, or if the graph has no operations, no need to try to partition.
    if (nonCpuDeviceCount == 0 || operationCount == 0) {
        // Make sure no op is an OEM operation.
        for (auto& op : mOperations) {
            if (op.type == OperationType::OEM_OPERATION) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        }
        plan->becomeSingleStep(nullptr /* CPU */, this);
        return plan->finish(this, preference);
    }

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector, with devices.size()
    // representing the CPU.
    std::vector<int> bestDeviceForOperation(operationCount);
    int status = findBestDeviceForEachOperation(preference, devices, deviceCount,
                                                &bestDeviceForOperation);
    if (status != ANEURALNETWORKS_NO_ERROR) {
        return status;
    }

    // If one device will run all the operations, we don't need to split the work.
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        const bool cpu = (size_t(bestDeviceIndex) == deviceCount - 1);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = "
                          << (cpu ? "CPU" : devices[bestDeviceIndex]->getName());
        plan->becomeSingleStep(cpu ? nullptr : devices[bestDeviceIndex], this);
        return plan->finish(this, preference);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;
        }
        // nullptr represents the CPU.
        std::shared_ptr<Device> device =
                static_cast<size_t>(deviceIndex) < nonCpuDeviceCount
                        ? devices[deviceIndex] : nullptr;

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(device);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            int n = step->addOperation(operationIndex, *this);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                return n;
            }
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this, preference);
    if (VLOG_IS_ON(COMPILATION)) {
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}
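
// Illustrative sketch only (not part of the runtime): consider a
// hypothetical chain op0 -> op1 -> op2 where findBestDeviceForEachOperation()
// chose device 0 for op0 and op2 but the CPU for op1. The loop above then
// produces three steps:
//
//     step 0: op0 on device 0  (op1 and op2 are not yet ready)
//     step 1: op1 on the CPU   (made ready by op0 being processed)
//     step 2: op2 on device 0  (made ready by op1 being processed)
//
// The temporaries flowing between the steps become submodel outputs and
// inputs, routed through the shared Memory object set up by makeController().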

PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
                                                 uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch (operandType) {
        case OperandType::FLOAT32:
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return device->getRelaxedFloat32toFloat16Performance();
            } else {
                return device->getFloat32Performance();
            }
        case OperandType::INT32:
        case OperandType::UINT32:
        case OperandType::TENSOR_INT32:
        case OperandType::TENSOR_QUANT8_ASYMM:
        // For OEM ops, the real selection will be based on which devices can run the operation.
        case OperandType::OEM:
        case OperandType::TENSOR_OEM_BYTE:
            return device->getQuantized8Performance();
        default:
            nnAssert(false);
            return device->getQuantized8Performance();
    }
}

namespace {
// This class determines whether a given device can execute a given operation.
class CanDo {
public:
    CanDo() {}

    void initialize(const ModelBuilder* model, std::shared_ptr<Device> device) {
        Model hidlModel;
        model->setHidlModel(&hidlModel);
        device->getSupportedOperations(hidlModel, &mSupportsOperationByIndex);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

private:
    hidl_vec<bool> mSupportsOperationByIndex;
};
}  // anonymous namespace

int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference,
        const std::vector<std::shared_ptr<Device>>& devices,
        const size_t deviceCount,
        std::vector<int>* bestDeviceForOperation) const {

    // Note that deviceCount includes CPU, which has no entry in devices[]
    const size_t nonCpuDeviceCount = deviceCount - 1;

    std::vector<CanDo> canDo(nonCpuDeviceCount);
    for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(this, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        // Find which non-CPU device gives the best performance for this operation.
        int bestChoice = -1;
        float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
        for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
            const auto& device = devices[deviceIndex];
            if (canDo[deviceIndex].check(operationIndex)) {
                const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                const float perfVal =
                        (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
                                                                        : perf.execTime);
                if (bestChoice < 0 || perfVal < bestPerfVal) {
                    bestChoice = deviceIndex;
                    bestPerfVal = perfVal;
                }
            } else {
                // Somewhat noisy logging, but this is the only place where the
                // user of NNAPI can get feedback on why an operation was not
                // run on a specific device.
                // Logs O(operationCount * nonCpuDeviceCount) times, but
                // typically nonCpuDeviceCount is very small.
                VLOG(COMPILATION) << "Device " << device->getName()
                                  << " can't do operation "
                                  << toString(getOperation(operationIndex).type);
            }
        }
        // If it's the OEM op, we'd better have a device able to do it.
        if (mOperations[operationIndex].type == OperationType::OEM_OPERATION) {
            if (bestChoice < 0) {
                LOG(ERROR) << "No driver can do the OEM op";
                return ANEURALNETWORKS_BAD_DATA;
            }
        } else {
            // If no driver has been found, or if the best driver is not better than the CPU,
            // prefer the CPU. Since the performance is a ratio compared to the CPU performance,
            // by definition the performance of the CPU is 1.0.
            if (bestChoice < 0 || bestPerfVal >= 1.0) {
                bestChoice = nonCpuDeviceCount;  // The ID of the CPU.
            }
        }

        (*bestDeviceForOperation)[operationIndex] = bestChoice;
        VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                          << toString(getOperation(operationIndex).type)
                          << ") = "
                          << (*bestDeviceForOperation)[operationIndex];
    }
    return ANEURALNETWORKS_NO_ERROR;
}
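
// Illustrative sketch only (not part of the runtime): with
// ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and two hypothetical drivers
// reporting execTime 0.5 and 0.8 for some FLOAT32 operation, the loop above
// picks the first driver (perfVal 0.5 < 1.0, the implicit CPU baseline). If
// both drivers instead reported execTime >= 1.0, the CPU (bestChoice ==
// nonCpuDeviceCount) would win.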

} // namespace nn
} // namespace android