/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include <android/sync.h>

#include <algorithm>
#include <limits>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "CompilationBuilder.h"
#include "ControlFlow.h"
#include "CpuExecutor.h"
#include "ExecutionBurstController.h"
#include "HalInterfaces.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"

namespace android {
namespace nn {

using namespace hal;

const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

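// Maps the execution's timing-measurement setting onto the HAL MeasureTiming enum.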
static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
    return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
}

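// Validates the operand type and dimensions that the client supplied for a model
// input or output. When newType is non-null, it must match the operand's type and
// may only fill in dimensions that the model left unspecified (0). When newType is
// null, the operand itself must already be fully specified unless allowUnspecified
// is true.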
static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
                               const char* tag, bool allowUnspecified) {
    if (newType != nullptr) {
        const Extension::OperandTypeInformation* info = nullptr;
        if (isExtensionOperandType(operand.type)) {
            NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
        }
        if (validateOperandType(*newType, info, tag, allowUnspecified) !=
            ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << tag << ": Invalid newType";
            return false;
        }
        if (operand.dimensions.size() == 0) {
            return true;
        }
        if (operand.dimensions.size() != newType->dimensionCount) {
            LOG(ERROR) << tag << ": Setting with incompatible dimension count";
            return false;
        }
        for (uint32_t i = 0; i < newType->dimensionCount; i++) {
            if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
                LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
                return false;
            }
        }
    } else {
        if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
            tensorHasUnspecifiedDimensions(operand)) {
            LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
            return false;
        }
    }
    return true;
}

ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
    : mCompilation(compilation),
      mModel(compilation->mModel),
      mPlan(&compilation->mPlan),
      mPartitioning(compilation->mPartitioning),
      mInputs(mModel->inputCount()),
      mOutputs(mModel->outputCount()) {
    VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                    << " inputs and " << mOutputs.size() << " outputs";
}

const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
    return mPlan->getSourceModels().getModel(index);
}

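// An execution is finished either when finishWithoutSyncFence() has run (the
// callback-based paths) or when the sync fence backing a fenced execution is no
// longer active. The two completion mechanisms are mutually exclusive.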
bool ExecutionBuilder::isFinished() const {
    CHECK(!(mFinishedWithoutSyncFence && hasSyncFence()));
    if (mFinishedWithoutSyncFence) {
        return true;
    }
    if (hasSyncFence()) {
        auto r = syncWait(mSyncFenceFd, 0);
        CHECK(r != FenceState::UNKNOWN);
        return r != FenceState::ACTIVE;
    }
    return false;
}

ExecutionBuilder::Completion ExecutionBuilder::completedWith() const {
    CHECK(isFinished());
    if (hasSyncFence()) {
        auto r = syncWait(mSyncFenceFd, 0);
        CHECK(r == FenceState::SIGNALED || r == FenceState::ERROR);
        return (r == FenceState::SIGNALED) ? Completion::NO_ERROR : Completion::OTHER_ERROR;
    } else {
        return mCompletionWithoutSyncFence;
    }
}

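// The four argument setters below back the corresponding NDK entry points. A
// minimal sketch of the client-side sequence they support (error handling
// omitted, and buffer sizes assumed to match the model):
//
//     ANeuralNetworksExecution* execution = nullptr;
//     ANeuralNetworksExecution_create(compilation, &execution);
//     ANeuralNetworksExecution_setInput(execution, 0, /*type=*/nullptr, inputBuffer, inputSize);
//     ANeuralNetworksExecution_setOutput(execution, 0, /*type=*/nullptr, outputBuffer, outputSize);
//     ANeuralNetworksExecution_compute(execution);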
int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getInputOperand(index), type, const_cast<void*>(buffer), l);
    return n;
}

int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const Memory* memory, size_t offset, size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInputFromMemory", false)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc,
    // we allow the client to specify offset == 0 && length == 0, indicating that the entire
    // memory region is used. We update the length here because drivers still expect a real
    // length. For memory types that do not support this semantic,
    // MemoryValidatorBase::validate rejects the request before reaching this point.
    if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
        length = memory->getHidlMemory().size();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR)
                << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
                   "been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromMemory(
            mModel->getInputOperand(index), type, poolIndex, offset, length);
    return n;
}

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
                                void* buffer, size_t length) {
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutput", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) =
            ModelArgumentInfo::createFromPointer(mModel->getOutputOperand(index), type, buffer, l);
    return n;
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const Memory* memory, size_t offset, size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutputFromMemory", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc,
    // we allow the client to specify offset == 0 && length == 0, indicating that the entire
    // memory region is used. We update the length here because drivers still expect a real
    // length. For memory types that do not support this semantic,
    // MemoryValidatorBase::validate rejects the request before reaching this point.
    if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
        length = memory->getHidlMemory().size();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromMemory(
            mModel->getOutputOperand(index), type, poolIndex, offset, length);
    return n;
}

int ExecutionBuilder::setMeasureTiming(bool measure) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
                   << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
                   << "that was not created by ANeuralNetworksCompilation_createForDevices "
                   << "with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mMeasureTiming = measure;
    return ANEURALNETWORKS_NO_ERROR;
}

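// Reports one of the four ANEURALNETWORKS_*DURATION* measurements in nanoseconds.
// The fenced timings are fetched lazily from the driver's IFencedExecutionCallback
// when one exists; otherwise the launch timings also serve as the fenced timings.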
int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
    if (!isFinished()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
                      "execution has finished.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() != Completion::NO_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
                      "that has encountered an error.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    // NOTE: At the HAL level, timing is in microseconds. At the NDK level, nanoseconds.
    const uint64_t kNanoPerMicro = 1000;

    if (!mMeasureTiming) {
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
    Timing timingFenced = timingLaunched;
    if (mFencedExecutionCallback != nullptr) {
        ErrorStatus status;
        const Return<void> ret = mFencedExecutionCallback->getExecutionInfo(
                [&status, &timingLaunched, &timingFenced](ErrorStatus error, Timing tLaunched,
                                                          Timing tFenced) {
                    status = error;
                    timingLaunched = tLaunched;
                    timingFenced = tFenced;
                });
        if (!ret.isOk()) {
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_OP_FAILED;
        }
        if (status != ErrorStatus::NONE) {
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_BAD_STATE;
        }
    }
    uint64_t microDuration = UINT64_MAX;
    switch (durationCode) {
        case ANEURALNETWORKS_DURATION_ON_HARDWARE:
            microDuration = timingLaunched.timeOnDevice;
            break;
        case ANEURALNETWORKS_DURATION_IN_DRIVER:
            microDuration = timingLaunched.timeInDriver;
            break;
        case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
            microDuration = timingFenced.timeOnDevice;
            break;
        case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
            microDuration = timingFenced.timeInDriver;
            break;
        default:
            CHECK(!"unexpected");
    }
    *duration = (microDuration == UINT64_MAX) ? UINT64_MAX : kNanoPerMicro * microDuration;

    VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
                      "created from an ANeuralNetworksCompilation that was not created by "
                      "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > 0) {
        mTimeoutDuration = duration;
    } else {
        mTimeoutDuration.reset();
    }
    return ANEURALNETWORKS_NO_ERROR;
}

std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
    return mTimeoutDuration;
}

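// Sets the timeout for interpreted WHILE loops, clamping to
// operation_while::kTimeoutNsMaximum and logging a warning when the requested
// value exceeds that maximum.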
int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > operation_while::kTimeoutNsMaximum) {
        LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
                     << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
        duration = operation_while::kTimeoutNsMaximum;
    }
    mLoopTimeoutDuration = duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
    if (!isFinished()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }

    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
                   << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    const auto& dims = mOutputs[index].dimensions();
    if (dims.empty()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions cannot query "
                      "dimensions of a scalar";
        return ANEURALNETWORKS_BAD_DATA;
    }
    std::copy(dims.begin(), dims.end(), dimensions);
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
    if (!isFinished()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

// Attempt synchronous execution of full model on CPU.
// TODO: How should we handle timing in this case?
// For Q this is irrelevant: We only support timing in conjunction
// with an explicit device list; and we do not support CPU fallback
// with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
        ExecutionBuilder* executionBuilder) {
    CHECK(executionBuilder != nullptr);
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
    VLOG(EXECUTION) << "cpuFallbackFull";

    // Get fallback executor.
    StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                          DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr);
    executor.mapInputsAndOutputsTrivially();

    // Attempt fallback execution.
    return executor.computeOnCpuFallback();
}

// Attempt synchronous execution on CPU.
// TODO: How should we handle timing in this case?
// For Q this is irrelevant: We only support timing in conjunction
// with an explicit device list; and we do not support CPU fallback
// with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
cpuFallbackPartial(const ExecutionPlan& plan,
                   std::shared_ptr<ExecutionPlan::Controller> controller) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
    VLOG(EXECUTION) << "cpuFallbackPartial";

    // Get fallback executor.
    std::shared_ptr<StepExecutor> executor;
    int n1 = plan.fallback(controller, &executor);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, kNoTiming, nullptr};
    }
    CHECK(executor != nullptr);

    // Attempt fallback execution.
    auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
    return {n2, std::move(outputShapes), timing, executor};
}

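// Walks the steps of the execution plan, running each step on its assigned
// device. On a recoverable step failure this first attempts a partial CPU
// fallback for the failing step and, if that also fails, breaks out of the loop
// to rerun the entire model on the CPU. Results are delivered through
// executionCallback.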
static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
                                         const ExecutionPlan& plan,
                                         std::shared_ptr<ExecutionPlan::Controller> controller,
                                         bool allowFallback,
                                         const std::optional<Deadline>& deadline,
                                         const sp<ExecutionCallback>& executionCallback) {
    CHECK(executionBuilder != nullptr);
    VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)";

    std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
    Timing timing = kNoTiming;
    // Disallow fallback when the ExecutionPlan is simple on CPU.
    allowFallback &= !plan.isSimpleCpu();

    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        std::shared_ptr<ExecutionBurstController> burstController;
        int n = plan.next(controller, &executor, &burstController);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (allowFallback && !missedDeadline) break;
            executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
            return;
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            executionCallback->notify(ErrorStatus::NONE, outputShapes, timing);
            return;
        }
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);

        // Update global outputs.
        if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
            stepN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            // We only support collection of timing information in the case of a
            // single step, so it's safe to just keep track of the last step's
            // timing information.
            timing = stepTiming;
            continue;
        }

        // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
            executionCallback->notify(stepStatus, outputShapes, kNoTiming);
            return;
        }

        // If fallback is not allowed and there was an error, end execution.
        if (!allowFallback) {
            const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
            executionCallback->notify(stepStatus, {}, kNoTiming);
            return;
        }

        // If CPU execution was already attempted, either:
        // (1) perform a full fallback if the plan is not simple, or
        // (2) return from the function with an error
        if (executorIsCpu) {
            if (!plan.isSimple()) break;
            executionCallback->notify(convertResultCodeToErrorStatus(stepN), {}, kNoTiming);
            return;
        }

        // If the code reaches this point, attempt a partial fallback to CPU.
        CHECK(allowFallback);
        auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
                cpuFallbackPartial(plan, controller);

        // Update global outputs.
        if (fallbackExecutor != nullptr &&
            !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
            fallbackN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
            // We only support collection of timing information in the case of a
            // single step, so it's safe to just keep track of the last step's
            // timing information.
            timing = fallbackTiming;
            continue;
        }

        // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
        if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
            executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
            return;
        }

        // Do not fall back twice if the ExecutionPlan is simple.
        if (plan.isSimple()) {
            const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
            executionCallback->notify(fallbackStatus, {}, kNoTiming);
            return;
        }

        // If the code reaches this point, then there was an error with the
        // partial fallback. In this case, attempt full fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions, so do a full execution fallback on
    // the CPU.
    auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
    const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
    executionCallback->notify(fullStatus, fullOutputShapes, fullTiming);
}

// In the case of partitioned execution, startComputeFenced() returns the sync
// fence and the fenced compute callback returned from the last partition.
// If any partition fails and allowFallback is true, the whole execution falls
// back to the CPU.
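// The sync fence produced by each step is passed both to the next step's wait
// list and to ExecutionPlan::next(), so successive partitions are chained
// without a host-side wait in between.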
static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
        ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
        std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
        uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
        bool allowFallback) {
    CHECK(executionBuilder != nullptr);
    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
    // Disallow fallback when the ExecutionPlan is simple on CPU.
    allowFallback &= !plan.isSimpleCpu();

    // Initiate waitForFds, syncFence for the first step.
    std::vector<int> waitForFds = waitFor;
    int syncFence = -1;
    sp<hal::IFencedExecutionCallback> computeFencedCallback;

    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        int n = plan.next(controller, &executor, nullptr, syncFence);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (allowFallback && !missedDeadline) break;
            // Return -1 for the sync fence fd, and nullptr for the callback.
            return std::make_tuple(n, -1, nullptr);
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            // If the final step returns a -1 for sync fence, the execution is finished.
            // Update the output shapes.
            if (syncFence == -1) {
                // TODO(miaowang): support dynamic output shape only with memory domain.
                // For now just return the initial output shapes.
                executionBuilder->finishWithoutSyncFence(
                        ErrorStatus::NONE, executionBuilder->getInitialOutputShapes());
            }
            return std::make_tuple(ANEURALNETWORKS_NO_ERROR, syncFence, computeFencedCallback);
        }
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, syncFd, callback] =
                executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);

        // Update waitForFds, syncFence for the next step.
        syncFence = syncFd;
        computeFencedCallback = callback;
        waitForFds.clear();
        if (syncFd > 0) {
            waitForFds = {syncFd};
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            continue;
        }
        // If fallback is not allowed and there was an error, end execution.
        if (!allowFallback) {
            return std::make_tuple(stepN, -1, nullptr);
        }

        // If CPU execution was already attempted, either:
        // (1) perform a full fallback if the plan is not simple, or
        // (2) return from the function with an error
        if (executorIsCpu) {
            if (!plan.isSimple()) break;
            return std::make_tuple(stepN, -1, nullptr);
        }
        // If the code reaches this point, a step failed on a device other than
        // the CPU and fallback is allowed. Attempt full fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions, so do a full execution fallback on
    // the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
                return std::make_tuple(ANEURALNETWORKS_OP_FAILED, -1, nullptr);
            }
        }
    }
    auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
    const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
    syncFence = -1;
    executionBuilder->finishWithoutSyncFence(fullStatus, fullOutputShapes);
    executionBuilder->reportTimingWithoutFencedExecutionCallback(fullTiming);
    return std::make_tuple(fullN, syncFence, nullptr);
}

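// Entry point for ANeuralNetworksExecution_startComputeWithDependencies. Verifies
// that every input is specified and that every output has fully specified
// dimensions, then hands off to startComputeFenced(). A minimal sketch of the
// client-side call (error handling omitted):
//
//     ANeuralNetworksEvent* event = nullptr;
//     ANeuralNetworksExecution_startComputeWithDependencies(
//             execution, /*dependencies=*/nullptr, /*num_dependencies=*/0,
//             /*duration=*/0, &event);
//     ANeuralNetworksEvent_wait(event);
//     ANeuralNetworksEvent_free(event);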
int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
                                    uint64_t timeoutDurationAfterFence, int* syncFence) {
    CHECK(syncFence != nullptr);
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                      " called on an execution that has already started";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (timeoutDurationAfterFence > 0) {
        if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
            LOG(ERROR)
                    << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
                       "duration on an ANeuralNetworksExecution "
                       "created from an ANeuralNetworksCompilation that was not created by "
                       "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    const auto deadline = makeDeadline(mTimeoutDuration);
    for (auto& p : mInputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                          " not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : mOutputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                          " not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (uint32_t i = 0; i < mOutputs.size(); i++) {
        if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
            !checkDimensionInfo(mModel->getOutputOperand(i), nullptr,
                                "ANeuralNetworksExecution_startComputeWithDependencies", false)) {
            LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                          " not all outputs have fully specified dimensions";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    mStarted = true;
    const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
    int result;
    std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
            this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
    *syncFence = mSyncFenceFd;
    return result;
}

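// Common entry point for the synchronous, asynchronous, and burst compute paths:
// at most one of synchronizationCallback (asynchronous) and burstBuilder (burst)
// may be non-null, and when both are null the call blocks until the execution
// completes.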
int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback,
                              BurstBuilder* burstBuilder) {
    CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
            << "synchronizationCallback and burstBuilder cannot simultaneously be used";

    const bool synchronous = (synchronizationCallback == nullptr);
    if (!synchronous) {
        *synchronizationCallback = nullptr;
    }

    const auto deadline = makeDeadline(mTimeoutDuration);

    // TODO validate that we have full types for all inputs and outputs,
    // that the graph is not cyclic,

    auto name = [synchronous, burstBuilder] {
        return burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
    };
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name()
                   << " called on an execution that has already started";
        return ANEURALNETWORKS_BAD_STATE;
    }
    for (auto& p : mInputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        } else if (p.state() == ModelArgumentInfo::MEMORY) {
            const Memory* memory = mMemories[p.locationAndLength().poolIndex];
            if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
                return ANEURALNETWORKS_OP_FAILED;
            }
        }
    }
    for (auto& p : mOutputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }

    auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
        return finishWithoutSyncFence(error, outputShapes);
    };

    // TODO: For asynchronous execution, entire plan-based-path should run in an
    // asynchronous thread -- take the asynchronous thread logic out of
    // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
    mStarted = true;
    const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
    std::shared_ptr<ExecutionPlan::Controller> controller =
            mPlan->makeController(this, burstBuilder);
    if (synchronous) {
        VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
        sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
        localSynchronizationCallback->setOnFinish(wrappedFinish);
        asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
                                     localSynchronizationCallback);
        localSynchronizationCallback->wait();
        if (mMeasureTiming) {
            mTimingWithoutFencedExecutionCallback = localSynchronizationCallback->getTiming();
        }
        return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus());
    } else /* asynchronous */ {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        // of spinning up a new thread.

        // Prepare the callback for asynchronous execution. An sp<ExecutionCallback>
        // object is returned when the execution has been successfully launched;
        // otherwise a nullptr is returned. The executionCallback is abstracted
        // in the NN API as an "event".
        sp<ExecutionCallback> executionCallback = new ExecutionCallback();
        executionCallback->setOnFinish(wrappedFinish);
        if (DeviceManager::get()->syncExecRuntime()) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
            asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
                                         executionCallback);
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
            std::thread asyncExecution(
                    [this, controller, allowFallback, deadline, executionCallback] {
                        asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
                                                     deadline, executionCallback);
                    });
            executionCallback->bindThread(std::move(asyncExecution));
        }
        *synchronizationCallback = executionCallback;
        return ANEURALNETWORKS_NO_ERROR;
    }
}

std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
    std::vector<OutputShape> outputShapes(mOutputs.size());
    std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
                   [](const auto& x) -> OutputShape {
                       hidl_vec<uint32_t> dimensions;
                       if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
                           dimensions = x.dimensions();
                       }
                       return {.dimensions = std::move(dimensions), .isSufficient = true};
                   });
    return outputShapes;
}

// Check whether the dimensions "to" can be updated with the dimensions "from";
// "from" must be at least as fully specified as "to".
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
    if (to.size() == 0) return true;
    NN_RET_CHECK_EQ(to.size(), from.size());
    for (uint32_t i = 0; i < to.size(); i++) {
        NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
    }
    return true;
}

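// Records the output shapes reported by the drivers, rejecting any attempt to
// overwrite a dimension the client had already fully specified, as well as any
// shape whose data size would overflow uint32_t.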
bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
    if (outputShapes.size() == 0) {
        return true;
    }
    NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        // Check if only unspecified dimensions or rank are overwritten.
        NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
        const OperandType operandType = mModel->getOutputOperand(i).type;
        NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
                                                                    outputShapes[i].dimensions));
    }
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        mOutputs[i].dimensions() = outputShapes[i].dimensions;
        mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
    }
    return true;
}

bool ExecutionBuilder::updateMemories() {
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const Memory* memory = mMemories[output.locationAndLength().poolIndex];
        NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
    }
    return true;
}

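// Completion point for the non-fenced execution paths: records the final output
// shapes, updates device memory metadata, marks the output memories as
// initialized (or not) based on the final status, and latches the Completion
// state that completedWith() later reports.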
ErrorStatus ExecutionBuilder::finishWithoutSyncFence(ErrorStatus status,
                                                     const std::vector<OutputShape>& outputShapes) {
    CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
    CHECK(!hasSyncFence())
            << "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
    if (!updateOutputShapes(outputShapes) || !updateMemories()) {
        status = ErrorStatus::GENERAL_FAILURE;
    }
    bool success = status == ErrorStatus::NONE;
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const Memory* memory = mMemories[output.locationAndLength().poolIndex];
        memory->getValidator().setInitialized(success);
    }
    switch (convertErrorStatusToResultCode(status)) {
        case ANEURALNETWORKS_NO_ERROR:
            mCompletionWithoutSyncFence = Completion::NO_ERROR;
            break;
        case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
            mCompletionWithoutSyncFence = Completion::OUTPUT_INSUFFICIENT_SIZE;
            break;
        default:
            mCompletionWithoutSyncFence = Completion::OTHER_ERROR;
            break;
    }
    mFinishedWithoutSyncFence = true;
    return status;
}

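// Propagates the output shapes produced by one step into the main model's output
// shapes. When this executor runs a partition, getOutputIndexStepModelToMainModel()
// maps each step-model output to its position in the main model; otherwise the
// shapes are copied one-to-one.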
bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
                                      std::vector<OutputShape>* to) {
    if (from.size() == 0) {
        return true;
    }
    if (mExecutionStep != nullptr) {
        const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
        NN_RET_CHECK_LE(indexMapping.size(), from.size());
        for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
            uint32_t toIndex = indexMapping[i];
            NN_RET_CHECK_GT(to->size(), toIndex);
            NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
            (*to)[toIndex] = from[i];
        }
    } else {
        NN_RET_CHECK_EQ(from.size(), to->size());
        for (uint32_t i = 0, e = from.size(); i < e; i++) {
            NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
            (*to)[i] = from[i];
        }
    }
    return true;
}

StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                           std::shared_ptr<Device> device,
                           std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
    : mExecutionBuilder(executionBuilder),
      mExecutionStep(step),
      mModel(model),
      mDevice(device),
      mPreparedModel(preparedModel),
      mInputs(model->inputCount()),
      mOutputs(model->outputCount()) {
    CHECK(mDevice != nullptr);
    VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
                    << mOutputs.size() << " outputs";
}

void StepExecutor::mapInputsAndOutputsTrivially() {
    mInputs = mExecutionBuilder->mInputs;
    mOutputs = mExecutionBuilder->mOutputs;
    mMemories = mExecutionBuilder->mMemories;
}

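// Copies an input/output binding from the ExecutionBuilder into this executor.
// MEMORY-state arguments need their pool index rewritten because the executor
// maintains its own memory pool table.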
void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                                    ModelArgumentInfo* executorInputOrOutput) {
    *executorInputOrOutput = builderInputOrOutput;
    switch (executorInputOrOutput->state()) {
        default:
            CHECK(false) << "unexpected ModelArgumentInfo::state";
            break;
        case ModelArgumentInfo::HAS_NO_VALUE:
        case ModelArgumentInfo::POINTER:
        case ModelArgumentInfo::UNSPECIFIED:
            break;
        case ModelArgumentInfo::MEMORY: {
            const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
            const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
            const uint32_t executorPoolIndex = mMemories.add(memory);
            executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
            break;
        }
    }
}

int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
                                             const Memory* memory, uint32_t offset,
                                             ModelArgumentInfo* inputOrOutputInfo) {
    // Should be similar to
    //     ExecutionBuilder::setInputFromMemory()
    //     ExecutionBuilder::setOutputFromMemory()

    uint32_t poolIndex = mMemories.add(memory);
    uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
    CHECK(inputOrOutputInfo->unspecified());
    int n;
    std::tie(n, *inputOrOutputInfo) =
            ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
                                                /*type=*/nullptr, poolIndex, offset, length);
    return n;
}

static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
    for (unsigned i = 0; i < args.size(); i++) {
        const auto& arg = args[i];
        std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
        switch (arg.state()) {
            case ModelArgumentInfo::POINTER:
                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
                break;
            case ModelArgumentInfo::MEMORY:
                VLOG(EXECUTION) << prefix << "MEMORY("
                                << "pool=" << arg.locationAndLength().poolIndex << ", "
                                << "off=" << arg.locationAndLength().offset << ")";
                break;
            case ModelArgumentInfo::HAS_NO_VALUE:
                VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
                break;
            case ModelArgumentInfo::UNSPECIFIED:
                VLOG(EXECUTION) << prefix << "UNSPECIFIED";
                break;
            default:
                VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
                break;
        }
    }
}

bool StepExecutor::isCpu() const {
    return mDevice == DeviceManager::getCpuDevice();
}

static OptionalTimeoutDuration makeTimeoutDuration(uint64_t nanoseconds) {
    OptionalTimeoutDuration otd;
    otd.nanoseconds(nanoseconds);
    return otd;
}

std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
        const std::optional<Deadline>& deadline,
        const std::shared_ptr<ExecutionBurstController>& burstController) {
    return computeWithMemories(deadline, mMemories.getObjects(), burstController);
}

std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeWithMemories(
        const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& burstController) {
    CHECK(mPreparedModel != nullptr);

    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalTimeoutDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    const auto [n, outputShapes, timing] = mPreparedModel->execute(
            mInputs, mOutputs, memories, burstController, measure, deadline, loopTimeoutDuration);
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);

    return {n, std::move(outputShapes), timing};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>> StepExecutor::computeFenced(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const std::optional<Deadline>& deadline) {
    CHECK(mPreparedModel != nullptr);

    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalTimeoutDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    OptionalTimeoutDuration optionalTimeoutDurationAfterFence;
    if (timeoutDurationAfterFence > 0) {
        optionalTimeoutDurationAfterFence.nanoseconds(timeoutDurationAfterFence);
    }
    const auto [n, syncFence, computeFencedCallback, timing] = mPreparedModel->executeFenced(
            mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline,
            loopTimeoutDuration, optionalTimeoutDurationAfterFence);
    if (syncFence < 0 && computeFencedCallback == nullptr) {
        mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    }
    return {n, syncFence, computeFencedCallback};
}

// For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute.
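// Device-only memories (those exposing an IBuffer) cannot be mapped by the CPU
// path, so they are staged through freshly allocated BLOB-mode AHardwareBuffers:
// input contents are copied in before compute and output contents copied back
// afterwards.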
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
    VLOG(EXECUTION) << "Re-compile the model on CPU";
    mDevice = DeviceManager::getCpuDevice();
    mPreparedModel = nullptr;
    const ModelFactory makeModel = [this] { return mModel->makeHidlModel(); };
    // TODO: Propagate user preference and compilation priority to this point instead of using
    // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
    // ANEURALNETWORKS_PRIORITY_MEDIUM
    const ExecutionPreference preference =
            static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
    const Priority priority = convertToHalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
    auto [n, preparedModel] = mDevice->prepareModel(makeModel, preference, priority, {}, {}, {});
    mPreparedModel = std::move(preparedModel);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, {}, kNoTiming};
    }

    // Prepare device memories for CPU fallback.
    std::vector<const Memory*> memories = mMemories.getObjects();
    std::vector<bool> isUsedAsInput(memories.size(), false);
    std::vector<bool> isUsedAsOutput(memories.size(), false);
    std::vector<std::unique_ptr<Memory>> blobAhwbs;

    // Mark the input and output usages.
    for (auto& input : mInputs) {
        if (input.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = input.locationAndLength().poolIndex;
            isUsedAsInput[poolIndex] = true;
        }
    }
    for (auto& output : mOutputs) {
        if (output.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = output.locationAndLength().poolIndex;
            // Cannot allocate output buffers with unknown shapes.
            if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
                LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands "
                              "has unknown shape.";
                return {ANEURALNETWORKS_OP_FAILED, {}, kNoTiming};
            }
            isUsedAsOutput[poolIndex] = true;
        }
    }

    // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const Memory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr) {
            const uint32_t size = memory->getValidator().getMetadata().logicalSize;
            auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
            if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
                return {nAhwb, {}, kNoTiming};
            }
            if (isUsedAsInput[i]) {
                n = copyIBufferToHidlMemory(memory->getIBuffer(), blobAhwb->getHidlMemory());
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    return {n, {}, kNoTiming};
                }
            }
            memories[i] = blobAhwb.get();
            blobAhwbs.push_back(std::move(blobAhwb));
        }
    }

    auto [nCompute, outputShapes, timing] = computeWithMemories({}, memories);
    if (nCompute != ANEURALNETWORKS_NO_ERROR) {
        return {nCompute, std::move(outputShapes), timing};
    }

    // Write back to output device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const Memory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
            n = copyHidlMemoryToIBuffer(memories[i]->getHidlMemory(), memory->getIBuffer(), {});
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return {n, {}, kNoTiming};
            }
        }
    }
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

}  // namespace nn
}  // namespace android