/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include <android/sync.h>

#include <algorithm>
#include <limits>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "CompilationBuilder.h"
#include "ControlFlow.h"
#include "CpuExecutor.h"
#include "ExecutionBurstController.h"
#include "HalInterfaces.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"

namespace android {
namespace nn {

using namespace hal;

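// UINT64_MAX in a Timing field means "no duration was measured"; getDuration() relies on
// this sentinel when converting to the NDK representation.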
const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
    return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
}

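// Checks an optional ANeuralNetworksOperandType override against the model's operand: an
// override may only fill in dimensions that the operand leaves unspecified (zero). With no
// override, the operand itself must be fully specified unless allowUnspecified is set.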
static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
                               const char* tag, bool allowUnspecified) {
    if (newType != nullptr) {
        const Extension::OperandTypeInformation* info = nullptr;
        if (isExtensionOperandType(operand.type)) {
            NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
        }
        if (validateOperandType(*newType, info, tag, allowUnspecified) !=
            ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << tag << ": Invalid newType";
            return false;
        }
        if (operand.dimensions.size() == 0) {
            return true;
        }
        if (operand.dimensions.size() != newType->dimensionCount) {
            LOG(ERROR) << tag << ": Setting with incompatible dimension count";
            return false;
        }
        for (uint32_t i = 0; i < newType->dimensionCount; i++) {
            if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
                LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
                return false;
            }
        }
    } else {
        if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
            tensorHasUnspecifiedDimensions(operand)) {
            LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
            return false;
        }
    }
    return true;
}

ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
    : mCompilation(compilation),
      mModel(compilation->mModel),
      mPlan(&compilation->mPlan),
      mPartitioning(compilation->mPartitioning),
      mInputs(mModel->inputCount()),
      mOutputs(mModel->outputCount()) {
    VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                    << " inputs and " << mOutputs.size() << " outputs";
}

const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
    return mPlan->getSourceModels().getModel(index);
}

bool ExecutionBuilder::isFinished() const {
    CHECK(!(mFinishedWithoutSyncFence && hasSyncFence()));
    if (mFinishedWithoutSyncFence) {
        return true;
    }
    if (hasSyncFence()) {
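        // Poll the fence without blocking: a timeout of 0 returns its current state immediately.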
        auto r = syncWait(mSyncFenceFd, 0);
        CHECK(r != FenceState::UNKNOWN);
        return r != FenceState::ACTIVE;
    }
    return false;
}

ExecutionBuilder::Completion ExecutionBuilder::completedWith() const {
    CHECK(isFinished());
    if (hasSyncFence()) {
        auto r = syncWait(mSyncFenceFd, 0);
        CHECK(r == FenceState::SIGNALED || r == FenceState::ERROR);
        return (r == FenceState::SIGNALED) ? Completion::NO_ERROR : Completion::OTHER_ERROR;
    } else {
        return mCompletionWithoutSyncFence;
    }
}

int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
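    // Lengths are carried downstream as uint32_t, so reject values that cannot be represented.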
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getInputOperand(index), type, const_cast<void*>(buffer), l);
    return n;
}

int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const Memory* memory, size_t offset, size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInputFromMemory", false)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0, indicating that the entire memory
    // region is used. We update the length here because drivers expect a real length. For memory
    // types that do not allow this semantic, MemoryValidatorBase::validate rejects the request
    // before reaching this point.
    if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
        length = memory->getHidlMemory().size();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR)
                << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
                   "been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromMemory(
            mModel->getInputOperand(index), type, poolIndex, offset, length);
    return n;
}

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
                                void* buffer, size_t length) {
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutput", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) =
            ModelArgumentInfo::createFromPointer(mModel->getOutputOperand(index), type, buffer, l);
    return n;
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const Memory* memory, size_t offset, size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutputFromMemory", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0, indicating that the entire memory
    // region is used. We update the length here because drivers expect a real length. For memory
    // types that do not allow this semantic, MemoryValidatorBase::validate rejects the request
    // before reaching this point.
    if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
        length = memory->getHidlMemory().size();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromMemory(
            mModel->getOutputOperand(index), type, poolIndex, offset, length);
    return n;
}

int ExecutionBuilder::setMeasureTiming(bool measure) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
                   << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
                   << "that was not created by ANeuralNetworksCompilation_createForDevices "
                   << "with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mMeasureTiming = measure;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
    if (!isFinished()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
                      "execution has finished.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() != Completion::NO_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
                      "that has encountered an error.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    // NOTE: At the HAL level, timing is in microseconds. At the NDK level, nanoseconds.
    const uint64_t kNanoPerMicro = 1000;

    if (!mMeasureTiming) {
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
    Timing timingFenced = timingLaunched;
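    // For fenced executions, the authoritative timing comes from the fenced execution callback.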
    if (mFencedExecutionCallback != nullptr) {
        ErrorStatus status;
        const Return<void> ret = mFencedExecutionCallback->getExecutionInfo(
                [&status, &timingLaunched, &timingFenced](ErrorStatus error, Timing tLaunched,
                                                          Timing tFenced) {
                    status = error;
                    timingLaunched = tLaunched;
                    timingFenced = tFenced;
                });
        if (!ret.isOk()) {
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_OP_FAILED;
        }
        if (status != ErrorStatus::NONE) {
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_BAD_STATE;
        }
    }
    uint64_t microDuration = UINT64_MAX;
    switch (durationCode) {
        case ANEURALNETWORKS_DURATION_ON_HARDWARE:
            microDuration = timingLaunched.timeOnDevice;
            break;
        case ANEURALNETWORKS_DURATION_IN_DRIVER:
            microDuration = timingLaunched.timeInDriver;
            break;
        case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
            microDuration = timingFenced.timeOnDevice;
            break;
        case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
            microDuration = timingFenced.timeInDriver;
            break;
        default:
            CHECK(!"unexpected");
    }
    *duration = (microDuration == UINT64_MAX) ? UINT64_MAX : kNanoPerMicro * microDuration;

    VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
                      "created from an ANeuralNetworksCompilation that was not created by "
                      "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > 0) {
        mTimeoutDuration = duration;
    } else {
        mTimeoutDuration.reset();
    }
    return ANEURALNETWORKS_NO_ERROR;
}

std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
    return mTimeoutDuration;
}

int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > operation_while::kTimeoutNsMaximum) {
        LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
                     << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
        duration = operation_while::kTimeoutNsMaximum;
    }
    mLoopTimeoutDuration = duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
    if (!isFinished()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }

    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
                   << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    const auto& dims = mOutputs[index].dimensions();
    if (dims.empty()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions cannot query "
                      "dimensions of a scalar";
        return ANEURALNETWORKS_BAD_DATA;
    }
    std::copy(dims.begin(), dims.end(), dimensions);
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
    if (!isFinished()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

// Attempt synchronous execution of full model on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//         with an explicit device list; and we do not support CPU fallback
//         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
        ExecutionBuilder* executionBuilder) {
    CHECK(executionBuilder != nullptr);
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
    VLOG(EXECUTION) << "cpuFallbackFull";

    // Get fallback executor.
    StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                          DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr);
    executor.mapInputsAndOutputsTrivially();

    // Attempt fallback execution.
    return executor.computeOnCpuFallback();
}

// Attempt synchronous execution on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//         with an explicit device list; and we do not support CPU fallback
//         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
cpuFallbackPartial(const ExecutionPlan& plan,
                   std::shared_ptr<ExecutionPlan::Controller> controller) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
    VLOG(EXECUTION) << "cpuFallbackPartial";

    // Get fallback executor.
    std::shared_ptr<StepExecutor> executor;
    int n1 = plan.fallback(controller, &executor);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, kNoTiming, nullptr};
    }
    CHECK(executor != nullptr);

    // Attempt fallback execution.
    auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
    return {n2, std::move(outputShapes), timing, executor};
}

static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
                                         const ExecutionPlan& plan,
                                         std::shared_ptr<ExecutionPlan::Controller> controller,
                                         bool allowFallback,
                                         const std::optional<Deadline>& deadline,
                                         const sp<ExecutionCallback>& executionCallback) {
    CHECK(executionBuilder != nullptr);
    VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)";

    std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
    Timing timing = kNoTiming;
    // Disallow fallback when the ExecutionPlan is simple on CPU.
    allowFallback &= !plan.isSimpleCpu();

    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        std::shared_ptr<ExecutionBurstController> burstController;
        int n = plan.next(controller, &executor, &burstController);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (allowFallback && !missedDeadline) break;
            executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
            return;
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            executionCallback->notify(ErrorStatus::NONE, outputShapes, timing);
            return;
        }
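        // Remember whether this step ran on the CPU; a failed CPU step is not retried
        // through another CPU fallback.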
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);

        // Update global outputs.
        if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
            stepN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            // We only support collection of timing information in the case of a
            // single step, so it's safe to just keep track of the last step's
            // timing information.
            timing = stepTiming;
            continue;
        }

        // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
            executionCallback->notify(stepStatus, outputShapes, kNoTiming);
            return;
        }

        // If fallback is not allowed and there was an error, end execution.
        if (!allowFallback) {
            const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
            executionCallback->notify(stepStatus, {}, kNoTiming);
            return;
        }

        // If CPU execution was already attempted, either:
        // (1) perform a full fallback if the plan is not simple, or
        // (2) return from the function with an error
        if (executorIsCpu) {
            if (!plan.isSimple()) break;
            executionCallback->notify(convertResultCodeToErrorStatus(stepN), {}, kNoTiming);
            return;
        }

        // If the code reaches this point, attempt a partial fallback to CPU.
        CHECK(allowFallback);
        auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
                cpuFallbackPartial(plan, controller);

        // Update global outputs.
        if (fallbackExecutor != nullptr &&
            !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
            fallbackN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
            // We only support collection of timing information in the case of a
            // single step, so it's safe to just keep track of the last step's
            // timing information.
            timing = fallbackTiming;
            continue;
        }

        // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
        if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
            executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
            return;
        }

        // Do not fall back twice if the ExecutionPlan is simple.
        if (plan.isSimple()) {
            const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
            executionCallback->notify(fallbackStatus, {}, kNoTiming);
            return;
        }

        // If the code reaches this point, then there was an error with the
        // fallback. In this case, attempt full fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions, so do a full execution fallback
    // on the CPU.
    auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
    const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
    executionCallback->notify(fullStatus, fullOutputShapes, fullTiming);
}

// In the case of partitioned execution, the startComputeFenced call returns the sync
// fence and the fenced compute callback from the last partition.
// If any partition fails and allowFallback is set to true, the whole execution
// falls back to CPU.
static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
        ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
        std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
        uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
        bool allowFallback) {
    CHECK(executionBuilder != nullptr);
    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
    // Disallow fallback when the ExecutionPlan is simple on CPU.
    allowFallback &= !plan.isSimpleCpu();

    // Initiate waitForFds, syncFence for the first step.
    std::vector<int> waitForFds = waitFor;
    int syncFence = -1;
    sp<hal::IFencedExecutionCallback> computeFencedCallback;

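    // Walk the plan step by step, threading each step's output fence into the next
    // step's wait list.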
    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        int n = plan.next(controller, &executor, nullptr, syncFence);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (allowFallback && !missedDeadline) break;
            // Return -1 for the sync fence fd, and nullptr for the callback.
            return std::make_tuple(n, -1, nullptr);
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            // If the final step returns a -1 for sync fence, the execution is finished.
            // Update the output shapes.
            if (syncFence == -1) {
                // TODO(miaowang): support dynamic output shape only with memory domain.
                // For now just return the initial output shapes.
                executionBuilder->finishWithoutSyncFence(
                        ErrorStatus::NONE, executionBuilder->getInitialOutputShapes());
            }
            return std::make_tuple(ANEURALNETWORKS_NO_ERROR, syncFence, computeFencedCallback);
        }
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, syncFd, callback] =
                executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);

        // Update waitForFds, syncFence for the next step.
        syncFence = syncFd;
        computeFencedCallback = callback;
        waitForFds.clear();
        if (syncFd > 0) {
            waitForFds = {syncFd};
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            continue;
        }
        // If fallback is not allowed and there was an error, end execution.
        if (!allowFallback) {
            return std::make_tuple(stepN, -1, nullptr);
        }

        // If CPU execution was already attempted, either:
        // (1) perform a full fallback if the plan is not simple, or
        // (2) return from the function with an error
        if (executorIsCpu) {
            if (!plan.isSimple()) break;
            return std::make_tuple(stepN, -1, nullptr);
        }
        // If the code reaches this point, a step execution failed and fallback
        // is allowed. Attempt full fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions, so do a full execution fallback
    // on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
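    // Wait for all incoming fences to signal before running on the CPU; a timeout of -1
    // blocks until the fence resolves.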
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
                return std::make_tuple(ANEURALNETWORKS_OP_FAILED, -1, nullptr);
            }
        }
    }
    auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
    const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
    syncFence = -1;
    executionBuilder->finishWithoutSyncFence(fullStatus, fullOutputShapes);
    executionBuilder->reportTimingWithoutFencedExecutionCallback(fullTiming);
    return std::make_tuple(fullN, syncFence, nullptr);
}

int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
                                    uint64_t timeoutDurationAfterFence, int* syncFence) {
    CHECK(syncFence != nullptr);
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                      " called on an execution that has already started";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (timeoutDurationAfterFence > 0) {
        if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
            LOG(ERROR)
                    << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
                       "duration on an ANeuralNetworksExecution "
                       "created from an ANeuralNetworksCompilation that was not created by "
                       "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    const auto deadline = makeDeadline(mTimeoutDuration);
    for (auto& p : mInputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                          " not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : mOutputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                          " not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (uint32_t i = 0; i < mOutputs.size(); i++) {
        if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
            !checkDimensionInfo(mModel->getOutputOperand(i), nullptr,
                                "ANeuralNetworksExecution_startComputeWithDependencies", false)) {
            LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                          " not all outputs have fully specified dimensions";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    mStarted = true;
    const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
    int result;
    std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
            this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
    *syncFence = mSyncFenceFd;
    return result;
}

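// Entry point shared by the NDK calls ANeuralNetworksExecution_compute (synchronous:
// synchronizationCallback == nullptr), ANeuralNetworksExecution_startCompute (asynchronous),
// and ANeuralNetworksExecution_burstCompute (burstBuilder != nullptr); see the name() lambda
// below for the mapping.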
int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback,
                              BurstBuilder* burstBuilder) {
    CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
            << "synchronizationCallback and burstBuilder cannot simultaneously be used";

    const bool synchronous = (synchronizationCallback == nullptr);
    if (!synchronous) {
        *synchronizationCallback = nullptr;
    }

    const auto deadline = makeDeadline(mTimeoutDuration);

    // TODO validate that we have full types for all inputs and outputs,
    // that the graph is not cyclic,

    auto name = [synchronous, burstBuilder] {
        return burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
    };
    if (mStarted) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name()
                   << " called on an execution that has already started";
        return ANEURALNETWORKS_BAD_STATE;
    }
    for (auto& p : mInputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        } else if (p.state() == ModelArgumentInfo::MEMORY) {
            const Memory* memory = mMemories[p.locationAndLength().poolIndex];
            if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
                return ANEURALNETWORKS_OP_FAILED;
            }
        }
    }
    for (auto& p : mOutputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }

    auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
        return finishWithoutSyncFence(error, outputShapes);
    };

    // TODO: For asynchronous execution, entire plan-based-path should run in an
    // asynchronous thread -- take the asynchronous thread logic out of
    // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
    mStarted = true;
    const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
    std::shared_ptr<ExecutionPlan::Controller> controller =
            mPlan->makeController(this, burstBuilder);
    if (synchronous) {
        VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
        sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
        localSynchronizationCallback->setOnFinish(wrappedFinish);
        asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
                                     localSynchronizationCallback);
        localSynchronizationCallback->wait();
        if (mMeasureTiming) {
            mTimingWithoutFencedExecutionCallback = localSynchronizationCallback->getTiming();
        }
        return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus());
    } else /* asynchronous */ {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.

        // Prepare the callback for asynchronous execution.
        // sp<ExecutionCallback> object is returned when the
        // execution has been successfully launched, otherwise a
        // nullptr is returned.  The executionCallback is
        // abstracted in the NN API as an "event".
        sp<ExecutionCallback> executionCallback = new ExecutionCallback();
        executionCallback->setOnFinish(wrappedFinish);
        if (DeviceManager::get()->syncExecRuntime()) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
            asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
                                         executionCallback);
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
            std::thread asyncExecution(
                    [this, controller, allowFallback, deadline, executionCallback] {
                        asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
                                                     deadline, executionCallback);
                    });
            executionCallback->bindThread(std::move(asyncExecution));
        }
        *synchronizationCallback = executionCallback;
        return ANEURALNETWORKS_NO_ERROR;
    }
}

std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
    std::vector<OutputShape> outputShapes(mOutputs.size());
    std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
                   [](const auto& x) -> OutputShape {
                       hidl_vec<uint32_t> dimensions;
                       if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
                           dimensions = x.dimensions();
                       }
                       return {.dimensions = std::move(dimensions), .isSufficient = true};
                   });
    return outputShapes;
}

// Check if the dimensions "to" are updatable by dimensions "from", where "from"
// must be at a higher specification level.
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
    if (to.size() == 0) return true;
    NN_RET_CHECK_EQ(to.size(), from.size());
    for (uint32_t i = 0; i < to.size(); i++) {
        NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
    }
    return true;
}

bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
    if (outputShapes.size() == 0) {
        return true;
    }
    NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        // Check if only unspecified dimensions or rank are overwritten.
        NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
        const OperandType operandType = mModel->getOutputOperand(i).type;
        NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
                                                                    outputShapes[i].dimensions));
    }
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        mOutputs[i].dimensions() = outputShapes[i].dimensions;
        mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
    }
    return true;
}

bool ExecutionBuilder::updateMemories() {
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const Memory* memory = mMemories[output.locationAndLength().poolIndex];
        NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
    }
    return true;
}

ErrorStatus ExecutionBuilder::finishWithoutSyncFence(ErrorStatus status,
                                                     const std::vector<OutputShape>& outputShapes) {
    CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
    CHECK(!hasSyncFence())
            << "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
    if (!updateOutputShapes(outputShapes) || !updateMemories()) {
        status = ErrorStatus::GENERAL_FAILURE;
    }
    bool success = status == ErrorStatus::NONE;
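    // Propagate success or failure to the output device memories so that their
    // validators know whether the contents are initialized.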
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const Memory* memory = mMemories[output.locationAndLength().poolIndex];
        memory->getValidator().setInitialized(success);
    }
    switch (convertErrorStatusToResultCode(status)) {
        case ANEURALNETWORKS_NO_ERROR:
            mCompletionWithoutSyncFence = Completion::NO_ERROR;
            break;
        case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
            mCompletionWithoutSyncFence = Completion::OUTPUT_INSUFFICIENT_SIZE;
            break;
        default:
            mCompletionWithoutSyncFence = Completion::OTHER_ERROR;
            break;
    }
    mFinishedWithoutSyncFence = true;
    return status;
}

bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
                                      std::vector<OutputShape>* to) {
    if (from.size() == 0) {
        return true;
    }
    if (mExecutionStep != nullptr) {
        const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
        NN_RET_CHECK_LE(indexMapping.size(), from.size());
        for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
            uint32_t toIndex = indexMapping[i];
            NN_RET_CHECK_GT(to->size(), toIndex);
            NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
            (*to)[toIndex] = from[i];
        }
    } else {
        NN_RET_CHECK_EQ(from.size(), to->size());
        for (uint32_t i = 0, e = from.size(); i < e; i++) {
            NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
            (*to)[i] = from[i];
        }
    }
    return true;
}

StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                           std::shared_ptr<Device> device,
                           std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
    : mExecutionBuilder(executionBuilder),
      mExecutionStep(step),
      mModel(model),
      mDevice(device),
      mPreparedModel(preparedModel),
      mInputs(model->inputCount()),
      mOutputs(model->outputCount()) {
    CHECK(mDevice != nullptr);
    VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
                    << mOutputs.size() << " outputs";
}

void StepExecutor::mapInputsAndOutputsTrivially() {
    mInputs = mExecutionBuilder->mInputs;
    mOutputs = mExecutionBuilder->mOutputs;
    mMemories = mExecutionBuilder->mMemories;
}

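// Copies one argument description from the ExecutionBuilder, remapping its memory pool
// index into this StepExecutor's pool set.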
void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                                    ModelArgumentInfo* executorInputOrOutput) {
    *executorInputOrOutput = builderInputOrOutput;
    switch (executorInputOrOutput->state()) {
        default:
            CHECK(false) << "unexpected ModelArgumentInfo::state";
            break;
        case ModelArgumentInfo::HAS_NO_VALUE:
        case ModelArgumentInfo::POINTER:
        case ModelArgumentInfo::UNSPECIFIED:
            break;
        case ModelArgumentInfo::MEMORY: {
            const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
            const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
            const uint32_t executorPoolIndex = mMemories.add(memory);
            executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
            break;
        }
    }
}

int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
                                             const Memory* memory, uint32_t offset,
                                             ModelArgumentInfo* inputOrOutputInfo) {
    // Should be similar to
    //     ExecutionBuilder::setInputFromMemory()
    //     ExecutionBuilder::setOutputFromMemory()

    uint32_t poolIndex = mMemories.add(memory);
    uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
    CHECK(inputOrOutputInfo->unspecified());
    int n;
    std::tie(n, *inputOrOutputInfo) =
            ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
                                                /*type=*/nullptr, poolIndex, offset, length);
    return n;
}

static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
    for (unsigned i = 0; i < args.size(); i++) {
        const auto& arg = args[i];
        std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
        switch (arg.state()) {
            case ModelArgumentInfo::POINTER:
                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
                break;
            case ModelArgumentInfo::MEMORY:
                VLOG(EXECUTION) << prefix << "MEMORY("
                                << "pool=" << arg.locationAndLength().poolIndex << ", "
                                << "off=" << arg.locationAndLength().offset << ")";
                break;
            case ModelArgumentInfo::HAS_NO_VALUE:
                VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
                break;
            case ModelArgumentInfo::UNSPECIFIED:
                VLOG(EXECUTION) << prefix << "UNSPECIFIED";
                break;
            default:
                VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
                break;
        }
    }
}

bool StepExecutor::isCpu() const {
    return mDevice == DeviceManager::getCpuDevice();
}

static OptionalTimeoutDuration makeTimeoutDuration(uint64_t nanoseconds) {
    OptionalTimeoutDuration otd;
    otd.nanoseconds(nanoseconds);
    return otd;
}

std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
        const std::optional<Deadline>& deadline,
        const std::shared_ptr<ExecutionBurstController>& burstController) {
    return computeWithMemories(deadline, mMemories.getObjects(), burstController);
}

std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeWithMemories(
        const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& burstController) {
    CHECK(mPreparedModel != nullptr);

    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalTimeoutDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    const auto [n, outputShapes, timing] = mPreparedModel->execute(
            mInputs, mOutputs, memories, burstController, measure, deadline, loopTimeoutDuration);
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);

    return {n, std::move(outputShapes), timing};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>> StepExecutor::computeFenced(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const std::optional<Deadline>& deadline) {
    CHECK(mPreparedModel != nullptr);

    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalTimeoutDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    OptionalTimeoutDuration optionalTimeoutDurationAfterFence;
    if (timeoutDurationAfterFence > 0) {
        optionalTimeoutDurationAfterFence.nanoseconds(timeoutDurationAfterFence);
    }
    const auto [n, syncFence, computeFencedCallback, timing] = mPreparedModel->executeFenced(
            mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline,
            loopTimeoutDuration, optionalTimeoutDurationAfterFence);
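    // With no fence and no fenced callback, the execution already completed, so the timing
    // returned by executeFenced() is final; record it now.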
    if (syncFence < 0 && computeFencedCallback == nullptr) {
        mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    }
    return {n, syncFence, computeFencedCallback};
}

// For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute.
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
    VLOG(EXECUTION) << "Re-compile the model on CPU";
    mDevice = DeviceManager::getCpuDevice();
    mPreparedModel = nullptr;
    const ModelFactory makeModel = [this] { return mModel->makeHidlModel(); };
    // TODO: Propagate user preference and compilation priority to this point instead of using
    // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
    // ANEURALNETWORKS_PRIORITY_MEDIUM
    const ExecutionPreference preference =
            static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
    const Priority priority = convertToHalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
    auto [n, preparedModel] = mDevice->prepareModel(makeModel, preference, priority, {}, {}, {});
    mPreparedModel = std::move(preparedModel);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, {}, kNoTiming};
    }

    // Prepare device memories for CPU fallback.
    std::vector<const Memory*> memories = mMemories.getObjects();
    std::vector<bool> isUsedAsInput(memories.size(), false);
    std::vector<bool> isUsedAsOutput(memories.size(), false);
    std::vector<std::unique_ptr<Memory>> blobAhwbs;

    // Mark the input and output usages.
    for (auto& input : mInputs) {
        if (input.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = input.locationAndLength().poolIndex;
            isUsedAsInput[poolIndex] = true;
        }
    }
    for (auto& output : mOutputs) {
        if (output.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = output.locationAndLength().poolIndex;
            // Cannot allocate output buffers with unknown shapes.
            if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
                LOG(ERROR) << "Cannot fall back to CPU because at least one of the output operands "
                              "has unknown shape.";
                return {ANEURALNETWORKS_OP_FAILED, {}, kNoTiming};
            }
            isUsedAsOutput[poolIndex] = true;
        }
    }

    // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const Memory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr) {
            const uint32_t size = memory->getValidator().getMetadata().logicalSize;
            auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
            if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
                return {nAhwb, {}, kNoTiming};
            }
            if (isUsedAsInput[i]) {
                n = copyIBufferToHidlMemory(memory->getIBuffer(), blobAhwb->getHidlMemory());
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    return {n, {}, kNoTiming};
                }
            }
            memories[i] = blobAhwb.get();
            blobAhwbs.push_back(std::move(blobAhwb));
        }
    }

    auto [nCompute, outputShapes, timing] = computeWithMemories({}, memories);
    if (nCompute != ANEURALNETWORKS_NO_ERROR) {
        return {nCompute, std::move(outputShapes), timing};
    }

    // Write back to output device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const Memory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
            n = copyHidlMemoryToIBuffer(memories[i]->getHidlMemory(), memory->getIBuffer(), {});
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return {n, {}, kNoTiming};
            }
        }
    }
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

}  // namespace nn
}  // namespace android