/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <Tracing.h>
#include <android-base/logging.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>

#include <algorithm>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "Telemetry.h"
#include "TypeManager.h"

namespace android {
namespace nn {

// Partial validation of output shapes returned from driver, to ensure they
// conform to a very specific set of rules.
static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    // Enforces the following rules (some of which are from b/154054474):
    // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
    //   If the vector is not empty, it must have as many entries as the step model has outputs.
    // - If NONE, then either shapes vector is empty, or every shape is
    //   marked isSufficient and, if a tensor, has known rank.
    // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty.  At least one entry
    //   is marked !isSufficient.
    switch (executionStatus) {
        case ErrorStatus::NONE: {
            NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty or of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape is unexpectedly marked !isSufficient";

            const TypeManager* tm = TypeManager::get();
            for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
                 ++outputIndex) {
                const Operand& outputOperand = model->getOutputOperand(outputIndex);
                NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
                             (shapes[outputIndex].dimensions.size() != 0))
                        << "With execution ErrorStatus " << executionStatus << " output#"
                        << outputIndex << " shape unexpectedly has zero rank";
            }

            break;
        }
        case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
            NN_RET_CHECK(shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return !shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape must have been marked !isSufficient";
            break;
        }
        default: {
            NN_RET_CHECK(shapes.size() == 0)
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty but has length " << shapes.size();
            break;
        }
    }
    return true;
}
static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
                                          model, shapes);
}

static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
    return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
}

static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
                               const char* tag, bool allowUnspecified) {
    if (newType != nullptr) {
        const Extension::OperandTypeInformation* info = nullptr;
        if (isExtension(operand.type)) {
            NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
        }
        if (validateOperandType(*newType, info, tag, allowUnspecified) !=
            ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << tag << ": Invalid newType";
            return false;
        }
        if (operand.dimensions.size() == 0) {
            return true;
        }
        if (operand.dimensions.size() != newType->dimensionCount) {
            LOG(ERROR) << tag << ": Setting with incompatible dimension count (existing = "
                       << operand.dimensions.size() << ", new = " << newType->dimensionCount << ")";
            return false;
        }
        for (uint32_t i = 0; i < newType->dimensionCount; i++) {
            if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
                LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
                return false;
            }
        }
    } else {
        if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
            tensorHasUnspecifiedDimensions(operand)) {
            LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
            return false;
        }
    }
    return true;
}
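
// Illustrative consequence of the rules enforced by checkDimensionInfo() above (a sketch
// only, not part of this file's API surface; "execution", "buffer", and "newDims" are
// hypothetical): a type supplied at execution time may fill in dimensions the model left
// unspecified (0), but may not override a fully specified dimension. For a model input
// declared as {2, 0, 4}, newDims = {2, 3, 4} is accepted while {5, 3, 4} is rejected.
//
//   ANeuralNetworksOperandType t = {
//           .type = ANEURALNETWORKS_TENSOR_FLOAT32,
//           .dimensionCount = 3,
//           .dimensions = newDims,
//   };
//   ANeuralNetworksExecution_setInput(execution, 0, &t, buffer, bufferSize);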

ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
    : mCompilation(compilation),
      mModel(compilation->mModel),
      mPlan(&compilation->mPlan),
      mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)),
      mInputs(mModel->inputCount()),
      mOutputs(mModel->outputCount()) {
    VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                    << " inputs and " << mOutputs.size() << " outputs";
}

SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isSimple());
}

CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isCompound());
}

const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
    return mPlan->getSourceModels().getModel(index);
}

int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getInputOperand(index), type, const_cast<void*>(buffer), l,
            mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}
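
// Illustrative client-side usage of the public entry point that reaches setInput()
// above (a sketch only, not part of this file; assumes a valid "execution" whose input
// operand 0 is a fully specified float32 tensor of four elements):
//
//   float input[4] = {0.f, 1.f, 2.f, 3.f};
//   int n = ANeuralNetworksExecution_setInput(execution, 0, /*type=*/nullptr, input,
//                                             sizeof(input));
//   // n is ANEURALNETWORKS_NO_ERROR on success.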

int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const RuntimeMemory* memory, size_t offset,
                                         size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInputFromMemory", false)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR)
                << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
                   "been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getInputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}
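
// Sketch of the "whole memory" convention handled above (illustrative only, not part of
// this file): for a memory created from a descriptor, a client may pass offset == 0 and
// length == 0 to bind the entire region.
//
//   // ANeuralNetworksMemory* memory obtained from ANeuralNetworksMemory_createFromDesc
//   int n = ANeuralNetworksExecution_setInputFromMemory(execution, 0, /*type=*/nullptr,
//                                                       memory, /*offset=*/0, /*length=*/0);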

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
                                void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutput", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getOutputOperand(index), type, buffer, l, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const RuntimeMemory* memory, size_t offset,
                                          size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()

    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutputFromMemory", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) =
            ModelArgumentInfo::createFromMemory(mModel->getOutputOperand(index), type, poolIndex,
                                                offset, length, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setMeasureTiming(bool measure) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
                   << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
                   << "that was not created by ANeuralNetworksCompilation_createForDevices "
                   << "with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mMeasureTiming = measure;
    return ANEURALNETWORKS_NO_ERROR;
}
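
// Illustrative call sequence for timing measurement (a sketch only, not part of this
// file): timing may only be requested when the compilation targeted exactly one device.
//
//   // compilation created with
//   // ANeuralNetworksCompilation_createForDevices(model, &device, 1, &compilation)
//   ANeuralNetworksExecution_setMeasureTiming(execution, true);
//   // ... run the execution, then query durations via
//   // ANeuralNetworksExecution_getDuration (see below).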

int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
                      "execution has finished.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() != Completion::NO_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
                      "that has encountered an error.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    if (!mMeasureTiming) {
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
    Timing timingFenced = timingLaunched;
    if (mFencedExecutionCallback != nullptr) {
        auto result = mFencedExecutionCallback();
        if (!result.has_value()) {
            LOG(ERROR) << "Fenced execution callback failed: " << result.error().message;
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_BAD_STATE;
        }
        std::tie(timingLaunched, timingFenced) = std::move(result).value();
    }
    const OptionalDuration selectedDuration = [durationCode, &timingLaunched,
                                               &timingFenced]() -> OptionalDuration {
        switch (durationCode) {
            case ANEURALNETWORKS_DURATION_ON_HARDWARE:
                return timingLaunched.timeOnDevice;
            case ANEURALNETWORKS_DURATION_IN_DRIVER:
                return timingLaunched.timeInDriver;
            case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
                return timingFenced.timeOnDevice;
            case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
                return timingFenced.timeInDriver;
            default:
                LOG(FATAL) << "unexpected";
                return std::nullopt;
        }
    }();
    if (selectedDuration.has_value()) {
        constexpr uint64_t kMaxTiming = std::numeric_limits<uint64_t>::max() - 1;
        using CommonType = std::common_type_t<Duration::rep, uint64_t>;
        const auto count = std::min<CommonType>(selectedDuration.value().count(), kMaxTiming);
        *duration = static_cast<uint64_t>(count);
    } else {
        constexpr uint64_t kNoTiming = std::numeric_limits<uint64_t>::max();
        *duration = kNoTiming;
    }

    VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
    return ANEURALNETWORKS_NO_ERROR;
}
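
// Illustrative query of a measured duration after a successful computation (a sketch
// only, not part of this file; UINT64_MAX means "no timing information available"):
//
//   uint64_t onHardwareNanos = 0;
//   ANeuralNetworksExecution_getDuration(execution, ANEURALNETWORKS_DURATION_ON_HARDWARE,
//                                        &onHardwareNanos);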

int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
                      "created from an ANeuralNetworksCompilation that was not created by "
                      "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > 0) {
        mTimeoutDuration = duration;
    } else {
        mTimeoutDuration.reset();
    }
    return ANEURALNETWORKS_NO_ERROR;
}

std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
    return mTimeoutDuration;
}

TimePoint ExecutionBuilder::getComputeStartTimePoint() const {
    CHECK(computationStarted()) << "getComputeStartTimePoint called before "
                                << "execution has started.";
    return mComputeStartTimePoint;
}

int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > operation_while::kTimeoutNsMaximum) {
        LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
                     << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
        duration = operation_while::kTimeoutNsMaximum;
    }
    mLoopTimeoutDuration = duration;
    return ANEURALNETWORKS_NO_ERROR;
}
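
// Illustrative use of the loop timeout for WHILE-loop control flow (a sketch only, not
// part of this file); durations above the maximum are clamped by the function above:
//
//   ANeuralNetworksExecution_setLoopTimeout(execution,
//                                           ANeuralNetworks_getDefaultLoopTimeout());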

int ExecutionBuilder::enableInputAndOutputPadding(bool enable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (mHasCalledSetInputOutput) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after an input "
                      "or output is set.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mInputAndOutputPaddingEnabled = enable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setReusable(bool reusable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setReusable called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mReusable = reusable;
    return ANEURALNETWORKS_NO_ERROR;
}
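
// Illustrative reusable-execution flow (a sketch only, not part of this file): a
// reusable execution may be computed repeatedly with the same inputs and outputs,
// which checkAndSetComputationState() below permits when mReusable is set.
//
//   ANeuralNetworksExecution_setReusable(execution, true);
//   ANeuralNetworksExecution_compute(execution);  // first computation
//   ANeuralNetworksExecution_compute(execution);  // allowed again because reusable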

int ExecutionBuilder::addExtensionAttribute(const char* extensionName,
                                            uint16_t attributeCodeWithinExtension, const void* data,
                                            size_t length) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called on an "
                      "ANeuralNetworksExecution created from an ANeuralNetworksCompilation that "
                      "was not created by ANeuralNetworksCompilation_createForDevices with "
                      "numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called after the execution "
                      "has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int32_t attributeToken = 0;
    if (!TypeManager::get()->getExtensionType(extensionName, attributeCodeWithinExtension,
                                              &attributeToken)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (std::find_if(mMetadata.begin(), mMetadata.end(), [attributeToken](const auto& entry) {
            return attributeToken == entry.token;
        }) != mMetadata.end()) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called more than once for "
                      "the same attribute";
        return ANEURALNETWORKS_BAD_DATA;
    }
    const uint8_t* dataPtr = reinterpret_cast<const uint8_t*>(data);
    mMetadata.push_back({attributeToken, std::vector<uint8_t>(dataPtr, dataPtr + length)});
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }

    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
                   << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    const auto& dims = mOutputs[index].dimensions();
    if (dims.empty()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions can not query "
                      "dimensions of a scalar";
        return ANEURALNETWORKS_BAD_DATA;
    }
    std::copy(dims.begin(), dims.end(), dimensions);
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}
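
// Illustrative pattern for retrieving output dimensions after an execution completes
// (a sketch only, not part of this file): query the rank first, then size the buffer
// passed to the dimensions query.
//
//   uint32_t rank = 0;
//   ANeuralNetworksExecution_getOutputOperandRank(execution, 0, &rank);
//   std::vector<uint32_t> dims(rank);
//   ANeuralNetworksExecution_getOutputOperandDimensions(execution, 0, dims.data());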

bool ExecutionBuilder::checkAndSetComputationState(const char* name) {
    std::lock_guard<std::mutex> lock(mStateMutex);
    if (!mReusable && mState == State::COMPLETED) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on a non-reusable execution that has already completed";
        return false;
    }
    if (mState == State::COMPUTATION) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on an execution that has already started";
        return false;
    }
    mState = State::COMPUTATION;
    return true;
}

// TODO(b/132321855): validate that we have full types for all inputs and outputs,
// and that the graph is not cyclic.
static int validateRequest(const std::vector<ModelArgumentInfo>& inputs,
                           const std::vector<ModelArgumentInfo>& outputs) {
    for (auto& p : inputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : outputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getValidationResultCode() {
    if (!mValidationResultCode.has_value()) {
        mValidationResultCode = validateRequest(mInputs, mOutputs);
    }
    return mValidationResultCode.value();
}

bool ExecutionBuilder::areOutputsFullySpecified() {
    if (!mOutputsFullySpecified.has_value()) {
        mOutputsFullySpecified = true;
        for (uint32_t i = 0; i < mOutputs.size(); i++) {
            if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
                TypeManager::get()->isTensorType(mModel->getOutputOperand(i).type) &&
                tensorHasUnspecifiedDimensions(mModel->getOutputOperand(i).type,
                                               mOutputs[i].initialDimensions())) {
                mOutputsFullySpecified = false;
                break;
            }
        }
    }
    return mOutputsFullySpecified.value();
}

int ExecutionBuilder::prepareForCompute(const char* name, ExecutionMode mode) {
    if (!checkAndSetComputationState(name)) {
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (int n = getValidationResultCode(); n != ANEURALNETWORKS_NO_ERROR) {
        return finishComputation(n, {}, mode);
    }
    return ANEURALNETWORKS_NO_ERROR;
}

// Attempt synchronous execution of full model on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//         with an explicit device list; and we do not support CPU fallback
//         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
        ExecutionBuilder* executionBuilder) {
    CHECK(executionBuilder != nullptr);
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
    VLOG(EXECUTION) << "cpuFallbackFull";

    // Get fallback executor.
    StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                          DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr,
                          /*reusable=*/false);
    executor.mapInputsAndOutputsTrivially();

    // Attempt fallback execution.
    return executor.computeOnCpuFallback();
}

// Attempt synchronous execution on CPU.
// TODO: How should we handle timing in this case?
//       For Q this is irrelevant: We only support timing in conjunction
//         with an explicit device list; and we do not support CPU fallback
//         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
cpuFallbackPartial(const ExecutionPlan& plan,
                   std::shared_ptr<ExecutionPlan::Controller> controller) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
    VLOG(EXECUTION) << "cpuFallbackPartial";

    // Get fallback executor.
    std::shared_ptr<StepExecutor> executor;
    int n1 = plan.fallback(controller, &executor, nullptr, nullptr);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, {}, nullptr};
    }
    CHECK(executor != nullptr);

    // Attempt fallback execution.
    auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
    return {n2, std::move(outputShapes), timing, executor};
}

std::tuple<int, std::vector<OutputShape>, Timing> SimpleExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr;
    auto [n, outputShapes, timing] = mExecutor->compute(deadline, burstController);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {n, std::move(outputShapes), timing};
    }

    // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable.
    if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
        return {n, std::move(outputShapes), {}};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, {}, {}};
    }

    // If CPU execution was already attempted, do not perform CPU fallback.
    if (mExecutor->isCpu()) {
        return {n, {}, {}};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the execution. Do an execution fallback on the CPU.
    return cpuFallbackFull(this);
}

std::tuple<int, std::vector<OutputShape>, Timing> CompoundExecutionBuilder::computeInternal(
        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)";

    auto controller = mPlan->makeController(this, burstBuilder);
    std::vector<OutputShape> outputShapes = getInitialOutputShapes();

    // On this iteration, do I need to repeat the previous step because it
    // reported insufficient size?
    bool doInsufficientSizeFallback = false;

    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        SharedBurst burstController;
        int n = doInsufficientSizeFallback
                        ? mPlan->fallback(controller, &executor, &burstController, &outputShapes)
                        : mPlan->next(controller, &executor, &burstController, &outputShapes);
        doInsufficientSizeFallback = false;
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            return {n, {}, {}};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}};
        }
        const bool executorIsCpu = executor->isCpu();

        // Attempt to execute a single step of the execution.
        auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController);

        // Update global outputs and dynamic temporaries.
        StepExecutor::UpdateOutputShapes updateOutputShapes = {};
        if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
                                          &updateOutputShapes)) {
            stepN = ANEURALNETWORKS_OP_FAILED;
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            if (updateOutputShapes.zeroSizedInput) {
                // We'll need to do full model CPU fallback
                VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
                stepN = ANEURALNETWORKS_OP_FAILED;
            } else {
                CHECK(executor->areDynamicTemporariesAllocated());
                continue;
            }
        }

        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
            if (updateOutputShapes.mainOutputInsufficient ||
                !updateOutputShapes.updatedDynamicTemporary) {
                // Either:
                // - At least one main model output is not of sufficient size; or
                // - we didn't learn anything new about dynamic temporaries.
                // Neither of these is recoverable, so end execution.
                return {stepN, outputShapes, {}};
            }
            // Every main model output is of sufficient size.  This implies that
            // at least one dynamic temporary is not of sufficient size.  This
            // is recoverable.
            doInsufficientSizeFallback = true;
            continue;
        }

        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, {}, {}};
        }

        // If CPU execution was already attempted, perform a full CPU fallback.
        if (executorIsCpu) {
            break;
        }

        // If the code reaches this point, attempt a partial fallback to CPU.
        CHECK(mAllowCpuFallback);
        if (updateOutputShapes.zeroSizedInput) {
            // Do not attempt a partial fallback.
            break;
        }
        while (true) {
            auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] =
                    cpuFallbackPartial(*mPlan, controller);

            // Update global outputs and dynamic temporaries.
            StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
            if (fallbackExecutor != nullptr &&
                !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
                                                      &outputShapes, &fallbackUpdateOutputShapes)) {
                fallbackN = ANEURALNETWORKS_OP_FAILED;
            }

            // If execution was successful, continue to next step.
            if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
                if (fallbackUpdateOutputShapes.zeroSizedInput) {
                    // We'll need to do full model CPU fallback
                    VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
                    fallbackN = ANEURALNETWORKS_OP_FAILED;
                    break;
                }
                CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
                goto nextStep;
            }

            if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
                VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
                                << toString(fallbackUpdateOutputShapes);
                if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
                    !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
                    // Either:
                    // - At least one main model output is not of sufficient size; or
                    // - we didn't learn anything new about dynamic temporaries.
                    // Neither of these is recoverable, so end execution.
                    return {fallbackN, outputShapes, {}};
                }
                // Every main model output is of sufficient size.  This implies
                // that at least one dynamic temporary is not of sufficient
                // size.  This is recoverable.
                continue;
            }

            // If the code reaches this point, then there was an error with the
            // fallback. In this case, attempt full fallback.
            break;
        }

        // If the code reaches this point, then there was an error with the
        // fallback. In this case, attempt full fallback.
        break;

    nextStep:
        // Bottom of the outer loop
        continue;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    return cpuFallbackFull(this);
}

static bool waitForSyncFences(const std::vector<int>& waitFor) {
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
                return false;
            }
        }
    }
    return true;
}

std::tuple<int, int, ExecuteFencedInfoCallback> SimpleExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal";

    if (mExecutor == nullptr) {
        mExecutor = mPlan->makeStepExecutor(mReusable, this);
    }

    auto [n, syncFd, callback] =
            mExecutor->computeFenced(waitFor, timeoutDurationAfterFence, deadline);

    if (n == ANEURALNETWORKS_NO_ERROR) {
        return {ANEURALNETWORKS_NO_ERROR, syncFd, callback};
    }

    // If CPU fallback is not allowed and there was an error, end execution.
    if (!mAllowCpuFallback) {
        return {n, -1, nullptr};
    }

    // If CPU execution was already attempted, return from the function with an error.
    if (mExecutor->isCpu()) {
        return {n, -1, nullptr};
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this);
    reportTimingWithoutFencedExecutionCallback(fallbackTiming);
    return {fallbackN, -1, nullptr};
}

// In the case of partitioned execution, the computeFencedInternal call returns the sync
// fence and the fenced compute callback returned from the last partition.
// Any failed partition will result in the whole execution falling back to CPU if
// mAllowCpuFallback is set to true.
std::tuple<int, int, ExecuteFencedInfoCallback> CompoundExecutionBuilder::computeFencedInternal(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal");
    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)";

    // We should have detected this earlier in the call chain and fallen back to
    // non-fenced execution.  This is an implementation limitation: In order to
    // support dynamic temporaries in this code, we'd need to implement
    // something like the following:
    // - If a partition has outputs of unknown size, compute that partition in a
    //   non fenced fashion, just as if it were scheduled on a driver that does
    //   not support fenced execution.
    // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal()
    //   that handles a step execution that fails with
    //   ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
    CHECK(!mCompilation->hasDynamicTemporaries());

    // Initiate waitForFds, syncFence for the first step.
    std::vector<int> waitForFds = waitFor;
    base::unique_fd syncFence;
    ExecuteFencedInfoCallback executeFencedInfoCallback;

    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
    while (true) {
        VLOG(EXECUTION) << "looking for next StepExecutor";

        // Get the current step of the execution.
        std::shared_ptr<StepExecutor> executor;
        int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence.get());
        if (n != ANEURALNETWORKS_NO_ERROR) {
            // During the interpreted execution of control flow, a loop timeout
            // might occur in ExecutionPlan::next().
            bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                  n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
            if (mAllowCpuFallback && !missedDeadline) break;
            // Return -1 for the sync fence fd, and nullptr for the callback.
            return {n, -1, nullptr};
        }

        // If the code reached the end of the plan without error, then return
        // with no error.
        if (executor == nullptr) {
            return {ANEURALNETWORKS_NO_ERROR, syncFence.release(), executeFencedInfoCallback};
        }

        // Attempt to compute a single step of the execution.
        auto [stepN, syncFd, callback] =
                executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);

        // Update waitForFds, syncFence for the next step.
        syncFence.reset(syncFd);
        executeFencedInfoCallback = callback;
        waitForFds.clear();
        if (syncFd >= 0) {
            waitForFds = {syncFd};
        }

        // If execution was successful, continue to next step.
        if (stepN == ANEURALNETWORKS_NO_ERROR) {
            continue;
        }
        // If CPU fallback is not allowed and there was an error, end execution.
        if (!mAllowCpuFallback) {
            return {stepN, -1, nullptr};
        }

        // If the code reaches this point, then there was an error with the
        // step execution. In this case, attempt full fallback.
        break;
    }

    // If the code has reached this point, a potentially recoverable error
    // occurred during the step executions. Instead, do a full execution
    // fallback on the CPU.
    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
    if (!waitForSyncFences(waitFor)) {
        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
    }
    auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this);
    return {fullN, -1, nullptr};
}

int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
                                    uint64_t timeoutDurationAfterFence, int* syncFence) {
    CHECK(syncFence != nullptr);
    NN_RETURN_IF_ERROR(
            prepareForCompute("startComputeWithDependencies", ExecutionMode::ASYNC_WITH_DEPS));
    if (timeoutDurationAfterFence > 0) {
        if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
            LOG(ERROR)
                    << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
                       "duration on an ANeuralNetworksExecution "
                       "created from an ANeuralNetworksCompilation that was not created by "
                       "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
            return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS);
        }
    }
    if (!areOutputsFullySpecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
                      " not all outputs have fully specified dimensions";
        return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS);
    }

    // Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because
    // fenced executions do not support dynamic output shape.

    mComputeStartTimePoint = Clock::now();
    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
    int result;
    const auto deadline = makeDeadline(mTimeoutDuration);
    std::tie(result, *syncFence, mFencedExecutionCallback) =
            computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline);
    // If there is an error, call finishComputation to mark the computation as completed.
    // Otherwise, we will call finishComputation in SyncFenceEvent::wait().
    if (result != ANEURALNETWORKS_NO_ERROR) {
        // TODO(miaowang): support dynamic output shape only with memory domain.
        // For now just return empty output shapes.
        result = finishComputation(result, {}, ExecutionMode::ASYNC_WITH_DEPS);
    }
    return result;
}
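
// Illustrative client-side fenced execution (a sketch only, not part of this file): the
// runtime path above is reached via ANeuralNetworksExecution_startComputeWithDependencies.
//
//   ANeuralNetworksEvent* dependency = nullptr;
//   // e.g. ANeuralNetworksEvent_createFromSyncFenceFd(fd, &dependency);
//   ANeuralNetworksEvent* event = nullptr;
//   ANeuralNetworksExecution_startComputeWithDependencies(execution, &dependency,
//                                                         /*numDependencies=*/1,
//                                                         /*duration=*/0, &event);
//   ANeuralNetworksEvent_wait(event);  // blocks until the computation has finished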
1050 
compute(std::shared_ptr<ExecutionCallback> * synchronizationCallback,BurstBuilder * burstBuilder)1051 int ExecutionBuilder::compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
1052                               BurstBuilder* burstBuilder) {
1053     CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
1054             << "synchronizationCallback and burstBuilder cannot simultaneously be used";
1055 
1056     const bool synchronous = (synchronizationCallback == nullptr);
1057     if (!synchronous) {
1058         *synchronizationCallback = nullptr;
1059     }
1060 
1061     const char* name = burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
1062     const ExecutionMode mode = burstBuilder
1063                                        ? ExecutionMode::BURST
1064                                        : synchronous ? ExecutionMode::SYNC : ExecutionMode::ASYNC;
1065     NN_RETURN_IF_ERROR(prepareForCompute(name, mode));
1066 
1067     // Validate input memory dimensions. We need to do the validation in every computation because
1068     // the memory dimensions may change between computations.
1069     for (auto& p : mInputs) {
1070         if (p.state() == ModelArgumentInfo::MEMORY) {
1071             const RuntimeMemory* memory = mMemories[p.locationAndLength().poolIndex];
1072             if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
1073                 return finishComputation(ANEURALNETWORKS_OP_FAILED, {}, mode);
1074             }
1075         }
1076     }
1077 
1078     // Reset output dimensions.
1079     if (!areOutputsFullySpecified()) {
1080         for (auto& output : mOutputs) {
1081             output.reset();
1082         }
1083     }
1084 
1085     const auto deadline = makeDeadline(mTimeoutDuration);
1086     mComputeStartTimePoint = Clock::now();
1087     if (synchronous) {
1088         if (burstBuilder) {
1089             VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
1090         } else {
1091             VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
1092         }
1093         const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder);
1094         if (mMeasureTiming) {
1095             mTimingWithoutFencedExecutionCallback = timing;
1096         }
1097         return finishComputation(n, outputShapes, mode);
1098     } else /* asynchronous */ {
1099         // TODO: For asynchronous execution, entire plan-based-path should run in an
1100         // asynchronous thread -- take the asynchronous thread logic out of
1101         // CpuExecution::compute() and use it to wrap the plan-based-path.
1102 
1103         // TODO: use a thread pool
1104         // TODO(mikie): this could have NNTRACE so we could measure the overhead
1105         //              of spinning up a new thread.
1106 
1107         // Prepare the callback for asynchronous execution.
1108         // std::shared_ptr<ExecutionCallback> object is returned when the
1109         // execution has been successfully launched, otherwise a
1110         // nullptr is returned.  The executionCallback is
1111         // abstracted in the NN API as an "event".
        auto executionCallback = std::make_shared<ExecutionCallback>();
        executionCallback->setOnFinish(
                [this, mode](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
                    return finishComputation(error, outputShapes, mode);
                });
        const auto asyncStartCompute = [this, deadline, executionCallback] {
            const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr);
            const auto status = convertResultCodeToErrorStatus(n);
            executionCallback->notify(status, outputShapes, timing);
        };
        if (DeviceManager::get()->syncExecRuntime()) {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
            asyncStartCompute();
        } else {
            VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
            std::thread asyncExecution(asyncStartCompute);
            executionCallback->bindThread(std::move(asyncExecution));
        }
        *synchronizationCallback = executionCallback;
        return ANEURALNETWORKS_NO_ERROR;
    }
}
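
// Usage sketch for compute() above (illustrative only; the call sites are
// hypothetical, and the enclosing signature --
// (std::shared_ptr<ExecutionCallback>* synchronizationCallback,
// BurstBuilder* burstBuilder) -- is inferred from the code above rather than
// restated from a header):
//
//   std::shared_ptr<ExecutionCallback> callback;
//   builder->compute(&callback, nullptr);      // asynchronous: yields an "event"
//   builder->compute(nullptr, nullptr);        // synchronous: blocks until done
//   builder->compute(nullptr, burstBuilder);   // synchronous burst execution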

std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
    std::vector<OutputShape> outputShapes(mOutputs.size());
    std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
                   [](const auto& x) -> OutputShape {
                       std::vector<uint32_t> dimensions;
                       if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
                           dimensions = x.dimensions();
                       }
                       return {.dimensions = std::move(dimensions), .isSufficient = true};
                   });
    return outputShapes;
}

// Check whether the dimensions "to" can be updated by the dimensions "from", where "from"
// must be at least as fully specified as "to".
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
    if (to.size() == 0) return true;
    NN_RET_CHECK_EQ(to.size(), from.size());
    for (uint32_t i = 0; i < to.size(); i++) {
        NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
    }
    return true;
}
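
// Examples (illustrative only):
//   isUpdatable({}, {2, 3})     -> true   (an unknown rank may be assigned)
//   isUpdatable({0, 3}, {2, 3}) -> true   (0 stands for an unknown dimension)
//   isUpdatable({2, 3}, {2, 4}) -> false  (a known dimension cannot change)
//   isUpdatable({2}, {2, 3})    -> false  (a known rank cannot change)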

static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
    return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
           outputShape.dimensions.size() &&
           (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
            outputShape.dimensions.end());
}
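
// Example (illustrative only): after a successful execution, a sufficient
// output with dimensions {2, 0, 3} is a zero-sized tensor, whereas dimensions
// {} (unknown rank) or {2, 3} are not.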

bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
                                          const std::vector<OutputShape>& outputShapes) {
    NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));

    if (outputShapes.size() == 0) {
        return true;
    }
    NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        // Check that only unspecified dimensions or rank are overwritten.
        NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
        const OperandType operandType = mModel->getOutputOperand(i).type;
        NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
                                                                    outputShapes[i].dimensions));
    }
    for (uint32_t i = 0; i < outputShapes.size(); i++) {
        mOutputs[i].dimensions() = outputShapes[i].dimensions;
        mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
    }
    return true;
}

bool ExecutionBuilder::updateMemories() {
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
        NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
    }
    return true;
}

int ExecutionBuilder::finishComputation(int result, const std::vector<OutputShape>& outputShapes,
                                        ExecutionMode mode) {
    const auto status = convertResultCodeToErrorStatus(result);
    if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
        result = ANEURALNETWORKS_OP_FAILED;
    }
    bool success = result == ANEURALNETWORKS_NO_ERROR;
    for (const auto& output : mOutputs) {
        if (output.state() != ModelArgumentInfo::MEMORY) continue;
        const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex];
        memory->getValidator().setInitialized(success);
    }
    switch (result) {
        case ANEURALNETWORKS_NO_ERROR:
            mCompletion = Completion::NO_ERROR;
            break;
        case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
            mCompletion = Completion::OUTPUT_INSUFFICIENT_SIZE;
            break;
        default:
            mCompletion = Completion::OTHER_ERROR;
            break;
    }
    {
        std::lock_guard<std::mutex> lock(mStateMutex);
        CHECK(mState != State::PREPARATION)
                << "ExecutionBuilder::finishComputation is called in the preparation state";
        CHECK(mState != State::COMPLETED) << "ExecutionBuilder::finishComputation is called twice";
        mState = State::COMPLETED;
    }
    telemetry::onExecutionFinish(this, mode, result);
    return result;
}

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
    return "{ .updatedDynamicTemporary = " +
           std::to_string(updateOutputShapes.updatedDynamicTemporary) +
           ", .mainOutputInsufficient = " +
           std::to_string(updateOutputShapes.mainOutputInsufficient) + " }";
}

bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                                      std::vector<OutputShape>* to, UpdateOutputShapes* update) {
    CHECK(update != nullptr);
    *update = {.updatedDynamicTemporary = false,
               .mainOutputInsufficient = false,
               .zeroSizedInput = false};

    NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));

    if (from.size() == 0) {
        return true;
    }

    if (VLOG_IS_ON(EXECUTION)) {
        for (const auto& shape : from) {
            VLOG(EXECUTION) << "updateOutputShapes: " << shape;
        }
    }

    if (mExecutionStep != nullptr) {
        const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
        NN_RET_CHECK_LE(indexMapping.size(), from.size());
        for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
            const uint32_t toIndex = indexMapping[i];
            NN_RET_CHECK_GT(to->size(), toIndex);
            NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
            (*to)[toIndex] = from[i];
            update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
            if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
                isZeroSizedTensor(executionResultCode, from[i])) {
                update->zeroSizedInput = true;
            }
        }

        if (!mDynamicTemporaries->empty()) {
            // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
            std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
            for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
                operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
            }

            const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
            for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
                const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
                const auto it =
                        operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
                if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
                    continue;
                }
                const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
                VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
                                << " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
                                << sourceOperandIndex.second << ") is a dynamic temporary";
                // This is a temporary, but it might not be a dynamic temporary.
                const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
                if (loc == std::nullopt) {
                    continue;
                }
                NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
                bool changedShape = false;
                const uint32_t actualSize = TypeManager::get()->getSizeOfData(
                        mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
                if (actualSize > 0) {
                    changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
                                                                  from[i].dimensions, actualSize);
                } else if (!from[i].isSufficient) {
                    NN_RET_CHECK(loc->paddedLength < UINT32_MAX / 2)
                            << "output#" << i << " paddedLength overflow";
                    changedShape = mDynamicTemporaries->redeclare(
                            sourceOperandIndex, from[i].dimensions, 2 * loc->paddedLength);
                } else {
                    // The combination of not-fully-specified dimensions
                    // and isSufficient means that we have no
                    // information about whether the size of the dynamic
                    // temporary is adequate.
                    VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
                    if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
                        NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
                        // This is a zero-sized tensor, and by
                        // definition, any dynamic temporary is an input
                        // to an execution step.
                        update->zeroSizedInput = true;
                    }
                }
                if (changedShape) {
                    // TODO: find a better place for this comment.
                    //
                    // isUpdatable(a, b) imposes a partial ordering a <=
                    // b.  Every fully specified dimensions vector is an
                    // upper bound of that ordering.  Therefore, any
                    // change in dimensions moves towards an upper
                    // bound, and hence there are a finite number of
                    // such changes possible.
                    //
                    // actualSize can only be computed from dimensions
                    // that are an upper bound.  Therefore, once
                    // actualSize is computed, it will not change.
                    //
                    // If the dimensions are not fully specified and the
                    // estimated size changes, it increases.  There is
                    // an upper bound on the estimated size to avoid
                    // overflow.
                    //
                    // Therefore, if we retry only when dimensions or
                    // size change, and we stop retrying if we would
                    // otherwise overflow, we should only retry a finite
                    // number of times.
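                    //
                    // Worked example (illustrative only): a dynamic
                    // temporary declared as {0, 0} may be redeclared
                    // as {2, 0} and later as {2, 3}; each step moves
                    // up the partial ordering, and the fully specified
                    // {2, 3} is an upper bound, so the chain of shape
                    // changes must terminate.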
                    update->updatedDynamicTemporary = true;
                }
            }
            mDynamicTemporaries->vlogDump("finished updateOutputShapes");
        }
    } else {
        NN_RET_CHECK_EQ(from.size(), to->size());
        for (uint32_t i = 0, e = from.size(); i < e; i++) {
            NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
            (*to)[i] = from[i];
        }
    }
    return true;
}

StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                           std::shared_ptr<Device> device,
                           std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                           const ExecutionStep* step, DynamicTemporaries* dynamicTemporaries)
    : mExecutionBuilder(executionBuilder),
      mExecutionStep(step),
      mDynamicTemporaries(dynamicTemporaries),
      mModel(model),
      mDevice(device),
      mPreparedModel(preparedModel),
      mInputs(model->inputCount()),
      mOutputs(model->outputCount()),
      mReusable(reusable) {
    CHECK(mDevice != nullptr);
    CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
    CHECK(!(reusable && dynamicTemporaries != nullptr));
    VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
                    << mOutputs.size() << " outputs";
}

bool StepExecutor::areDynamicTemporariesAllocated() const {
    return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
}

void StepExecutor::mapInputsAndOutputsTrivially() {
    mInputs = mExecutionBuilder->mInputs;
    mOutputs = mExecutionBuilder->mOutputs;
    mMemories = mExecutionBuilder->mMemories;
}

void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                                    ModelArgumentInfo* executorInputOrOutput,
                                    const Dimensions* builderDimensions) {
    auto updateDimensions = [executorInputOrOutput, builderDimensions] {
        if (!builderDimensions) {
            return;
        }
        executorInputOrOutput->dimensions() = *builderDimensions;
    };

    *executorInputOrOutput = builderInputOrOutput;
    switch (executorInputOrOutput->state()) {
        default:
            CHECK(false) << "unexpected ModelArgumentInfo::state";
            break;
        case ModelArgumentInfo::HAS_NO_VALUE:
        case ModelArgumentInfo::UNSPECIFIED:
            break;
        case ModelArgumentInfo::POINTER:
            updateDimensions();
            break;
        case ModelArgumentInfo::MEMORY: {
            updateDimensions();
            const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
            const RuntimeMemory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
            const uint32_t executorPoolIndex = mMemories.add(memory);
            executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
            break;
        }
    }
}
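
// Note on the MEMORY case above (descriptive, with hypothetical indices): if a
// builder argument lives in the ExecutionBuilder's memory pool #3, the same
// RuntimeMemory object is added to this executor's pool list and poolIndex is
// rewritten to the executor-local index (say, #0), so the step model addresses
// the memory through its own pool numbering.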

int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
                                             const RuntimeMemory* memory, uint32_t offset,
                                             uint32_t length, const Dimensions& dimensions,
                                             ModelArgumentInfo* inputOrOutputInfo) {
    // Should be similar to
    //     ExecutionBuilder::setInputFromMemory()
    //     ExecutionBuilder::setOutputFromMemory()

    uint32_t poolIndex = mMemories.add(memory);
    CHECK(inputOrOutputInfo->unspecified());
    int n;
    std::tie(n, *inputOrOutputInfo) =
            ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
                                                /*type=*/nullptr, poolIndex, offset, length);
    if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
        CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
        inputOrOutputInfo->dimensions() = dimensions;
    }
    return n;
}

static std::string toString(std::vector<uint32_t> dimensions) {
    std::string ret = "(";
    bool wroteOne = false;
    for (uint32_t dimension : dimensions) {
        if (wroteOne) {
            ret += ", ";
        } else {
            wroteOne = true;
        }
        ret += std::to_string(dimension);
    }
    ret += ")";
    return ret;
}
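
// Examples (illustrative only): toString({2, 3}) returns "(2, 3)", and
// toString({}) returns "()".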

static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
    for (unsigned i = 0; i < args.size(); i++) {
        const auto& arg = args[i];
        std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
        switch (arg.state()) {
            case ModelArgumentInfo::POINTER:
                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::MEMORY:
                VLOG(EXECUTION) << prefix << "MEMORY("
                                << "pool=" << arg.locationAndLength().poolIndex << ", "
                                << "off=" << arg.locationAndLength().offset << ") dim"
                                << toString(arg.dimensions());
                break;
            case ModelArgumentInfo::HAS_NO_VALUE:
                VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
                break;
            case ModelArgumentInfo::UNSPECIFIED:
                VLOG(EXECUTION) << prefix << "UNSPECIFIED";
                break;
            default:
                VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
                break;
        }
    }
}

bool StepExecutor::isCpu() const {
    return mDevice == DeviceManager::getCpuDevice();
}

std::pair<int, std::shared_ptr<RuntimeExecution>> StepExecutor::getReusableExecution() {
    CHECK(mReusable);
    if (mExecution == nullptr) {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        auto [n, execution] = mPreparedModel->createReusableExecution(
                mInputs, mOutputs, mMemories.getObjects(), measure, loopTimeoutDuration,
                mExecutionBuilder->getMetadata());
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return {n, nullptr};
        }
        mExecution = std::move(execution);
    }
    return {ANEURALNETWORKS_NO_ERROR, mExecution};
}
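
// Note (descriptive): a reusable StepExecutor creates its RuntimeExecution
// lazily on first use and caches it in mExecution, so repeated compute() and
// computeFenced() calls reuse the same execution object instead of
// re-specifying inputs, outputs, and memories each time.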

std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
        const OptionalTimePoint& deadline, const SharedBurst& burstController) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    int n;
    std::vector<OutputShape> outputShapes;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, {}, {}};
        }
        std::tie(n, outputShapes, timing) = execution->compute(burstController, deadline);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, outputShapes, timing) = mPreparedModel->execute(
                mInputs, mOutputs, mMemories.getObjects(), burstController, measure, deadline,
                loopTimeoutDuration, mExecutionBuilder->getMetadata());
    }
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    return {n, std::move(outputShapes), std::move(timing)};
}

std::tuple<int, int, ExecuteFencedInfoCallback> StepExecutor::computeFenced(
        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
        const OptionalTimePoint& deadline) {
    if (VLOG_IS_ON(EXECUTION)) {
        logArguments("input", mInputs);
        logArguments("output", mOutputs);
    }

    OptionalDuration optionalTimeoutDurationAfterFence;
    if (timeoutDurationAfterFence > 0) {
        optionalTimeoutDurationAfterFence = makeTimeoutDuration(timeoutDurationAfterFence);
    }

    int n;
    int syncFenceFd;
    ExecuteFencedInfoCallback executeFencedInfoCallback;
    Timing timing;
    if (mReusable) {
        auto [nCreate, execution] = getReusableExecution();
        if (nCreate != ANEURALNETWORKS_NO_ERROR) {
            return {nCreate, -1, nullptr};
        }
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) =
                execution->computeFenced(waitFor, deadline, optionalTimeoutDurationAfterFence);
    } else {
        CHECK(mPreparedModel != nullptr);
        const MeasureTiming measure = measureTiming(mExecutionBuilder);
        const OptionalDuration loopTimeoutDuration =
                makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
        std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) = mPreparedModel->executeFenced(
                mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline,
                loopTimeoutDuration, optionalTimeoutDurationAfterFence,
                mExecutionBuilder->getMetadata());
    }
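    // With no sync fence and no fenced-execution callback there is nothing to
    // wait on or to query for timing later, so report the timing gathered here
    // directly.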
    if (syncFenceFd < 0 && executeFencedInfoCallback == nullptr) {
        mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    }
    return {n, syncFenceFd, executeFencedInfoCallback};
}

// For cpuFallback{Partial,Full}, recompile the model on CPU and then start the computation.
std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
    VLOG(EXECUTION) << "Re-compile the model on CPU";
    const ModelFactory makeModel = [this] { return mModel->makeModel(); };
    // TODO: Propagate user preference and compilation priority to this point instead of using
    // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
    // ANEURALNETWORKS_PRIORITY_MEDIUM
    const ExecutionPreference preference =
            static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
    const Priority priority = convertToCanonicalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
    auto [n, preparedModel] = DeviceManager::getCpuDevice()->prepareModel(
            makeModel, preference, priority, {}, {}, {}, {}, {});
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, {}, {}};
    }

    // Prepare device memories for CPU fallback.
    std::vector<const RuntimeMemory*> memories = mMemories.getObjects();
    std::vector<bool> isUsedAsInput(memories.size(), false);
    std::vector<bool> isUsedAsOutput(memories.size(), false);
    std::vector<std::unique_ptr<RuntimeMemory>> blobAhwbs;

    // Mark the input and output usages.
    for (auto& input : mInputs) {
        if (input.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = input.locationAndLength().poolIndex;
            isUsedAsInput[poolIndex] = true;
        }
    }
    for (auto& output : mOutputs) {
        if (output.state() == ModelArgumentInfo::MEMORY) {
            const uint32_t poolIndex = output.locationAndLength().poolIndex;
            // Cannot allocate output buffers with unknown shapes.
            if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
                LOG(ERROR) << "Cannot fall back to CPU because at least one of the output "
                              "operands has an unknown shape.";
                return {ANEURALNETWORKS_OP_FAILED, {}, {}};
            }
            isUsedAsOutput[poolIndex] = true;
        }
    }

    // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr) {
            const uint32_t size = memory->getValidator().getMetadata().logicalSize;
            auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
            if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
                return {nAhwb, {}, {}};
            }
            if (isUsedAsInput[i]) {
                n = copyIBufferToMemory(memory->getIBuffer(), blobAhwb->getMemory());
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    return {n, {}, {}};
                }
            }
            memories[i] = blobAhwb.get();
            blobAhwbs.push_back(std::move(blobAhwb));
        }
    }

    const MeasureTiming measure = measureTiming(mExecutionBuilder);
    const OptionalDuration loopTimeoutDuration =
            makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
    auto [nExecute, outputShapes, timing] = preparedModel->execute(
            mInputs, mOutputs, memories, nullptr, measure, {}, loopTimeoutDuration, {});
    mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
    if (nExecute != ANEURALNETWORKS_NO_ERROR) {
        return {nExecute, std::move(outputShapes), timing};
    }

    // Write back to output device memories.
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
            n = copyMemoryToIBuffer(memories[i]->getMemory(), memory->getIBuffer(), {});
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return {n, {}, {}};
            }
        }
    }
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

}  // namespace nn
}  // namespace android