1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "ExecutionPlan"
18 
19 #include "ExecutionPlan.h"
20 
21 #include <ControlFlow.h>
22 #include <CpuExecutor.h>
23 #include <GraphDump.h>
24 #include <LegacyUtils.h>
25 #include <MetaModel.h>
26 #include <OperationsUtils.h>
27 #include <TokenHasher.h>
28 #include <Tracing.h>
29 #include <android-base/logging.h>
30 #include <fcntl.h>
31 #include <nnapi/IBurst.h>
32 #include <sys/stat.h>
33 #include <sys/types.h>
34 
35 #include <algorithm>
36 #include <functional>
37 #include <map>
38 #include <memory>
39 #include <mutex>
40 #include <queue>
41 #include <set>
42 #include <string>
43 #include <type_traits>
44 #include <unordered_set>
45 #include <utility>
46 #include <vector>
47 
48 #include "BurstBuilder.h"
49 #include "CompilationBuilder.h"
50 #include "ExecutionBuilder.h"
51 #include "ExecutionCallback.h"
52 #include "Manager.h"
53 #include "ModelBuilder.h"
54 #include "TypeManager.h"
55 
56 namespace android {
57 namespace nn {
58 
59 namespace {
60 
61 // The index of the main model in SourceModels.
62 constexpr uint32_t kMainModelInSourceModels = 0;
63 
64 constexpr uint32_t kNoPadding = 1;
65 
updateTokenFromMetaData(TokenHasher * token,const std::vector<TokenValuePair> & metaData)66 static bool updateTokenFromMetaData(TokenHasher* token,
67                                     const std::vector<TokenValuePair>& metaData) {
68     // Combines the TokenValuePair and corresponding extension name.
69     std::vector<std::tuple<const char*, uint16_t, const uint8_t*, size_t>> metaDataWithExtension;
70     for (auto p : metaData) {
71         uint16_t prefix = static_cast<uint32_t>(p.token) >> kExtensionTypeBits;
72         uint16_t extensionEnum = static_cast<uint32_t>(p.token) & kTypeWithinExtensionMask;
73         const Extension* extension;
74         if (!TypeManager::get()->getExtensionInfo(prefix, &extension)) {
75             LOG(ERROR) << "Prefix " << prefix << " could not be found";
76             return false;
77         }
78         metaDataWithExtension.push_back(std::make_tuple(extension->name.c_str(), extensionEnum,
79                                                         p.value.data(), p.value.size()));
80     }
81     // Sort with extension name and extension enum.
82     std::sort(metaDataWithExtension.begin(), metaDataWithExtension.end(),
83               [](const auto& a, const auto& b) {
84                   if (int r = strcmp(std::get<0>(a), std::get<0>(b))) {
85                       return r < 0;
86                   } else {
87                       return std::get<1>(a) < std::get<1>(b);
88                   }
89               });
90     // Update the cache token with the sorted array.
91     for (auto [extensionName, extensionEnum, value, valueSize] : metaDataWithExtension) {
92         if (!token->updateFromString(extensionName) ||
93             !token->update(&extensionEnum, sizeof(uint16_t)) || !token->update(value, valueSize)) {
94             return false;
95         }
96     }
97     return true;
98 }
99 
100 // Compiles the model on device.
101 // If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
102 // been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
103 // operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
104 // device name, device version string, and the execution preference in this function.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
            int compilationPriority, const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            TokenHasher* token, const std::vector<TokenValuePair>& metaData,
            std::shared_ptr<RuntimePreparedModel>* preparedModel) {
    CHECK(token != nullptr);
    CHECK(preparedModel != nullptr);
    *preparedModel = nullptr;

    // Try to finalize the cache token. Caching is used only if the device supports
    // it and every hash update below succeeds. The update order (device name,
    // device version, execution preference, compilation priority, metadata) is part
    // of the token definition: changing it would silently invalidate previously
    // written caches.
    std::optional<CacheToken> cacheToken;
    if (device.isCachingSupported() && token->ok() &&
        token->updateFromString(device.getName().c_str()) &&
        token->updateFromString(device.getVersionString().c_str()) &&
        token->update(&executionPreference, sizeof(executionPreference)) &&
        token->update(&compilationPriority, sizeof(compilationPriority)) &&
        updateTokenFromMetaData(token, metaData) && token->finish()) {
        cacheToken = CacheToken{};
        const uint8_t* tokenPtr = token->getCacheToken();
        std::copy(tokenPtr, tokenPtr + cacheToken->size(), cacheToken->begin());
    }

    // Prepare the model on the device. The model factory is deferred so the
    // canonical model is only materialized if the device actually needs it
    // (e.g., on a cache miss).
    const ModelFactory makeModel = [&model] { return model.makeModel(); };
    const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
    const Priority priority = convertToCanonicalPriority(compilationPriority);
    std::vector<ExtensionNameAndPrefix> extensionNameAndPrefix =
            TypeManager::get()->getExtensionNameAndPrefix(metaData);
    const auto [n, returnedPreparedModel] =
            device.prepareModel(makeModel, preference, priority, deadline, cacheInfo, cacheToken,
                                metaData, extensionNameAndPrefix);
    *preparedModel = returnedPreparedModel;
    return n;
}
136 
137 typedef std::function<void(uint32_t)> OperationReadyCallback;
138 
copyOperandExtraParams(ModelBuilder & model,uint32_t toOperandIndex,const Operand & fromOperand)139 int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
140                            const Operand& fromOperand) {
141     if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
142         std::holds_alternative<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams)) {
143         auto& fromChannelQuant =
144                 std::get<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams);
145         ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
146                 .channelDim = fromChannelQuant.channelDim,
147                 .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
148                 .scales = fromChannelQuant.scales.data(),
149         };
150         return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
151     } else if (isExtension(fromOperand.type) &&
152                std::holds_alternative<Operand::ExtensionParams>(fromOperand.extraParams)) {
153         auto extensionData = std::get<Operand::ExtensionParams>(fromOperand.extraParams);
154         return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
155                                              extensionData.size());
156     } else if (!std::holds_alternative<Operand::NoParams>(fromOperand.extraParams) ||
157                fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
158         LOG(ERROR) << "Type " << fromOperand.type
159                    << " has an unexpected extraParams variant: " << fromOperand.extraParams.index();
160         return ANEURALNETWORKS_BAD_DATA;
161     } else {
162         return ANEURALNETWORKS_NO_ERROR;
163     }
164 }
165 
166 // This class tracks whether we know the value of an operand as operations
167 // are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away (all inputs already known) and invokes cb for each of
    // them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The outputs
    // of the operation now being known, this may make new operations
    // ready to run. Calls cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    // The model whose operations are being tracked (not owned).
    const ModelBuilder* mModel;
    // Maps a runtime-produced operand index to every operation that consumes it.
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    // For each operation, how many of its inputs are still unknown.
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
183 
OperandTracker(const ModelBuilder * model,OperationReadyCallback cb)184 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
185     : mModel(model) {
186     const auto& operations = mModel->getOperations();
187     mUnknownInputCount.resize(operations.size());
188     for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
189         const Operation& operation = operations[operationIndex];
190         uint32_t count = 0;
191         for (uint32_t operandIndex : operation.inputs) {
192             auto lifetime = mModel->getOperand(operandIndex).lifetime;
193             if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
194                 lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
195                 count++;
196                 mOperandToOperations.emplace(operandIndex, operationIndex);
197             }
198         }
199         if (count == 0) {
200             cb(operationIndex);
201         }
202         mUnknownInputCount[operationIndex] = count;
203     }
204 }
205 
markProcessed(uint32_t operationIndex,OperationReadyCallback cb)206 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
207     // Mark all its outputs as known.
208     const Operation& operation = mModel->getOperations()[operationIndex];
209     for (uint32_t operandIndex : operation.outputs) {
210         auto range = mOperandToOperations.equal_range(operandIndex);
211         for (auto i = range.first; i != range.second; i++) {
212             uint32_t& count = mUnknownInputCount[i->second];
213             if (--count == 0) {
214                 cb(i->second);
215             }
216         }
217     }
218 }
219 
addTemporary(uint32_t * totalSizeOfTemporaries,uint32_t size,uint32_t alignment,uint32_t padding)220 StaticTemporaryLocation addTemporary(uint32_t* totalSizeOfTemporaries, uint32_t size,
221                                      uint32_t alignment, uint32_t padding) {
222     // TODO: what about overflow?
223     *totalSizeOfTemporaries = roundUp(*totalSizeOfTemporaries, alignment);
224     const uint32_t offset = *totalSizeOfTemporaries;
225     size = roundUp(size, padding);
226     *totalSizeOfTemporaries += size;
227     return {.offset = offset, .paddedLength = size};
228 };
229 
toString(SourceOperandIndex sourceOperandIndex)230 std::string toString(SourceOperandIndex sourceOperandIndex) {
231     return "(" + std::to_string(sourceOperandIndex.first) + ", " +
232            std::to_string(sourceOperandIndex.second) + ")";
233 };
234 
235 // A helper class to analyze the step roles of all partition boundary operands.
236 //
237 // To use, call StepRoleAnalyzer::analyze and pass in a setup function that configures the analyzer
238 // with the following two methods:
239 //   - addRole: Add a step role to a boundary operand
240 //   - setUsedBy: Specify that the memory of the "source" operand may be directly used by the "dest"
241 //     operand. All of the step roles of the "dest" operand are also possible step roles of the
242 //     "source" operand. This is useful for interpreted control flow, e.g., the outer input operand
243 //     of an interpreted IF operation may be directly used as all step roles of the corresponding
244 //     input operand of the then and else models. Note that this relationship is directional --
245 //     (A->B && B->C) implies A->C, but (A->C && B->C) does not imply A->B or B->A (A->B is a
246 //     shorthand for setUsedBy(A, B)). The setup function must guarantee that the final graph
247 //     produced by the used-by relationship is acyclic. This is true for the partitioner algorithm
248 //     because there must be a root operand of each step role for the memory to be allocated on
249 //     behalf of.
250 //
251 class StepRoleAnalyzer {
252    public:
analyze(const std::function<void (StepRoleAnalyzer &)> & setup)253     static std::map<SourceOperandIndex, std::set<StepRole>> analyze(
254             const std::function<void(StepRoleAnalyzer&)>& setup) {
255         StepRoleAnalyzer analyzer;
256         setup(analyzer);
257         return analyzer.finish();
258     }
259 
addRole(const ExecutionStep & step,uint32_t operandIndex,IOType type,uint32_t stepIOIndex)260     void addRole(const ExecutionStep& step, uint32_t operandIndex, IOType type,
261                  uint32_t stepIOIndex) {
262         SourceOperandIndex source = {step.getSourceModelIndex(), operandIndex};
263         mRoles[source].emplace(step.getIndex(), type, stepIOIndex);
264     }
265 
setUsedBy(const SourceOperandIndex & source,const SourceOperandIndex & dest)266     void setUsedBy(const SourceOperandIndex& source, const SourceOperandIndex& dest) {
267         mUsedBy[source].emplace(dest);
268     }
269 
270    private:
271     StepRoleAnalyzer() = default;
272 
273     // Merges the step roles of the destination operands to the source operands
274     // and returns the final map.
finish()275     std::map<SourceOperandIndex, std::set<StepRole>> finish() {
276         for (const auto& [source, _] : mUsedBy) {
277             finishHelper(source);
278         }
279         return std::move(mRoles);
280     }
281 
finishHelper(SourceOperandIndex current)282     void finishHelper(SourceOperandIndex current) {
283         if (mProcessedOperands.count(current) > 0) return;
284         mProcessedOperands.insert(current);
285         const auto it = mUsedBy.find(current);
286         if (it != mUsedBy.end()) {
287             auto& roles = mRoles[current];
288             // Merge the step roles of the destination operands.
289             for (const auto& dest : it->second) {
290                 finishHelper(dest);
291                 const auto& destRoles = mRoles[dest];
292                 roles.insert(destRoles.begin(), destRoles.end());
293             }
294         }
295     }
296 
297     // A map from the source operand to its step roles.
298     std::map<SourceOperandIndex, std::set<StepRole>> mRoles;
299     // A map from the source operand to a set of destination operands that may directly
300     // use the memory of the source operand.
301     std::map<SourceOperandIndex, std::set<SourceOperandIndex>> mUsedBy;
302     // Used in finish to track which operand has been processed.
303     std::set<SourceOperandIndex> mProcessedOperands;
304 };
305 
306 }  // namespace
307 
vlogDump(const char * context) const308 void DynamicTemporaries::vlogDump(const char* context) const {
309     if (empty()) {
310         return;
311     }
312     if (context) {
313         VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
314     }
315     for (const auto& temp : mSourceOperandToTemporary) {
316         VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
317                         << ", stepIndex = " << temp.second.stepIndex
318                         << ", offset = " << temp.second.offset
319                         << ", dimensions = " << toString(temp.second.dimensions)
320                         << ", paddedLength = " << temp.second.paddedLength
321                         << ", alignment = " << temp.second.alignment
322                         << ", padding = " << temp.second.padding;
323     }
324 }
325 
declare(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex,const Dimensions & initialDimensions,uint32_t initialLength,uint32_t alignment,uint32_t padding)326 void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
327                                  const Dimensions& initialDimensions, uint32_t initialLength,
328                                  uint32_t alignment, uint32_t padding) {
329     VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
330                     << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
331                     << ", initialDimensions = " << toString(initialDimensions)
332                     << ", initialLength = " << initialLength << ", alignment = " << alignment
333                     << ", padding = " << padding << ")";
334     CHECK(!mDeclared);
335     CHECK_GT(initialLength, 0u);
336     const uint32_t paddedLength = roundUp(initialLength, padding);
337     auto [_, isNew] = mSourceOperandToTemporary.emplace(
338             sourceOperandIndex, InternalLocationAndShape{stepIndex, 0, initialDimensions,
339                                                          paddedLength, alignment, padding});
340     CHECK(isNew);
341     mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
342 }
343 
redeclare(SourceOperandIndex sourceOperandIndex,const Dimensions & newDimensions,uint32_t newLength)344 bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
345                                    const Dimensions& newDimensions, uint32_t newLength) {
346     auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
347         VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
348                         << toString(sourceOperandIndex)
349                         << ", newDimensions = " << toString(newDimensions)
350                         << ", newLength = " << newLength << ") -> " << toString(changedShape);
351         return changedShape;
352     };
353 
354     CHECK(mDeclared);
355     CHECK_GT(newLength, 0u);
356 
357     InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
358     const uint32_t paddedLength = roundUp(newLength, temp.padding);
359     if (temp.paddedLength == paddedLength && temp.dimensions == newDimensions) {
360         return createAndLogResult(false);
361     }
362     if (temp.paddedLength < paddedLength) {
363         // Otherwise allocation remains valid, even if it may be suboptimal
364         // (because it uses more space than needed).  Use case: Don't force
365         // client to allocate again just because the client reported more
366         // accurate shape information.
367         mAllocatedStepIndexes.erase(temp.stepIndex);
368     }
369     temp.paddedLength = paddedLength;
370     temp.dimensions = newDimensions;
371     return createAndLogResult(true);
372 }
373 
int DynamicTemporaries::allocate(uint32_t stepIndex) {
    VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";

    CHECK(mDeclared);

    // A step with no dynamic temporaries trivially succeeds.
    const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
    if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
        return ANEURALNETWORKS_NO_ERROR;
    }

    // perform layout: assign each of this step's temporaries an offset within a
    // single shared allocation, honoring each temporary's alignment.
    uint32_t newSize = 0;
    for (const auto& sourceOperandIndex : sourceOperandIndexesI->second) {
        InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
        // temp.paddedLength is already padded in declare and redeclare.
        CHECK(temp.paddedLength % temp.padding == 0);
        temp.offset = addTemporary(&newSize, temp.paddedLength, temp.alignment, kNoPadding).offset;
    }

    // perform (re-)allocation
    // TODO: Today we may shrink the allocation in order to avoid wasting memory.  Is this important
    //       to conserve memory, or do we waste time reallocating?
    const double kWaste = 0.2 /* arbitrary */;  // Willing to waste space to avoid
                                                // deallocation/reallocation overhead
    auto& memory = mStepIndexToMemory[stepIndex];
    const uint32_t oldSize = (memory ? memory->getSize() : 0);
    // Reuse the existing allocation only if it's big enough but not excessively
    // (more than kWaste fraction) larger than what's needed.
    if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
        // Suitable allocation already exists; nothing to do
    } else {
        int n;
        std::tie(n, memory) = MemoryAshmem::create(newSize);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
                       << " for step " << stepIndex;
            // Leave this step marked unallocated; lookup() will then report a
            // null memory for its temporaries.
            mAllocatedStepIndexes.erase(stepIndex);
            return n;
        }
    }

    mAllocatedStepIndexes.insert(stepIndex);
    return ANEURALNETWORKS_NO_ERROR;
}
416 
allocated(uint32_t stepIndex) const417 bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
418     return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
419             mStepIndexToSourceOperandIndexes.end()) ||
420            mAllocatedStepIndexes.count(stepIndex);
421 }
422 
lookup(SourceOperandIndex sourceOperandIndex,bool mustBeAllocated) const423 std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
424         SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
425     CHECK(mDeclared);
426     if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
427         it != mSourceOperandToTemporary.end()) {
428         const InternalLocationAndShape& temp = it->second;
429         const bool isAllocated = allocated(temp.stepIndex);
430         if (mustBeAllocated) {
431             CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
432                                << " must be allocated";
433         }
434         if (isAllocated) {
435             return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
436                                     &temp.dimensions, temp.paddedLength};
437         } else {
438             return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.paddedLength};
439         }
440     }
441     return std::nullopt;
442 }
443 
ExecutionStep(ExecutionPlan * plan,uint32_t stepIndex,uint32_t sourceModelIndex,std::shared_ptr<Device> device)444 ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
445                              std::shared_ptr<Device> device)
446     : mPlan(plan),
447       mIndex(stepIndex),
448       mSourceModelIndex(sourceModelIndex),
449       mStepModel(),
450       mDevice(device),
451       mToken(plan->getCacheToken()) {}
452 
453 // Adds an operand if it has not been added already.
454 // Sets the index in the step model for the corresponding operand.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        // An operand may only be seen a second time as an input: outputs are
        // expected to be added first, by their (single) defining operation.
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model, mirroring the type, dimensions, and
    // quantization parameters of the source operand.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Copy per-channel quantization or extension extraParams, if any.
    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value, and records cross-partition bookkeeping, according to the
    // source operand's lifetime.
    switch (operand.lifetime) {
        case Operand::LifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
        case Operand::LifeTime::CONSTANT_REFERENCE: {
            const RuntimeMemory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
        } break;
        case Operand::LifeTime::NO_VALUE: {
            // An omitted operand: represented by a null value of zero length.
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
        } break;
        case Operand::LifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case Operand::LifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
                // It may be an input to a different partition, so keep track of
                // it.
                mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                       mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH: {
            // An operand whose value is a referenced model (control flow body).
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
        } break;
        case Operand::LifeTime::POINTER: {
            const void* data = std::get<const void*>(operand.location.pointer);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
    }
    return n;
}
554 
addOperation(int operationIndex)555 int ExecutionStep::addOperation(int operationIndex) {
556     const Operation& operation = getSourceModel()->getOperation(operationIndex);
557     if (mToken.ok()) {
558         mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
559         mToken.update(&operationIndex, sizeof(operationIndex));
560     }
561 
562     // Convert the input and output operand indexes.
563     //
564     // We expect operations to be added in topological order.  Therefore:
565     //
566     // - We may not have seen an input if it is a model input, a
567     //   constant, or an operand written by a different partition.
568     //
569     // - We should not have seen any outputs.
570     auto addOperands = [this](const std::vector<uint32_t>& sourceModelOperands,
571                               std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
572         const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
573         for (uint32_t i = 0; i < operandCount; i++) {
574             NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
575         }
576         return ANEURALNETWORKS_NO_ERROR;
577     };
578 
579     const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
580     const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
581     std::vector<uint32_t> inputs(inputCount);
582     std::vector<uint32_t> outputs(outputCount);
583     NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
584     NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
585     return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
586                                    outputCount, outputs.data());
587 }
588 
// Configures a StepExecutor with the concrete data locations of this step's
// inputs and outputs.
//
// Each step-model input/output is resolved by looking up its source operand
// in a series of maps. The order of the lookups matters: static temporaries
// are tried first, then dynamic temporaries, then main-model inputs/outputs,
// and finally (for inputs only) constant partition boundary operands. An
// operand that matches none of these is a plan-construction bug (CHECK).
void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor,
        const std::vector<OutputShape>* mainModelOutputShapes, const RuntimeMemory* temporaryMemory,
        const std::map<SourceOperandIndex, StaticTemporaryLocation>&
                sourceOperandToLocationOfTemporary,
        const DynamicTemporaries& dynamicTemporaries,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    // Resolves one step input. stepModelOperandIndex identifies the operand in
    // the source model; stepInputIndex is its position in this step's inputs.
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            // Static temporary: a fixed offset within the shared temporaryMemory.
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, loc.offset,
                                         loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            // Dynamic temporary: memory and dimensions determined at execution time.
            executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset,
                                         loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            // Main model input.
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            // Main model output consumed as a downstream step's input; pass the
            // output's dimensions (if known) so the executor can validate them.
            executor->mapOutputToInput(it->second, stepInputIndex,
                                       mainModelOutputShapes
                                               ? &mainModelOutputShapes->at(it->second).dimensions
                                               : nullptr);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, loc.memory, loc.offset, loc.length);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    // Resolves one step output. Outputs can be static/dynamic temporaries or
    // main model outputs; constants and main model inputs are not valid here.
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, loc.offset,
                                          loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
                                          loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    // mStepModelInputs/mStepModelOutputs pair each source operand index with
    // its step-model operand index; position in the vector is the step index.
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}
654 
findModelOutputsThatAreDownstreamInputs()655 void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
656     auto declareModelOutputIsDownstreamInput =
657             [this](const SourceOperandIndex& sourceOperandIndex) {
658                 const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
659                 CHECK(it != mOutputToDefiningExecutionStep.end());
660                 uint32_t stepIndex = it->second;
661                 CHECK_LT(stepIndex, mSteps.size());
662                 VLOG(COMPILATION)
663                         << "ExecutionStep(" << stepIndex
664                         << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
665                         << toString(sourceOperandIndex) << ")";
666                 CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
667                       mSourceOperandToOutputIndex.end());
668                 mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
669                         mSourceOperandToOutputIndex.at(sourceOperandIndex));
670             };
671     for (const auto& logicalStep : mSteps) {
672         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
673             for (const auto& output : step->getOutputsAsStepModelInputs()) {
674                 SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
675                 declareModelOutputIsDownstreamInput(sourceOperandIndex);
676             }
677         }
678     }
679 }
680 
findTempsAsStepModelOutputs()681 void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
682     auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
683         const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
684         if (it == mTemporaryToDefiningExecutionStep.end()) {
685             // The operand is not a temporary or is not defined by an
686             // ExecutionStep (i.e. it's an output of an IF or a WHILE).
687             // The latter case is handled by ExecutionPlan::makeController().
688             return;
689         }
690         uint32_t stepIndex = it->second;
691         CHECK_LT(stepIndex, mSteps.size());
692         mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
693     };
694     for (const auto& logicalStep : mSteps) {
695         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
696             for (const auto& input : step->getTempsAsStepModelInputs()) {
697                 SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
698                 recordAsOutputIfTemporary(sourceOperandIndex);
699             }
700         } else if (const IfStep* step = logicalStep->tryIfStep()) {
701             recordAsOutputIfTemporary(step->conditionOperandIndex);
702             for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
703                 recordAsOutputIfTemporary(sourceOperandIndex);
704             }
705         } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
706             for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
707                 recordAsOutputIfTemporary(sourceOperandIndex);
708             }
709         } else {
710             CHECK(logicalStep->isGoto());
711         }
712     }
713 }
714 
declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex)715 void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
716     VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
717                       << mainModelOutputIndex << ")";
718     const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
719                               mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
720     CHECK(it != mOutputIndexStepModelToMainModel.end());
721     const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
722     CHECK(stepModelOutputIndex < mModelOutputs.size());
723     mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
724 }
725 
recordTempAsStepModelOutput(uint32_t stepOperandIndex)726 void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
727     const auto it = mOperandMap.find(stepOperandIndex);
728     CHECK(it != mOperandMap.end());
729     mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
730 }
731 
getSourceModel() const732 const ModelBuilder* ExecutionStep::getSourceModel() const {
733     return mPlan->getSourceModels().getModel(mSourceModelIndex);
734 }
735 
logStepModel() const736 void ExecutionStep::logStepModel() const {
737     VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
738 
739     auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
740         if (!toLog.empty()) {
741             toLog += ", ";
742         }
743         toLog += toString(e.first);
744         toLog += "->";
745         toLog += toString(e.second);
746     };
747 
748     auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
749         std::string toLog;
750         for (const auto& e : map) {
751             logRemapEntry(toLog, e);
752         }
753         VLOG(COMPILATION) << name << ": " << toLog;
754     };
755     auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
756         std::string toLog;
757         for (const auto& e : set) {
758             logRemapEntry(toLog, e);
759         }
760         VLOG(COMPILATION) << name << ": " << toLog;
761     };
762 
763     logRemapVector("step model inputs", mStepModelInputs);
764     logRemapVector("step model outputs", mStepModelOutputs);
765     logRemapVector("model inputs", mModelInputs);
766     logRemapVector("model outputs", mModelOutputs);
767     logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
768     logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
769     logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
770 }
771 
hasUnknownSize(const Operand & operand)772 static bool hasUnknownSize(const Operand& operand) {
773     if (operand.dimensions.empty()) {
774         return TypeManager::get()->isTensorType(operand.type);
775     }
776     for (const Dimension& dimension : operand.dimensions) {
777         if (dimension == 0) {
778             return true;
779         }
780     }
781     return false;
782 }
783 
// Finalizes the step model and compiles it on the assigned device.
//
// Detects temps of unknown size (setting *hasOutputOfUnknownSize to true if
// any are found; it is never reset to false here), assembles the step model's
// full input/output lists, builds the step-to-main-model index remapping
// tables (for steps of the main model only), identifies inputs/outputs on the
// underlying ModelBuilder, finishes it, and compiles.
//
// Returns an ANEURALNETWORKS_* result code. A step model with no inputs or no
// outputs returns ANEURALNETWORKS_OP_FAILED (not BAD_DATA) so that full CPU
// fallback can still be attempted.
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    // A temp consumed by another step whose size is not fully specified is a
    // "dynamic temporary"; report it to the caller.
    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first
                              << " of source graph) has unknown size: " << operand;
        }
    }

    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    // Assemble the full ordered input list: main model inputs first, then
    // temps produced by other steps, then main model outputs consumed as
    // inputs. The remapping tables built below rely on this exact order.
    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    // Assemble the full ordered output list: main model outputs first, then
    // temps consumed by other steps.
    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    // A step model with no inputs or no outputs is an invalid model. Note that we would like to
    // attempt full CPU fallback if allowed, so we return OP_FAILED here rather than BAD_DATA from
    // model validation.
    if (hasNoInputsOrNoOutputs()) {
        VLOG(COMPILATION) << "ExecutionStep::finishStepModel: finishing step model with no inputs "
                             "or no outputs";
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mSourceModelIndex == kMainModelInSourceModels) {
        // Invert the main model's input/output lists: operand index -> I/O index.
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies
        // on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    // Declare the assembled inputs/outputs on the ModelBuilder and finish it.
    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheInfo(),
                   &mToken, {}, &mPreparedStepModel);
}
878 
dump() const879 void ExecutionStep::dump() const {
880     if (VLOG_IS_ON(COMPILATION)) {
881         VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
882         logModelToInfo(mStepModel.makeModel());
883     }
884 }
885 
operator <<(std::ostream & os,const IfStep & step)886 std::ostream& operator<<(std::ostream& os, const IfStep& step) {
887     return os << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
888               << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
889 }
890 
operator <<(std::ostream & os,const WhileStep & step)891 std::ostream& operator<<(std::ostream& os, const WhileStep& step) {
892     return os << "Step#" << step.index << ": while cond=" << step.condStepIndex
893               << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
894 }
895 
operator <<(std::ostream & os,const GotoStep & step)896 std::ostream& operator<<(std::ostream& os, const GotoStep& step) {
897     return os << "Step#" << step.index << ": goto " << step.gotoStepIndex;
898 }
899 
dump() const900 void LogicalStep::dump() const {
901     if (VLOG_IS_ON(COMPILATION)) {
902         if (const IfStep* step = tryIfStep()) {
903             VLOG(COMPILATION) << *step;
904         } else if (const WhileStep* step = tryWhileStep()) {
905             VLOG(COMPILATION) << *step;
906         } else if (const GotoStep* step = tryGotoStep()) {
907             VLOG(COMPILATION) << *step;
908         } else {
909             executionStep()->dump();
910         }
911     }
912 }
913 
finish(const SourceModels * sourceModels,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)914 int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
915                                         int32_t executionPreference, int32_t priority,
916                                         const OptionalTimePoint& deadline,
917                                         const std::vector<TokenValuePair>& metadata,
918                                         int simulateFailureResultCode) {
919     CHECK(!mSuccessfulFinish);
920     CHECK(!deadline.has_value());
921     CHECK(metadata.empty());
922 
923     const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);
924 
925     auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
926         for (const auto& sourceOperandIndex : operands) {
927             const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
928             const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
929             if (hasUnknownSize(operand)) {
930                 return true;
931             }
932         }
933         return false;
934     };
935 
936     findTempsAsStepModelOutputs();
937     for (const auto& logicalStep : mSteps) {
938         if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
939             bool stepHasDynamicTemporaries = false;
940             int n = step->finishStepModel(mainModel, &stepHasDynamicTemporaries,
941                                           executionPreference, priority);
942             if (stepHasDynamicTemporaries) {
943                 mHasDynamicTemporaries = true;
944                 if (!isCompliantVersion(kHalVersionV1_2ToApi.canonical,
945                                         step->getDevice()->getFeatureLevel())) {
946                     // Until HAL 1.2, an Operand with lifetime SUBGRAPH_OUTPUT
947                     // must have fully specified dimensions either in the
948                     // Operand or in the RequestArgument.  In the case of a
949                     // dynamic temporary, we won't be able to supply fully
950                     // specified dimensions in either.
951                     VLOG(COMPILATION)
952                             << "ExecutionPlan::CompoundBody::finish -- step#" << step->getIndex()
953                             << " defines dynamic temporaries but is scheduled on pre-1.2 device "
954                             << step->getDevice()->getName();
955                     if (n == ANEURALNETWORKS_NO_ERROR) {
956                         n = ANEURALNETWORKS_OP_FAILED;
957                     }
958                 }
959             }
960             if (n != ANEURALNETWORKS_NO_ERROR) {
961                 VLOG(COMPILATION)
962                         << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
963                 return n;
964             }
965         } else if (IfStep* step = logicalStep->tryIfStep()) {
966             // The partitioner does not support dynamic temporaries (b/132458982).
967             CHECK(!containsUnknownSize(step->outerInputOperands));
968             CHECK(!containsUnknownSize(step->outerOutputOperands));
969             // step->conditionOperandIndex has a static shape. See b/158557728.
970             CHECK(!containsUnknownSize(step->thenBranchInputOperands));
971             CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
972             CHECK(!containsUnknownSize(step->elseBranchInputOperands));
973             CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
974         } else if (WhileStep* step = logicalStep->tryWhileStep()) {
975             // The partitioner does not support dynamic temporaries (b/132458982).
976             CHECK(!containsUnknownSize(step->outerInputOperands));
977             CHECK(!containsUnknownSize(step->outerOutputOperands));
978             CHECK(!containsUnknownSize(step->condInputOperands));
979             // step->condOutputOperand has a static shape. See b/158557728.
980             CHECK(!containsUnknownSize(step->bodyInputOperands));
981             CHECK(!containsUnknownSize(step->bodyOutputOperands));
982         } else {
983             CHECK(logicalStep->isGoto());
984         }
985     }
986 
987     if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
988         VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode "
989                           << simulateFailureResultCode;
990         return simulateFailureResultCode;
991     }
992 
993     for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
994         SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
995         mSourceOperandToInputIndex[index] = i;
996     }
997     for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
998         SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
999         mSourceOperandToOutputIndex[index] = i;
1000     }
1001 
1002     findControlFlowBoundaryConstants(sourceModels);
1003     findModelOutputsThatAreDownstreamInputs();
1004     findMemoryStepRoles();
1005 
1006     mSuccessfulFinish = true;
1007     LOG(INFO) << "ExecutionPlan::CompoundBody::finish: compilation finished successfully";
1008     return ANEURALNETWORKS_NO_ERROR;
1009 }
1010 
findControlFlowBoundaryConstants(const SourceModels * sourceModels)1011 void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
1012         const SourceModels* sourceModels) {
1013     auto handleBoundaryConstants = [this,
1014                                     sourceModels](const SourceOperandIndex& sourceOperandIndex) {
1015         const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
1016         const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
1017         const DataLocation& location = operand.location;
1018         if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY) {
1019             mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
1020                     .buffer = sourceModel->getPointerToOperandValue(location.offset),
1021                     .length = location.length,
1022             };
1023         } else if (operand.lifetime == Operand::LifeTime::POINTER) {
1024             mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
1025                     .buffer = static_cast<const uint8_t*>(std::get<const void*>(location.pointer)),
1026                     .length = location.length,
1027             };
1028         } else if (operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE) {
1029             mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
1030                     .memory = sourceModel->getMemories()[location.poolIndex],
1031                     .offset = location.offset,
1032                     .length = location.length,
1033             };
1034         }
1035     };
1036     for (const auto& logicalStep : mSteps) {
1037         if (const IfStep* step = logicalStep->tryIfStep()) {
1038             handleBoundaryConstants(step->conditionOperandIndex);
1039             for (const auto& sourceOperandIndex : step->outerInputOperands) {
1040                 handleBoundaryConstants(sourceOperandIndex);
1041             }
1042         } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1043             for (const auto& sourceOperandIndex : step->outerInputOperands) {
1044                 handleBoundaryConstants(sourceOperandIndex);
1045             }
1046         }
1047     }
1048 }
1049 
// Computes, for every source operand that may be backed by a client-visible
// memory, the set of step roles (step, IO type, index) in which that memory
// may be used, storing the result in mSourceOperandToStepRoles.
//
// ExecutionSteps contribute their step model inputs/outputs directly;
// interpreted IF/WHILE steps contribute "used-by" links so that a role
// attached to an outer operand propagates to the inner operands whose
// memories may alias it.
void ExecutionPlan::CompoundBody::findMemoryStepRoles() {
    mSourceOperandToStepRoles = StepRoleAnalyzer::analyze([this](StepRoleAnalyzer& analyzer) {
        for (const auto& logicalStep : mSteps) {
            if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
                // Each step model input/output is a direct role: the memory
                // backing the source operand is used at that I/O position.
                const auto& stepModelInputs = step->getStepModelInputs();
                for (uint32_t i = 0; i < stepModelInputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelInputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::INPUT, i);
                }
                const auto& stepModelOutputs = step->getStepModelOutputs();
                for (uint32_t i = 0; i < stepModelOutputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelOutputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::OUTPUT, i);
                }
            } else if (const IfStep* step = logicalStep->tryIfStep()) {
                // See ExecutionPlan::nextCompound(const IfStep*, ...).
                //
                // For interpreted IF operation, the outer input memories may be directly used by
                // the SUBGRAPH_INPUTs of the then and else model.
                CHECK_EQ(step->thenBranchInputOperands.size(), step->outerInputOperands.size());
                CHECK_EQ(step->elseBranchInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->thenBranchInputOperands[i]);
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->elseBranchInputOperands[i]);
                }
                // For interpreted IF operation, the outer output memories may be directly used by
                // the SUBGRAPH_OUTPUTs of the then and else model.
                CHECK_EQ(step->thenBranchOutputOperands.size(), step->outerOutputOperands.size());
                CHECK_EQ(step->elseBranchOutputOperands.size(), step->outerOutputOperands.size());
                for (uint32_t i = 0; i < step->outerOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->thenBranchOutputOperands[i]);
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->elseBranchOutputOperands[i]);
                }
            } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
                // See ExecutionPlan::nextCompound(const WhileStep*, ...).
                //
                // For interpreted WHILE operation, the following memories are involved:
                // a. the outer input memories to the WHILE operation
                // b. the outer output memories to the WHILE operation
                // c. the output memory of the condition model
                // d. one set of output memories of the body model
                // e. another set of output memories of the body model
                //
                // The memories are used in the following ways:
                //
                // - Condition model:
                //   * In the first iteration: inputs use (a); output uses (c)
                //   * In the following iterations: inputs use (d) or (e) for input-output and
                //     state-only operands, and (a) for input-only operands; output uses (c)
                //
                // - Body model:
                //   * In all iterations: inputs are the same as the condition model; outputs use
                //                        (d) or (e)
                //
                // Therefore, we configure the analyzer with the following used-by relationships:
                // - The outer input memories (a) may be directly used by the SUBGRAPH_INPUTs of
                //   the condition model for all inputs in the first iteration, as well as the
                //   input-only operands in the following iterations.
                CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i], step->condInputOperands[i]);
                }
                // - The output memories of the body model (d) and (e) may be directly used by the
                //   SUBGRAPH_INPUTs of the condition model for input-output and state-only operands
                //   after the first iteration.
                CHECK_GE(step->condInputOperands.size(), step->bodyOutputOperands.size());
                for (uint32_t i = 0; i < step->bodyOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->bodyOutputOperands[i], step->condInputOperands[i]);
                }
                // - The SUBGRAPH_INPUTs of the condition model are directly used by the
                //   SUBGRAPH_INPUTs of the body model for all inputs in all iterations.
                CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
                for (uint32_t i = 0; i < step->bodyInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->condInputOperands[i], step->bodyInputOperands[i]);
                }
            } else if (logicalStep->isGoto()) {
                // Nothing to do.
            } else {
                CHECK(false) << "Unexpected LogicalStep kind";
            }
        }
    });
}
1137 
finish(const SourceModels *,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)1138 int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
1139                                       int32_t priority, const OptionalTimePoint& deadline,
1140                                       const std::vector<TokenValuePair>& metadata,
1141                                       int simulateFailureResultCode) {
1142     CHECK(!mSuccessfulFinish);
1143     CHECK(mDevice != nullptr);
1144     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
1145     int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheInfo,
1146                     &mToken, metadata, &mPreparedModel);
1147     if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
1148         VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
1149                           << simulateFailureResultCode;
1150         n = simulateFailureResultCode;
1151     }
1152     mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
1153     if (mSuccessfulFinish) {
1154         LOG(INFO) << "ExecutionPlan::SimpleBody::finish: compilation finished successfully on "
1155                   << mDevice->getName();
1156     }
1157     return n;
1158 }
1159 
// Finishes compilation of the plan by delegating to the plan body
// (SimpleBody or CompoundBody). Requires that the body has been created.
int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
                          const OptionalTimePoint& deadline,
                          const std::vector<TokenValuePair>& metadata,
                          int simulateFailureResultCode) {
    CHECK(mBody != nullptr);
    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline, metadata,
                         simulateFailureResultCode);
}
1168 
// Per-execution state for a compound plan. Allocates the shared ashmem region
// that backs all static inter-partition temporaries and copies boundary
// CONSTANT_COPY operand values into their reserved slots.
ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
        DynamicTemporaries dynamicTemporaries)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToLocationOfTemporary(std::move(sourceOperandToLocationOfTemporary)),
      mSourceOperandToLocationOfTemporary2(std::move(sourceOperandToLocationOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mDynamicTemporaries(std::move(dynamicTemporaries)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        // No static temporaries to allocate or populate.
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // FIX: must bail out here -- mTemporaries is null when creation fails,
        // and the constant-copy loop below would dereference it.
        return;
    }
    // Copy each boundary CONSTANT_COPY operand's value into the slot reserved
    // for it in the shared temporary memory.
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() +
                       mSourceOperandToLocationOfTemporary[sourceOperandIndex].offset,
               location.buffer, location.length);
    }
}
1206 
1207 // Attempt to create a burst object for each PreparedModel/Partition. If the
1208 // burst controller object cannot be made, return a nullptr in its place to
1209 // indicate the regular execution path should be used. This can occur either
1210 // because PreparedModel was nullptr (cpu was best choice), or because the
1211 // IPreparedModel was of insufficient version or failed to configure the burst.
makeBursts() const1212 std::vector<SharedBurst> ExecutionPlan::makeBursts() const {
1213     switch (mState) {
1214         // burst object for each partition in the compound case
1215         case COMPOUND: {
1216             std::vector<SharedBurst> bursts;
1217             bursts.reserve(compound()->mSteps.size());
1218             for (const auto& logicalStep : compound()->mSteps) {
1219                 if (!logicalStep->isExecution()) {
1220                     bursts.push_back(nullptr);
1221                     continue;
1222                 }
1223                 if (const auto preparedModel =
1224                             logicalStep->executionStep()->getPreparedStepModel()) {
1225                     const auto maybeBurst = preparedModel->configureExecutionBurst();
1226                     if (!maybeBurst.has_value()) {
1227                         LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1228                                    << maybeBurst.error().code << ": " << maybeBurst.error().message;
1229                     }
1230                     bursts.push_back(maybeBurst.value_or(nullptr));
1231                 } else {
1232                     bursts.push_back(nullptr);
1233                 }
1234             }
1235             return bursts;
1236         }
1237         // single burst object for the simple case
1238         case SIMPLE: {
1239             std::vector<SharedBurst> burst;
1240             auto simpleBody = simple();
1241             if (const auto preparedModel = simpleBody->mPreparedModel) {
1242                 const auto maybeBurst = preparedModel->configureExecutionBurst();
1243                 if (!maybeBurst.has_value()) {
1244                     LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1245                                << maybeBurst.error().code << ": " << maybeBurst.error().message;
1246                 }
1247                 burst.push_back(maybeBurst.value_or(nullptr));
1248             } else {
1249                 burst.push_back(nullptr);
1250             }
1251             return burst;
1252         }
1253         // no burst objects made
1254         default:
1255             return {};
1256     }
1257 }
1258 
// Builds the per-execution Controller for a COMPOUND plan: computes the layout
// of a single shared memory region for all fixed-size inter-partition
// temporaries (plus control flow buffers), reserves slots for boundary
// constants, and registers unknown-size (dynamic) temporaries for later
// allocation by DynamicTemporaries::allocate().
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    CHECK(isValid());
    CHECK(mState != SIMPLE);
    const auto* body = compound();
    // Create the layout for a RuntimeMemory object big enough to hold
    // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
    // - buffers required by the control flow implementation.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    // This function has two modes of operation:
    // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
    //    TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
    //    skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
    //    skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
    //    operand of another lifetime.
    // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
    //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
    //    of another lifetime.
    auto mapTemporary = [body, executionBuilder, &totalSizeOfTemporaries](
                                const SourceOperandIndex& sourceOperandIndex,
                                std::map<SourceOperandIndex, StaticTemporaryLocation>*
                                        sourceOperandToLocationOfTemporary,
                                Operand::LifeTime lifetime =
                                        Operand::LifeTime::TEMPORARY_VARIABLE) {
        CHECK(lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
              lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT);
        const Operand& sourceOperand = executionBuilder->getSourceOperand(sourceOperandIndex);
        if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE &&
            sourceOperand.lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
            // See the caller for explanation.
            return;
        }
        CHECK_EQ(sourceOperand.lifetime, lifetime);
        const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
        if (size != 0u) {
            // Known size: reserve a slot in the shared region, honoring the
            // alignment/padding preferred by the devices touching this operand.
            const auto memoryPreference =
                    body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
            const auto loc = addTemporary(&totalSizeOfTemporaries, size, memoryPreference.alignment,
                                          memoryPreference.padding);
            auto [_, isNew] = sourceOperandToLocationOfTemporary->emplace(sourceOperandIndex, loc);
            CHECK(isNew);
            VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                            << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
        } else {
            // Unknown size, hence dynamic temporary.  The mapping will
            // be established elsewhere (DynamicTemporaries::allocate()).
            CHECK_EQ(lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
            CHECK_EQ(sourceOperand.lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
        }
    };
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary;
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2;
    for (const auto& logicalStep : body->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToLocationOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of an WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary2,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToLocationOfTemporary,
                         Operand::LifeTime::SUBGRAPH_OUTPUT);
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands.
    for (const auto& [sourceOperandIndex, location] : body->mSourceOperandToBoundaryConstantCopy) {
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        const auto loc = addTemporary(&totalSizeOfTemporaries, location.length,
                                      memoryPreference.alignment, memoryPreference.padding);
        sourceOperandToLocationOfTemporary.emplace(sourceOperandIndex, loc);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
    }
    // Collect dynamic temporaries.
    // TODO(b/157236079): Move some or all of this work to compilation time?
    DynamicTemporaries dynamicTemporaries;
    const TypeManager* typeManager = TypeManager::get();
    forEachDynamicTemporary([body, typeManager, &dynamicTemporaries](
                                    SourceOperandIndex sourceOperandIndex,
                                    const Operand& sourceOperand, uint32_t definingStepIndex) {
        CHECK(typeManager->isTensorType(sourceOperand.type));
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        // TODO: For now we guess an initial size equal to element
        // size, which is overly conservative.
        const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
        dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
                                   size, memoryPreference.alignment, memoryPreference.padding);
    });
    dynamicTemporaries.endDeclarations();
    dynamicTemporaries.vlogDump("finished declarations");

    // The Controller constructor performs the actual ashmem allocation for
    // totalSizeOfTemporaries and copies the boundary constants into it.
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToLocationOfTemporary),
            std::move(sourceOperandToLocationOfTemporary2), body->mSourceOperandToInputIndex,
            body->mSourceOperandToOutputIndex, body->mSourceOperandToBoundaryConstantCopy,
            body->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
}
1440 
1441 // TODO: Find a better way to provide this functionality.
fallback(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1442 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
1443                             std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1444                             const std::vector<OutputShape>* mainModelOutputShapes) const {
1445     *executor = nullptr;
1446     if (burstController != nullptr) {
1447         *burstController = nullptr;
1448     }
1449 
1450     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
1451                     << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
1452 
1453     if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
1454         // We haven't called next().
1455         return ANEURALNETWORKS_OP_FAILED;
1456     }
1457 
1458     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1459         // The last call to next() did not produce an executor.
1460         return ANEURALNETWORKS_OP_FAILED;
1461     }
1462 
1463     controller->mNextStepIndex = controller->mFallbackNextStepIndex;
1464     return next(controller, executor, burstController, mainModelOutputShapes);
1465 }
1466 
// Wraps a raw client-provided buffer of `size` bytes as a Buffer view
// starting at offset 0.
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}
1470 
// Wraps an existing memory pool as a Buffer view starting at `offset`.
ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}
1473 
getPointer() const1474 void* ExecutionPlan::Buffer::getPointer() const {
1475     return mInfo.getBuffer() + mOffset;
1476 }
1477 
getSize() const1478 uint32_t ExecutionPlan::Buffer::getSize() const {
1479     return mInfo.getSize() - mOffset;
1480 }
1481 
// Delegates to RunTimePoolInfo::flush() for the underlying pool.
void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}
1485 
getBufferFromModelArgumentInfo(const ModelArgumentInfo & info,const ExecutionBuilder * executionBuilder) const1486 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
1487         const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
1488     switch (info.state()) {
1489         case ModelArgumentInfo::POINTER: {
1490             return Buffer(info.buffer(), info.length());
1491         } break;
1492         case ModelArgumentInfo::MEMORY: {
1493             if (std::optional<RunTimePoolInfo> poolInfo =
1494                         executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
1495                 return Buffer(*poolInfo, info.locationAndLength().offset);
1496             } else {
1497                 LOG(ERROR) << "Unable to map operand memory pool";
1498                 return std::nullopt;
1499             }
1500         } break;
1501         case ModelArgumentInfo::HAS_NO_VALUE: {
1502             LOG(ERROR) << "Attempting to read an operand that has no value";
1503             return std::nullopt;
1504         } break;
1505         default: {
1506             LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
1507             return std::nullopt;
1508         } break;
1509     }
1510 }
1511 
getBuffer(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex) const1512 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
1513         std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
1514     const auto& sourceOperandToLocationOfTemporary =
1515             controller->mSourceOperandToLocationOfTemporary;
1516     const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
1517     const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
1518     const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
1519     if (auto it = sourceOperandToLocationOfTemporary.find(operandIndex);
1520         it != sourceOperandToLocationOfTemporary.end()) {
1521         const uint32_t offset = it->second.offset;
1522         const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
1523         return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
1524     } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
1525                it != sourceOperandToInputIndex.end()) {
1526         const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
1527         return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1528     } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
1529                it != sourceOperandToOutputIndex.end()) {
1530         const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
1531         return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1532     } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
1533                it != sourceOperandToConstantReference.end()) {
1534         const ConstantReferenceLocation& location = it->second;
1535         const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
1536         if (info == std::nullopt) {
1537             return std::nullopt;
1538         }
1539         return Buffer(info->getBuffer() + location.offset, location.length);
1540     }
1541     return std::nullopt;
1542 }
1543 
readConditionValue(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex,bool * value) const1544 int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
1545                                       SourceOperandIndex operandIndex, bool* value) const {
1546     std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
1547     if (buffer == std::nullopt) {
1548         LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
1549         return ANEURALNETWORKS_OP_FAILED;
1550     }
1551     CHECK_GE(buffer->getSize(), sizeof(bool8));
1552     bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
1553     *value = static_cast<bool>(value8);
1554     VLOG(EXECUTION) << "readConditionValue: " << *value;
1555     return ANEURALNETWORKS_NO_ERROR;
1556 }
1557 
next(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes,int syncFdOfLastStep) const1558 int ExecutionPlan::next(std::shared_ptr<Controller> controller,
1559                         std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1560                         const std::vector<OutputShape>* mainModelOutputShapes,
1561                         int syncFdOfLastStep) const {
1562     CHECK(mState == COMPOUND);
1563 
1564     controller->mLastStepSyncFd = syncFdOfLastStep;
1565     *executor = nullptr;
1566     if (burstController != nullptr) {
1567         *burstController = nullptr;
1568     }
1569 
1570     VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
1571                     << "): mNextStepIndex = " << controller->mNextStepIndex;
1572 
1573     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1574         return ANEURALNETWORKS_OP_FAILED;
1575     }
1576 
1577     return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1578 }
1579 
nextCompound(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1580 int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
1581                                 std::shared_ptr<StepExecutor>* executor,
1582                                 SharedBurst* burstController,
1583                                 const std::vector<OutputShape>* mainModelOutputShapes) const {
1584     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1585         return ANEURALNETWORKS_OP_FAILED;
1586     }
1587 
1588     auto compoundBody = compound();
1589     if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
1590         controller->mNextStepIndex = Controller::kBadStepIndex;  // end
1591         return ANEURALNETWORKS_NO_ERROR;
1592     }
1593 
1594     const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
1595     if (const IfStep* step = logicalStep->tryIfStep()) {
1596         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1597     } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1598         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1599     } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
1600         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1601     } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1602         return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1603     } else {
1604         CHECK(false) << "Unknown step variant";
1605         return ANEURALNETWORKS_BAD_STATE;
1606     }
1607 }
1608 
// Prepares the StepExecutor (and optional burst controller) for one partition
// execution step, then advances the controller past it. The statement order
// below matters: dynamic temporaries must be allocated before the executor's
// inputs/outputs are mapped onto them.
int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();

    // Allocate (or grow) the dynamic temporaries written by this step.
    NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
    controller->mDynamicTemporaries.vlogDump("finished allocating for a step");

    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                               step->getDevice(), step->getPreparedStepModel(),
                                               /*reusable=*/false, step,
                                               &controller->mDynamicTemporaries);

    // Bind every step model input/output to its concrete memory location
    // (shared temporaries, dynamic temporaries, main model I/O, or constants).
    step->mapInputsAndOutputs(
            *executor, mainModelOutputShapes, controller->mTemporaries.get(),
            controller->mSourceOperandToLocationOfTemporary, controller->mDynamicTemporaries,
            controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
            controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    // Remember this step so fallback() can retry it, then advance.
    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
1637 
1638 // The first argument is the "source" operand, the second operand is the "destination".
// Makes the inner (step/branch model) input operand resolve to the same buffer
// as the outer operand: whichever location category the outer operand is found
// in, the inner operand is registered there with the same location value.
void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
                                         const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // The inner operand must appear in at most one location map.
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToInputIndex.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand) +
                     mSourceOperandToConstantReference.count(innerOperand),
             1u);
#endif
    // Remove any stale mapping for the inner operand before looking up the
    // outer one, so a previous binding never shadows the new one.
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToInputIndex.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    mSourceOperandToConstantReference.erase(innerOperand);
    // Copy the outer operand's location into the matching map for the inner
    // operand. Exactly one of these lookups is expected to succeed.
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
               it != mSourceOperandToInputIndex.end()) {
        mSourceOperandToInputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
               it != mSourceOperandToConstantReference.end()) {
        mSourceOperandToConstantReference.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1671 
1672 // The first argument is the "source" operand, the second operand is the "destination".
setOutput(const SourceOperandIndex & outerOperand,const SourceOperandIndex & innerOperand)1673 void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
1674                                           const SourceOperandIndex& innerOperand) {
1675     VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
1676                     << toString(outerOperand);
1677 #ifdef NN_DEBUGGABLE
1678     CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
1679                      mSourceOperandToOutputIndex.count(innerOperand),
1680              1u);
1681 #endif
1682     mSourceOperandToLocationOfTemporary.erase(innerOperand);
1683     mSourceOperandToOutputIndex.erase(innerOperand);
1684     if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
1685         it != mSourceOperandToLocationOfTemporary.end()) {
1686         mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
1687     } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1688                it != mSourceOperandToOutputIndex.end()) {
1689         mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1690     } else {
1691         CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
1692                      << " from operand " << toString(outerOperand);
1693     }
1694 }
1695 
waitForLastStepSyncFence() const1696 int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1697     if (mLastStepSyncFd == -1) {
1698         return ANEURALNETWORKS_NO_ERROR;
1699     }
1700     VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1701     auto r = syncWait(mLastStepSyncFd, -1);
1702     int n = ANEURALNETWORKS_NO_ERROR;
1703     if (r != FenceState::SIGNALED) {
1704         LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1705         n = ANEURALNETWORKS_OP_FAILED;
1706     }
1707     return n;
1708 }
1709 
1710 // Invocations of Controller::setInput/setOutput in this function must match with invocations of
1711 // StepRoleAnalyzer::setUsedBy in the IfStep branch in
1712 // ExecutionPlan::CompoundBody::findMemoryStepRoles.
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;
    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    // Pick the then- or else-branch step and its operand lists based on the condition value.
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    // Dispatch to the chosen branch's first step.
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1752 
1753 // Invocations of Controller::setInput in this function must match with invocations of
1754 // StepRoleAnalyzer::setUsedBy in the WhileStep branch in
1755 // ExecutionPlan::CompoundBody::findMemoryStepRoles.
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    // Interpreted WHILE execution alternates between two stages kept in WhileState:
    // EVALUATE_CONDITION (schedule the condition model) and EVALUATE_BODY (read the
    // condition result, then either schedule the body model or exit the loop).
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        // kOutsideLoop marks the first visit to this WHILE; afterwards, count iterations.
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        if (state.iteration == 0) {
            // Start the loop timeout clock on the first iteration.
            state.startTime = Clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            // Inputs beyond the body-output count are never rewritten by the body,
            // so they always come from the outer inputs.
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController, mainModelOutputShapes);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    // Enforce the loop timeout before scheduling another condition/body evaluation.
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = Clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        // The body reads exactly what the condition just read.
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        if (state.iteration != 0) {
            // Swap the two temporary buffers (tmp1/tmp2) backing each body output
            // to implement the double buffering described in the table above.
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToLocationOfTemporary[outputOperand],
                          controller->mSourceOperandToLocationOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        // Reset so a later re-entry into this WHILE starts at iteration 0 again.
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1877 
nextCompound(const GotoStep * step,std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1878 int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
1879                                 std::shared_ptr<StepExecutor>* executor,
1880                                 SharedBurst* burstController,
1881                                 const std::vector<OutputShape>* mainModelOutputShapes) const {
1882     VLOG(EXECUTION) << "next: " << *step;
1883     controller->mNextStepIndex = step->gotoStepIndex;
1884     return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1885 }
1886 
makeStepExecutor(bool reusable,ExecutionBuilder * executionBuilder) const1887 std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
1888         bool reusable, ExecutionBuilder* executionBuilder) const {
1889     auto simpleBody = simple();
1890     auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
1891                                                    simpleBody->mDevice, simpleBody->mPreparedModel,
1892                                                    reusable);
1893     executor->mapInputsAndOutputsTrivially();
1894     return executor;
1895 }
1896 
becomeCompoundIfEmpty()1897 void ExecutionPlan::becomeCompoundIfEmpty() {
1898     CHECK(mState != SIMPLE);
1899     if (mState == EMPTY) {
1900         mBody = new CompoundBody(this);
1901         mState = COMPOUND;
1902     }
1903 }
1904 
createNewExecutionStep(uint32_t sourceModelIndex,const std::shared_ptr<Device> device)1905 ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
1906                                                      const std::shared_ptr<Device> device) {
1907     becomeCompoundIfEmpty();
1908     auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
1909                                               compound()->mSteps.size(), sourceModelIndex, device);
1910     compound()->mSteps.push_back(step);
1911     return step->executionStep();
1912 }
1913 
createNewIfStep()1914 IfStep* ExecutionPlan::createNewIfStep() {
1915     becomeCompoundIfEmpty();
1916     auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
1917     step->ifStep()->index = compound()->mSteps.size();
1918     compound()->mSteps.push_back(step);
1919     return step->ifStep();
1920 }
1921 
createNewWhileStep()1922 WhileStep* ExecutionPlan::createNewWhileStep() {
1923     becomeCompoundIfEmpty();
1924     auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
1925     step->whileStep()->index = compound()->mSteps.size();
1926     compound()->mSteps.push_back(step);
1927     return step->whileStep();
1928 }
1929 
createNewGotoStep()1930 GotoStep* ExecutionPlan::createNewGotoStep() {
1931     becomeCompoundIfEmpty();
1932     auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
1933     step->gotoStep()->index = compound()->mSteps.size();
1934     compound()->mSteps.push_back(step);
1935     return step->gotoStep();
1936 }
1937 
becomeSingleStep(const std::shared_ptr<Device> device,const ModelBuilder * model)1938 void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
1939                                      const ModelBuilder* model) {
1940     CHECK(mState == EMPTY);
1941     mBody = new SimpleBody(device, model, mCacheInfo, mToken);
1942     mState = SIMPLE;
1943 }
1944 
recordOutputDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1945 void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1946     auto [it, isNew] =
1947             compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1948     CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
1949                  << toString(sourceOperandIndex) << " already defined by step " << it->second;
1950 }
1951 
recordTemporaryDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1952 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1953     auto [it, isNew] =
1954             compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1955     CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1956                  << toString(sourceOperandIndex) << " already defined by step " << it->second;
1957 }
1958 
dump() const1959 void ExecutionPlan::dump() const {
1960     if (mBody) {
1961         mBody->dump();
1962     } else {
1963         VLOG(COMPILATION) << "EMPTY";
1964     }
1965 }
1966 
reset()1967 void ExecutionPlan::reset() {
1968     if (mBody) {
1969         delete mBody;
1970         mBody = nullptr;
1971     }
1972     mState = EMPTY;
1973 }
1974 
isSimpleCpu() const1975 bool ExecutionPlan::isSimpleCpu() const {
1976     return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
1977 }
1978 
forTest_getKind() const1979 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1980     switch (mState) {
1981         case EMPTY:
1982             return Kind::EMPTY;
1983         case SIMPLE:
1984             CHECK(mBody);
1985             return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1986         case COMPOUND:
1987             CHECK(mBody);
1988             return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1989         default:
1990             LOG(FATAL) << "unexpected state";
1991             return Kind::ERROR;
1992     }
1993 }
1994 
forTest_simpleGetDevice() const1995 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
1996     return simple()->mDevice;
1997 }
1998 
forTest_compoundGetSteps() const1999 const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
2000     return compound()->mSteps;
2001 }
2002 
forTest_flatGetDynamicTemporaries() const2003 std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
2004     CHECK_EQ(getSourceModels().size(), size_t(1));
2005     std::set<uint32_t> ret;
2006     forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
2007         ret.insert(dynTemp.second);
2008     });
2009     return ret;
2010 }
2011 
hasDynamicTemporaries() const2012 bool ExecutionPlan::hasDynamicTemporaries() const {
2013     return mBody == nullptr ? false : mBody->hasDynamicTemporaries();
2014 }
2015 
forTest_hasStepModelWithNoInputsOrNoOutputs() const2016 bool ExecutionPlan::forTest_hasStepModelWithNoInputsOrNoOutputs() const {
2017     return mBody == nullptr ? false : mBody->hasStepModelWithNoInputsOrNoOutputs();
2018 }
2019 
hasStepModelWithNoInputsOrNoOutputs() const2020 bool ExecutionPlan::CompoundBody::hasStepModelWithNoInputsOrNoOutputs() const {
2021     return std::any_of(mSteps.begin(), mSteps.end(), [](const auto& logicalStep) {
2022         const ExecutionStep* step = logicalStep->tryExecutionStep();
2023         return step != nullptr && step->hasNoInputsOrNoOutputs();
2024     });
2025 }
2026 
forTest_simpleGetCacheToken() const2027 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
2028     return simple()->mToken.getCacheToken();
2029 }
2030 
dump() const2031 void ExecutionPlan::SimpleBody::dump() const {
2032     VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
2033 }
2034 
dump() const2035 void ExecutionPlan::CompoundBody::dump() const {
2036     for (const auto& step : mSteps) {
2037         step->dump();
2038     }
2039 }
2040 
getInputSourceOperand(uint32_t index) const2041 SourceOperandIndex ExecutionPlan::getInputSourceOperand(uint32_t index) const {
2042     const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
2043     CHECK_LT(index, mainModel->inputCount());
2044     const auto operandIndex = mainModel->getInputOperandIndex(index);
2045     return {kMainModelInSourceModels, operandIndex};
2046 }
2047 
getOutputSourceOperand(uint32_t index) const2048 SourceOperandIndex ExecutionPlan::getOutputSourceOperand(uint32_t index) const {
2049     const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
2050     CHECK_LT(index, mainModel->outputCount());
2051     const auto operandIndex = mainModel->getOutputOperandIndex(index);
2052     return {kMainModelInSourceModels, operandIndex};
2053 }
2054 
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2055 void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
2056                                                        const StepRoleCallback& callback) const {
2057     callback(mPreparedModel.get(), IOType::INPUT, index);
2058 }
2059 
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2060 void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
2061                                                         const StepRoleCallback& callback) const {
2062     callback(mPreparedModel.get(), IOType::OUTPUT, index);
2063 }
2064 
2065 // Map an input role of the main model to the input/output roles in the step models.
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2066 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
2067                                                          const StepRoleCallback& callback) const {
2068     const auto sourceOperandIndex = mPlan->getInputSourceOperand(index);
2069     forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2070 }
2071 
2072 // Map an output role of the main model to the input/output roles in the step models.
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2073 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
2074                                                           const StepRoleCallback& callback) const {
2075     const auto sourceOperandIndex = mPlan->getOutputSourceOperand(index);
2076     forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2077 }
2078 
forEachStepRoleOfSourceOperand(const SourceOperandIndex & index,const StepRoleCallback & callback) const2079 void ExecutionPlan::CompoundBody::forEachStepRoleOfSourceOperand(
2080         const SourceOperandIndex& index, const StepRoleCallback& callback) const {
2081     const auto it = mSourceOperandToStepRoles.find(index);
2082     if (it == mSourceOperandToStepRoles.end()) return;
2083     for (const auto& [stepIndex, type, ioIndex] : it->second) {
2084         CHECK_LT(stepIndex, mSteps.size());
2085         const auto* step = mSteps[stepIndex]->executionStep();
2086         callback(step->getPreparedStepModel().get(), type, ioIndex);
2087     }
2088 }
2089 
getMemoryPreference(IOType type,uint32_t index) const2090 MemoryPreference ExecutionPlan::getMemoryPreference(IOType type, uint32_t index) const {
2091     CHECK(mState == SIMPLE || mState == COMPOUND);
2092     if (mState == SIMPLE) {
2093         return simple()->mPreparedModel->getMemoryPreference();
2094     } else {
2095         const auto sourceOperandIndex = type == IOType::INPUT ? getInputSourceOperand(index)
2096                                                               : getOutputSourceOperand(index);
2097         return compound()->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
2098     }
2099 }
2100 
getMemoryPreferenceOfSourceOperand(const SourceOperandIndex & index) const2101 MemoryPreference ExecutionPlan::CompoundBody::getMemoryPreferenceOfSourceOperand(
2102         const SourceOperandIndex& index) const {
2103     uint32_t alignment = kMinMemoryAlignment, padding = kMinMemoryPadding;
2104     forEachStepRoleOfSourceOperand(
2105             index, [&alignment, &padding](const auto* preparedModel, IOType, uint32_t) {
2106                 const auto preference = preparedModel->getMemoryPreference();
2107                 alignment = std::max(alignment, preference.alignment);
2108                 padding = std::max(padding, preference.padding);
2109             });
2110     return {alignment, padding};
2111 }
2112 
forEachDynamicTemporary(const std::function<void (SourceOperandIndex,const Operand &,uint32_t definingStepIndex)> & fn) const2113 void ExecutionPlan::forEachDynamicTemporary(
2114         const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>&
2115                 fn) const {
2116     if (mState != COMPOUND) {
2117         return;
2118     }
2119 
2120     for (const auto& logicalStep : compound()->mSteps) {
2121         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
2122             const uint32_t stepIndex = step->getIndex();
2123             const uint32_t sourceModelIndex = step->getSourceModelIndex();
2124             for (const auto& entry : step->getTempsAsStepModelOutputs()) {
2125                 const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
2126                 const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
2127                 if (hasUnknownSize(sourceOperand)) {
2128                     fn(sourceOperandIndex, sourceOperand, stepIndex);
2129                 }
2130             }
2131         }
2132     }
2133 }
2134 
partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const OptionalTimePoint & deadline,ExecutionPlan * plan,const std::vector<TokenValuePair> & metaData,int simulateFailureResultCode) const2135 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
2136                                    uint32_t preference, uint32_t priority,
2137                                    const OptionalTimePoint& deadline, ExecutionPlan* plan,
2138                                    const std::vector<TokenValuePair>& metaData,
2139                                    int simulateFailureResultCode) const {
2140     uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
2141     NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
2142                                                 deadline, plan));
2143     int n = plan->finish(preference, priority, deadline, metaData, simulateFailureResultCode);
2144     if (VLOG_IS_ON(COMPILATION)) {
2145         VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
2146         logModelToInfo(makeModel());
2147         plan->dump();
2148     }
2149     return n;
2150 }
2151 
partitionTheWorkInternal(uint32_t sourceModelIndex,const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const OptionalTimePoint & deadline,ExecutionPlan * plan) const2152 int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
2153                                            const std::vector<std::shared_ptr<Device>>& devices,
2154                                            uint32_t preference, uint32_t priority,
2155                                            const OptionalTimePoint& deadline,
2156                                            ExecutionPlan* plan) const {
2157     // This function uses a heuristic approach to partitioning the graph.
2158     // It should be good enough for the first release.
2159 
2160     SourceModels* sourceModels = &plan->getSourceModels();
2161     const size_t deviceCount = devices.size();
2162     const size_t operationCount = mOperations.size();
2163 
2164     VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
2165                       << "sourceModelIndex = " << sourceModelIndex << ", "
2166                       << "deviceCount = " << deviceCount << ", "
2167                       << "operationCount = " << operationCount;
2168 
2169     // Figure out where each operation will best execute.
2170     // The value of the vector is the index in the devices vector.
2171     std::vector<int> bestDeviceForOperation(operationCount);
2172     NN_RETURN_IF_ERROR(
2173             findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
2174 
2175     // A special value produced by findBestDeviceForEachOperation meaning that
2176     // this is a control flow operation scheduled for interpreted execution
2177     // (see LogicalStep).
2178     const int kControlFlowInterpreter = deviceCount;
2179 
2180     // If one device will run all the operations, we don't need to split the
2181     // work. This shortcut does not apply when recursively partitioning
2182     // referenced models because our plan representation is flat.
2183     if (sourceModelIndex == kMainModelInSourceModels &&
2184         std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
2185                            std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
2186         const int bestDeviceIndex = bestDeviceForOperation[0];
2187         // Bypass the partitioning process unless the only operation is a
2188         // control flow operation scheduled for interpreted execution.
2189         if (bestDeviceIndex != kControlFlowInterpreter) {
2190             VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
2191                               << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
2192             plan->becomeSingleStep(devices[bestDeviceIndex], this);
2193             return ANEURALNETWORKS_NO_ERROR;
2194         }
2195     }
2196 
2197     // No easy solution, we need to split the work.
2198 
2199     // We keep track of the operations that are ready to run for each device.
2200     // perDeviceQueue[deviceCount] is for interpreted execution of control flow
2201     // (see LogicalStep).
2202     std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
2203 
2204     // This helper function produces a device name.
2205     auto deviceName = [&devices, kControlFlowInterpreter,
2206                        deviceCount](int deviceIndex) -> std::string {
2207         if (deviceIndex == kControlFlowInterpreter) {
2208             return "NNAPI";
2209         } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
2210             return "{unknown}";
2211         } else {
2212             return devices.at(deviceIndex)->getName();
2213         }
2214     };
2215 
2216     // This helper function enqueues the operation on the appropriate queue.
2217     auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
2218         int deviceIndex = bestDeviceForOperation[operationIndex];
2219         perDeviceQueue[deviceIndex].push(operationIndex);
2220         VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
2221                           << deviceIndex << " (" << deviceName(deviceIndex) << ")";
2222     };
2223 
2224     // This helper function finds a device that has operations ready to process.
2225     // We start by looking at the control flow queue, and then look at the
2226     // devices in reverse order (i.e., starting at the end of the devices
2227     // vector). Earlier devices have a chance to prepare more of the inputs
2228     // required by other devices. This function returns -1 if all queues are
2229     // empty.
2230     auto findNextDeviceToProcess = [&]() -> int {
2231         for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
2232             if (!perDeviceQueue[i].empty()) {
2233                 return i;
2234             }
2235         }
2236         return -1;
2237     };
2238 
2239     OperandTracker tracker(this, enqueueOnAppropriateDevice);
2240     // For each iteration of this loop, we'll create either an execution step or
2241     // an interpreted control flow construct (including nested execution steps
2242     // and interpreted control flow constructs).
2243     while (true) {
2244         // Find the device we'll do this step for.
2245         int deviceIndex = findNextDeviceToProcess();
2246         VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
2247                           << deviceName(deviceIndex) << ")";
2248         if (deviceIndex < 0) {
2249             break;
2250         }
2251 
2252         // Assign as much as possible to this device.
2253         auto& queue = perDeviceQueue[deviceIndex];
2254         if (deviceIndex != kControlFlowInterpreter) {
2255             ExecutionStep* step =
2256                     plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
2257             while (!queue.empty()) {
2258                 uint32_t operationIndex = queue.front();
2259                 queue.pop();
2260                 int n = step->addOperation(operationIndex);
2261                 if (n != ANEURALNETWORKS_NO_ERROR) {
2262                     LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
2263                     return n;
2264                 }
2265                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
2266             }
2267         } else {
2268             while (!queue.empty()) {
2269                 uint32_t operationIndex = queue.front();
2270                 queue.pop();
2271                 const Operation& operation = getOperation(operationIndex);
2272                 if (operation.type == OperationType::IF) {
2273                     namespace op = operation_if;
2274                     const Operand& thenOperand =
2275                             getOperand(operation.inputs[op::kThenModelOperand]);
2276                     const Operand& elseOperand =
2277                             getOperand(operation.inputs[op::kElseModelOperand]);
2278                     const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2279                     const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2280                     uint32_t thenModelIndex = sourceModels->addModel(thenModel);
2281                     uint32_t elseModelIndex = sourceModels->addModel(elseModel);
2282 
2283                     // Emits the following:
2284                     // Index  Step
2285                     //   i    if then=(i + 1) else=(j + 1)
2286                     //  ...   (then model steps)
2287                     //   j    goto k
2288                     //  ...   (else model steps)
2289                     //   k    (steps after the IF)
2290                     IfStep* ifStep = plan->createNewIfStep();
2291                     ifStep->conditionOperandIndex = SourceOperandIndex(
2292                             sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
2293                     ifStep->thenStepIndex = plan->getNextStepIndex();
2294                     NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
2295                             thenModelIndex, devices, preference, priority, deadline, plan));
2296                     GotoStep* afterThenBranch = plan->createNewGotoStep();
2297                     ifStep->elseStepIndex = plan->getNextStepIndex();
2298                     NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
2299                             elseModelIndex, devices, preference, priority, deadline, plan));
2300                     afterThenBranch->gotoStepIndex = plan->getNextStepIndex();
2301 
2302                     // Outer model operands.
2303                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
2304                         ifStep->outerInputOperands.emplace_back(sourceModelIndex,
2305                                                                 operation.inputs[i]);
2306                     }
2307                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
2308                         ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
2309                                                                  operation.outputs[i]);
2310                     }
2311                     // Then model operands.
2312                     for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
2313                         ifStep->thenBranchInputOperands.emplace_back(
2314                                 thenModelIndex, thenModel->getInputOperandIndex(i));
2315                     }
2316                     for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
2317                         ifStep->thenBranchOutputOperands.emplace_back(
2318                                 thenModelIndex, thenModel->getOutputOperandIndex(i));
2319                     }
2320                     // Else model operands.
2321                     for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
2322                         ifStep->elseBranchInputOperands.emplace_back(
2323                                 elseModelIndex, elseModel->getInputOperandIndex(i));
2324                     }
2325                     for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
2326                         ifStep->elseBranchOutputOperands.emplace_back(
2327                                 elseModelIndex, elseModel->getOutputOperandIndex(i));
2328                     }
2329                 } else if (operation.type == OperationType::WHILE) {
2330                     namespace op = operation_while;
2331                     const Operand& condOperand =
2332                             getOperand(operation.inputs[op::kCondModelOperand]);
2333                     const Operand& bodyOperand =
2334                             getOperand(operation.inputs[op::kBodyModelOperand]);
2335                     const ModelBuilder* condModel = getReferencedModel(condOperand);
2336                     const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2337                     uint32_t condModelIndex = sourceModels->addModel(condModel);
2338                     uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);
2339 
2340                     // Emits the following:
2341                     // Index  Step
2342                     //   i    while cond=(i + 1) body=(j + 1) exit=(k + 1)
2343                     //  ...   (cond model steps)
2344                     //   j    goto i
2345                     //  ...   (body model steps)
2346                     //   k    goto i
2347                     //  ...   (steps after the WHILE)
2348                     //
2349                     //  Note that WhileStep has WhileState associated with it.
2350                     WhileStep* whileStep = plan->createNewWhileStep();
2351                     whileStep->condStepIndex = plan->getNextStepIndex();
2352                     NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
2353                             condModelIndex, devices, preference, priority, deadline, plan));
2354                     GotoStep* afterCond = plan->createNewGotoStep();
2355                     afterCond->gotoStepIndex = whileStep->index;
2356                     whileStep->bodyStepIndex = plan->getNextStepIndex();
2357                     NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
2358                             bodyModelIndex, devices, preference, priority, deadline, plan));
2359                     GotoStep* afterBody = plan->createNewGotoStep();
2360                     afterBody->gotoStepIndex = whileStep->index;
2361                     whileStep->exitStepIndex = plan->getNextStepIndex();
2362 
2363                     // Outer model operands.
2364                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
2365                         whileStep->outerInputOperands.emplace_back(sourceModelIndex,
2366                                                                    operation.inputs[i]);
2367                     }
2368                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
2369                         whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
2370                                                                     operation.outputs[i]);
2371                     }
2372                     // Cond model operands.
2373                     for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
2374                         whileStep->condInputOperands.emplace_back(
2375                                 condModelIndex, condModel->getInputOperandIndex(i));
2376                     }
2377                     whileStep->condOutputOperand =
2378                             SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
2379                     // Body model operands.
2380                     for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
2381                         whileStep->bodyInputOperands.emplace_back(
2382                                 bodyModelIndex, bodyModel->getInputOperandIndex(i));
2383                     }
2384                     for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
2385                         whileStep->bodyOutputOperands.emplace_back(
2386                                 bodyModelIndex, bodyModel->getOutputOperandIndex(i));
2387                     }
2388                 } else {
2389                     CHECK(false) << operation.type << " is not a control flow operation";
2390                 }
2391                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
2392             }
2393         }
2394     }
2395     return ANEURALNETWORKS_NO_ERROR;
2396 }
2397 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device) const2398 float ModelBuilder::getPerformance(uint32_t preference,
2399                                    const std::shared_ptr<Device> device) const {
2400     // Note that we will call this method multiple times per compilation with
2401     // the same arguments if there are nested control flow operations and we
2402     // decide to execute the outer operation on the ExecutionPlan::next()
2403     // interpreter.
2404     //
2405     // This is a potential compilation performance problem. To work around it,
2406     // the performance value could be cached for the duration of a compilation.
2407     float perf = 0;
2408     const size_t operationCount = mOperations.size();
2409     for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
2410         perf += getPerformance(preference, device, operationIndex);
2411     }
2412     return perf;
2413 }
2414 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device,uint32_t operationIndex) const2415 float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
2416                                    uint32_t operationIndex) const {
2417     auto applyPreference = [preference](const Capabilities::PerformanceInfo& perf) {
2418         return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
2419     };
2420 
2421     const Operation& operation = getOperation(operationIndex);
2422 
2423     if (operation.type == OperationType::IF) {
2424         namespace op = operation_if;
2425         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
2426         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
2427         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2428         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2429         return applyPreference(device->getIfPerformance()) +
2430                0.5 * (thenModel->getPerformance(preference, device) +
2431                       elseModel->getPerformance(preference, device));
2432     }
2433 
2434     if (operation.type == OperationType::WHILE) {
2435         namespace op = operation_while;
2436         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
2437         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
2438         const ModelBuilder* condModel = getReferencedModel(condOperand);
2439         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2440         return applyPreference(device->getWhilePerformance()) +
2441                condModel->getPerformance(preference, device) +
2442                bodyModel->getPerformance(preference, device);
2443     }
2444 
2445     // TODO This assumes that the type is dictated by the first operand. This is
2446     // currently the case but is not a safe assumption to make in the long term.
2447     const uint32_t operandIndex = operation.inputs[0];
2448     const OperandType operandType = mOperands[operandIndex].type;
2449     switch (operandType) {
2450         case OperandType::FLOAT32:
2451             if (mRelaxComputationFloat32toFloat16) {
2452                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
2453             }
2454             break;
2455         case OperandType::TENSOR_FLOAT32:
2456             if (mRelaxComputationFloat32toFloat16) {
2457                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
2458             }
2459             break;
2460         default:
2461             break;
2462     }
2463 
2464     return applyPreference(device->getPerformance(operandType));
2465 }
2466 
isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const2467 bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
2468     auto containsUnknownSize = [](const ModelBuilder* model,
2469                                   const std::vector<uint32_t>& operandIndexes) {
2470         for (uint32_t operandIndex : operandIndexes) {
2471             if (hasUnknownSize(model->getOperand(operandIndex))) {
2472                 return true;
2473             }
2474         }
2475         return false;
2476     };
2477 
2478     const Operation& operation = getOperation(operationIndex);
2479 
2480     if (operation.type == OperationType::IF) {
2481         namespace op = operation_if;
2482         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
2483         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
2484         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2485         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2486         return containsUnknownSize(this, operation.inputs) ||
2487                containsUnknownSize(this, operation.outputs) ||
2488                containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
2489                containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
2490                containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
2491                containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
2492     }
2493 
2494     if (operation.type == OperationType::WHILE) {
2495         namespace op = operation_while;
2496         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
2497         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
2498         const ModelBuilder* condModel = getReferencedModel(condOperand);
2499         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2500         return containsUnknownSize(this, operation.inputs) ||
2501                containsUnknownSize(this, operation.outputs) ||
2502                containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
2503                containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
2504                containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
2505                containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
2506     }
2507 
2508     // Not a control flow operation.
2509     return false;
2510 }
2511 
supportedByControlFlowInterpreter(uint32_t operationIndex) const2512 bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
2513     const Operation& operation = getOperation(operationIndex);
2514     return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
2515            // The partitioner does not support dynamic temporaries (b/132458982).
2516            !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
2517 }
2518 
2519 namespace {
2520 
2521 // This class determines whether a given device can execute a given operation
2522 class CanDo {
2523    public:
CanDo()2524     CanDo() {}
2525 
initialize(const MetaModel & metaModel,std::shared_ptr<Device> device)2526     void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
2527         mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
2528     }
2529 
check(size_t operationIndex) const2530     bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
2531 
2532    private:
2533     std::vector<bool> mSupportsOperationByIndex;
2534 };
2535 
2536 }  // anonymous namespace
2537 
// For each operation of this model, chooses the index (into "devices") of the
// device that should run it and stores it in
// (*bestDeviceForOperation)[operationIndex]. The sentinel value deviceCount
// (one past the last valid device index) designates a control flow operation
// to be run by the ExecutionPlan::next() interpreter. Returns
// ANEURALNETWORKS_NO_ERROR on success, or ANEURALNETWORKS_BAD_DATA if some
// operation is supported by no device.
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeModel(), DeviceManager::get()->strictSlicing());

    // Query every device once, up front, for the full set of operations it
    // supports; per-operation checks below are then simple table lookups.
    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this operation.
        int bestChoice = -1;

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    // On an exact performance tie, prefer the CPU device.
                    const bool deviceIsPreferred = (device == DeviceManager::getCpuDevice());
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && deviceIsPreferred)) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                    }
                } else {
                    // Somewhat noisy logging, but only place where the user of NNAPI can get
                    // feedback on why an operation was not run on a specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically deviceCount is
                    // very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << operation.type << ":" << operationIndex;
                }
            }
        }

        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do operation " << operation.type;
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
            const int kControlFlowInterpreter = deviceCount;
            (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = -1 (NNAPI)";
        } else {
            (*bestDeviceForOperation)[operationIndex] = bestChoice;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = " << bestChoice << " ("
                              << devices[bestChoice]->getName() << ")";
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
2612 
2613 }  // namespace nn
2614 }  // namespace android
2615