1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "ExecutionPlan"
18
19 #include "ExecutionPlan.h"
20
21 #include <ControlFlow.h>
22 #include <CpuExecutor.h>
23 #include <GraphDump.h>
24 #include <LegacyUtils.h>
25 #include <MetaModel.h>
26 #include <OperationsUtils.h>
27 #include <TokenHasher.h>
28 #include <Tracing.h>
29 #include <android-base/logging.h>
30 #include <fcntl.h>
31 #include <nnapi/IBurst.h>
32 #include <sys/stat.h>
33 #include <sys/types.h>
34
35 #include <algorithm>
36 #include <functional>
37 #include <map>
38 #include <memory>
39 #include <mutex>
40 #include <queue>
41 #include <set>
42 #include <string>
43 #include <type_traits>
44 #include <unordered_set>
45 #include <utility>
46 #include <vector>
47
48 #include "BurstBuilder.h"
49 #include "CompilationBuilder.h"
50 #include "ExecutionBuilder.h"
51 #include "ExecutionCallback.h"
52 #include "Manager.h"
53 #include "ModelBuilder.h"
54 #include "TypeManager.h"
55
56 namespace android {
57 namespace nn {
58
59 namespace {
60
61 // The index of the main model in SourceModels.
62 constexpr uint32_t kMainModelInSourceModels = 0;
63
64 constexpr uint32_t kNoPadding = 1;
65
updateTokenFromMetaData(TokenHasher * token,const std::vector<TokenValuePair> & metaData)66 static bool updateTokenFromMetaData(TokenHasher* token,
67 const std::vector<TokenValuePair>& metaData) {
68 // Combines the TokenValuePair and corresponding extension name.
69 std::vector<std::tuple<const char*, uint16_t, const uint8_t*, size_t>> metaDataWithExtension;
70 for (auto p : metaData) {
71 uint16_t prefix = static_cast<uint32_t>(p.token) >> kExtensionTypeBits;
72 uint16_t extensionEnum = static_cast<uint32_t>(p.token) & kTypeWithinExtensionMask;
73 const Extension* extension;
74 if (!TypeManager::get()->getExtensionInfo(prefix, &extension)) {
75 LOG(ERROR) << "Prefix " << prefix << " could not be found";
76 return false;
77 }
78 metaDataWithExtension.push_back(std::make_tuple(extension->name.c_str(), extensionEnum,
79 p.value.data(), p.value.size()));
80 }
81 // Sort with extension name and extension enum.
82 std::sort(metaDataWithExtension.begin(), metaDataWithExtension.end(),
83 [](const auto& a, const auto& b) {
84 if (int r = strcmp(std::get<0>(a), std::get<0>(b))) {
85 return r < 0;
86 } else {
87 return std::get<1>(a) < std::get<1>(b);
88 }
89 });
90 // Update the cache token with the sorted array.
91 for (auto [extensionName, extensionEnum, value, valueSize] : metaDataWithExtension) {
92 if (!token->updateFromString(extensionName) ||
93 !token->update(&extensionEnum, sizeof(uint16_t)) || !token->update(value, valueSize)) {
94 return false;
95 }
96 }
97 return true;
98 }
99
100 // Compiles the model on device.
101 // If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
102 // been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
103 // operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
104 // device name, device version string, and the execution preference in this function.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
            int compilationPriority, const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            TokenHasher* token, const std::vector<TokenValuePair>& metaData,
            std::shared_ptr<RuntimePreparedModel>* preparedModel) {
    CHECK(token != nullptr);
    CHECK(preparedModel != nullptr);
    *preparedModel = nullptr;

    // Try to finalize the cache token. A token is produced only if the device
    // supports caching AND every hash update in the chain below succeeds
    // (short-circuit evaluation stops at the first failure); otherwise
    // compilation proceeds without caching (cacheToken stays nullopt).
    std::optional<CacheToken> cacheToken;
    if (device.isCachingSupported() && token->ok() &&
        token->updateFromString(device.getName().c_str()) &&
        token->updateFromString(device.getVersionString().c_str()) &&
        token->update(&executionPreference, sizeof(executionPreference)) &&
        token->update(&compilationPriority, sizeof(compilationPriority)) &&
        updateTokenFromMetaData(token, metaData) && token->finish()) {
        // Copy the finished hash into a fixed-size CacheToken.
        cacheToken = CacheToken{};
        const uint8_t* tokenPtr = token->getCacheToken();
        std::copy(tokenPtr, tokenPtr + cacheToken->size(), cacheToken->begin());
    }

    // Convert the runtime-level arguments to canonical types and hand the
    // model to the device for preparation.
    const ModelFactory makeModel = [&model] { return model.makeModel(); };
    const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
    const Priority priority = convertToCanonicalPriority(compilationPriority);
    std::vector<ExtensionNameAndPrefix> extensionNameAndPrefix =
            TypeManager::get()->getExtensionNameAndPrefix(metaData);
    const auto [n, returnedPreparedModel] =
            device.prepareModel(makeModel, preference, priority, deadline, cacheInfo, cacheToken,
                                metaData, extensionNameAndPrefix);
    *preparedModel = returnedPreparedModel;
    return n;
}
136
137 typedef std::function<void(uint32_t)> OperationReadyCallback;
138
copyOperandExtraParams(ModelBuilder & model,uint32_t toOperandIndex,const Operand & fromOperand)139 int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
140 const Operand& fromOperand) {
141 if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
142 std::holds_alternative<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams)) {
143 auto& fromChannelQuant =
144 std::get<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams);
145 ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
146 .channelDim = fromChannelQuant.channelDim,
147 .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
148 .scales = fromChannelQuant.scales.data(),
149 };
150 return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
151 } else if (isExtension(fromOperand.type) &&
152 std::holds_alternative<Operand::ExtensionParams>(fromOperand.extraParams)) {
153 auto extensionData = std::get<Operand::ExtensionParams>(fromOperand.extraParams);
154 return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
155 extensionData.size());
156 } else if (!std::holds_alternative<Operand::NoParams>(fromOperand.extraParams) ||
157 fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
158 LOG(ERROR) << "Type " << fromOperand.type
159 << " has an unexpected extraParams variant: " << fromOperand.extraParams.index();
160 return ANEURALNETWORKS_BAD_DATA;
161 } else {
162 return ANEURALNETWORKS_NO_ERROR;
163 }
164 }
165
166 // This class tracks whether we know the value of an operand as operations
167 // are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away (no inputs produced by other operations) and calls
    // cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. The outputs of
    // the operation now being known, this may make new operations able to
    // run. Calls cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    // Maps an operand index to the indexes of the operations consuming it.
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    // Number of not-yet-known inputs, for each operation.
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
183
OperandTracker(const ModelBuilder * model,OperationReadyCallback cb)184 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
185 : mModel(model) {
186 const auto& operations = mModel->getOperations();
187 mUnknownInputCount.resize(operations.size());
188 for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
189 const Operation& operation = operations[operationIndex];
190 uint32_t count = 0;
191 for (uint32_t operandIndex : operation.inputs) {
192 auto lifetime = mModel->getOperand(operandIndex).lifetime;
193 if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
194 lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
195 count++;
196 mOperandToOperations.emplace(operandIndex, operationIndex);
197 }
198 }
199 if (count == 0) {
200 cb(operationIndex);
201 }
202 mUnknownInputCount[operationIndex] = count;
203 }
204 }
205
markProcessed(uint32_t operationIndex,OperationReadyCallback cb)206 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
207 // Mark all its outputs as known.
208 const Operation& operation = mModel->getOperations()[operationIndex];
209 for (uint32_t operandIndex : operation.outputs) {
210 auto range = mOperandToOperations.equal_range(operandIndex);
211 for (auto i = range.first; i != range.second; i++) {
212 uint32_t& count = mUnknownInputCount[i->second];
213 if (--count == 0) {
214 cb(i->second);
215 }
216 }
217 }
218 }
219
addTemporary(uint32_t * totalSizeOfTemporaries,uint32_t size,uint32_t alignment,uint32_t padding)220 StaticTemporaryLocation addTemporary(uint32_t* totalSizeOfTemporaries, uint32_t size,
221 uint32_t alignment, uint32_t padding) {
222 // TODO: what about overflow?
223 *totalSizeOfTemporaries = roundUp(*totalSizeOfTemporaries, alignment);
224 const uint32_t offset = *totalSizeOfTemporaries;
225 size = roundUp(size, padding);
226 *totalSizeOfTemporaries += size;
227 return {.offset = offset, .paddedLength = size};
228 };
229
toString(SourceOperandIndex sourceOperandIndex)230 std::string toString(SourceOperandIndex sourceOperandIndex) {
231 return "(" + std::to_string(sourceOperandIndex.first) + ", " +
232 std::to_string(sourceOperandIndex.second) + ")";
233 };
234
235 // A helper class to analyze the step roles of all partition boundary operands.
236 //
237 // To use, call StepRoleAnalyzer::analyze and pass in a setup function that configures the analyzer
238 // with the following two methods:
239 // - addRole: Add a step role to a boundary operand
240 // - setUsedBy: Specify that the memory of the "source" operand may be directly used by the "dest"
241 // operand. All of the step roles of the "dest" operand are also possible step roles of the
242 // "source" operand. This is useful for interpreted control flow, e.g., the outer input operand
243 // of an interpreted IF operation may be directly used as all step roles of the corresponding
244 // input operand of the then and else models. Note that this relationship is directional --
245 // (A->B && B->C) implies A->C, but (A->C && B->C) does not imply A->B or B->A (A->B is a
246 // shorthand for setUsedBy(A, B)). The setup function must guarantee that the final graph
247 // produced by the used-by relationship is acyclic. This is true for the partitioner algorithm
248 // because there must be a root operand of each step role for the memory to be allocated on
249 // behalf of.
250 //
251 class StepRoleAnalyzer {
252 public:
analyze(const std::function<void (StepRoleAnalyzer &)> & setup)253 static std::map<SourceOperandIndex, std::set<StepRole>> analyze(
254 const std::function<void(StepRoleAnalyzer&)>& setup) {
255 StepRoleAnalyzer analyzer;
256 setup(analyzer);
257 return analyzer.finish();
258 }
259
addRole(const ExecutionStep & step,uint32_t operandIndex,IOType type,uint32_t stepIOIndex)260 void addRole(const ExecutionStep& step, uint32_t operandIndex, IOType type,
261 uint32_t stepIOIndex) {
262 SourceOperandIndex source = {step.getSourceModelIndex(), operandIndex};
263 mRoles[source].emplace(step.getIndex(), type, stepIOIndex);
264 }
265
setUsedBy(const SourceOperandIndex & source,const SourceOperandIndex & dest)266 void setUsedBy(const SourceOperandIndex& source, const SourceOperandIndex& dest) {
267 mUsedBy[source].emplace(dest);
268 }
269
270 private:
271 StepRoleAnalyzer() = default;
272
273 // Merges the step roles of the destination operands to the source operands
274 // and returns the final map.
finish()275 std::map<SourceOperandIndex, std::set<StepRole>> finish() {
276 for (const auto& [source, _] : mUsedBy) {
277 finishHelper(source);
278 }
279 return std::move(mRoles);
280 }
281
finishHelper(SourceOperandIndex current)282 void finishHelper(SourceOperandIndex current) {
283 if (mProcessedOperands.count(current) > 0) return;
284 mProcessedOperands.insert(current);
285 const auto it = mUsedBy.find(current);
286 if (it != mUsedBy.end()) {
287 auto& roles = mRoles[current];
288 // Merge the step roles of the destination operands.
289 for (const auto& dest : it->second) {
290 finishHelper(dest);
291 const auto& destRoles = mRoles[dest];
292 roles.insert(destRoles.begin(), destRoles.end());
293 }
294 }
295 }
296
297 // A map from the source operand to its step roles.
298 std::map<SourceOperandIndex, std::set<StepRole>> mRoles;
299 // A map from the source operand to a set of destination operands that may directly
300 // use the memory of the source operand.
301 std::map<SourceOperandIndex, std::set<SourceOperandIndex>> mUsedBy;
302 // Used in finish to track which operand has been processed.
303 std::set<SourceOperandIndex> mProcessedOperands;
304 };
305
306 } // namespace
307
vlogDump(const char * context) const308 void DynamicTemporaries::vlogDump(const char* context) const {
309 if (empty()) {
310 return;
311 }
312 if (context) {
313 VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
314 }
315 for (const auto& temp : mSourceOperandToTemporary) {
316 VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
317 << ", stepIndex = " << temp.second.stepIndex
318 << ", offset = " << temp.second.offset
319 << ", dimensions = " << toString(temp.second.dimensions)
320 << ", paddedLength = " << temp.second.paddedLength
321 << ", alignment = " << temp.second.alignment
322 << ", padding = " << temp.second.padding;
323 }
324 }
325
declare(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex,const Dimensions & initialDimensions,uint32_t initialLength,uint32_t alignment,uint32_t padding)326 void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
327 const Dimensions& initialDimensions, uint32_t initialLength,
328 uint32_t alignment, uint32_t padding) {
329 VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
330 << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
331 << ", initialDimensions = " << toString(initialDimensions)
332 << ", initialLength = " << initialLength << ", alignment = " << alignment
333 << ", padding = " << padding << ")";
334 CHECK(!mDeclared);
335 CHECK_GT(initialLength, 0u);
336 const uint32_t paddedLength = roundUp(initialLength, padding);
337 auto [_, isNew] = mSourceOperandToTemporary.emplace(
338 sourceOperandIndex, InternalLocationAndShape{stepIndex, 0, initialDimensions,
339 paddedLength, alignment, padding});
340 CHECK(isNew);
341 mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
342 }
343
redeclare(SourceOperandIndex sourceOperandIndex,const Dimensions & newDimensions,uint32_t newLength)344 bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
345 const Dimensions& newDimensions, uint32_t newLength) {
346 auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
347 VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
348 << toString(sourceOperandIndex)
349 << ", newDimensions = " << toString(newDimensions)
350 << ", newLength = " << newLength << ") -> " << toString(changedShape);
351 return changedShape;
352 };
353
354 CHECK(mDeclared);
355 CHECK_GT(newLength, 0u);
356
357 InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
358 const uint32_t paddedLength = roundUp(newLength, temp.padding);
359 if (temp.paddedLength == paddedLength && temp.dimensions == newDimensions) {
360 return createAndLogResult(false);
361 }
362 if (temp.paddedLength < paddedLength) {
363 // Otherwise allocation remains valid, even if it may be suboptimal
364 // (because it uses more space than needed). Use case: Don't force
365 // client to allocate again just because the client reported more
366 // accurate shape information.
367 mAllocatedStepIndexes.erase(temp.stepIndex);
368 }
369 temp.paddedLength = paddedLength;
370 temp.dimensions = newDimensions;
371 return createAndLogResult(true);
372 }
373
// (Re-)allocates the backing memory for all dynamic temporaries of one step:
// lays the temporaries out contiguously (aligned; lengths are already
// padded), then reuses the existing memory if it is large enough but not
// excessively oversized, otherwise allocates a fresh ashmem region.
int DynamicTemporaries::allocate(uint32_t stepIndex) {
    VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";

    CHECK(mDeclared);

    const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
    if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
        // This step has no dynamic temporaries; nothing to allocate.
        return ANEURALNETWORKS_NO_ERROR;
    }

    // perform layout
    uint32_t newSize = 0;
    for (const auto& sourceOperandIndex : sourceOperandIndexesI->second) {
        InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
        // temp.paddedLength is already padded in declare and redeclare.
        CHECK(temp.paddedLength % temp.padding == 0);
        // No extra padding here (kNoPadding): the lengths are pre-padded.
        temp.offset = addTemporary(&newSize, temp.paddedLength, temp.alignment, kNoPadding).offset;
    }

    // perform (re-)allocation
    // TODO: Today we may shrink the allocation in order to avoid wasting memory. Is this important
    // to conserve memory, or do we waste time reallocating?
    const double kWaste = 0.2 /* arbitrary */;  // Willing to waste space to avoid
                                                // deallocation/reallocation overhead
    auto& memory = mStepIndexToMemory[stepIndex];
    const uint32_t oldSize = (memory ? memory->getSize() : 0);
    if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
        // Suitable allocation already exists; nothing to do
    } else {
        int n;
        std::tie(n, memory) = MemoryAshmem::create(newSize);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
                       << " for step " << stepIndex;
            // The previous memory (if any) was replaced by the assignment
            // above, so this step can no longer be considered allocated.
            mAllocatedStepIndexes.erase(stepIndex);
            return n;
        }
    }

    mAllocatedStepIndexes.insert(stepIndex);
    return ANEURALNETWORKS_NO_ERROR;
}
416
allocated(uint32_t stepIndex) const417 bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
418 return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
419 mStepIndexToSourceOperandIndexes.end()) ||
420 mAllocatedStepIndexes.count(stepIndex);
421 }
422
lookup(SourceOperandIndex sourceOperandIndex,bool mustBeAllocated) const423 std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
424 SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
425 CHECK(mDeclared);
426 if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
427 it != mSourceOperandToTemporary.end()) {
428 const InternalLocationAndShape& temp = it->second;
429 const bool isAllocated = allocated(temp.stepIndex);
430 if (mustBeAllocated) {
431 CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
432 << " must be allocated";
433 }
434 if (isAllocated) {
435 return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
436 &temp.dimensions, temp.paddedLength};
437 } else {
438 return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.paddedLength};
439 }
440 }
441 return std::nullopt;
442 }
443
// Constructs a step that will execute a partition of source model
// |sourceModelIndex| on |device|. The step's cache token is seeded from the
// plan-level token and is further updated as operations are added (see
// addOperation).
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan),
      mIndex(stepIndex),
      mSourceModelIndex(sourceModelIndex),
      mStepModel(),
      mDevice(device),
      mToken(plan->getCacheToken()) {}
452
453 // Adds an operand if it has not been added already.
454 // Sets the index in the step model for the corresponding operand.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        // Only an INPUT may legitimately be seen again — an operand is
        // defined (written) at most once.
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model, mirroring the source operand's type,
    // dimensions, and quantization parameters.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value, or records how the operand crosses the partition
    // boundary, depending on the source operand's lifetime.
    switch (operand.lifetime) {
        case Operand::LifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
        case Operand::LifeTime::CONSTANT_REFERENCE: {
            const RuntimeMemory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
        } break;
        case Operand::LifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
        } break;
        case Operand::LifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output. It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case Operand::LifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
                // It may be an input to a different partition, so keep track of
                // it.
                mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                       mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH: {
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
        } break;
        case Operand::LifeTime::POINTER: {
            const void* data = std::get<const void*>(operand.location.pointer);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
    }
    return n;
}
554
// Adds one source-model operation (and any of its operands not yet added) to
// the step model. Also folds the (source model index, operation index) pair
// into the compilation cache token, so that the token identifies the exact
// set of operations in this partition.
int ExecutionStep::addOperation(int operationIndex) {
    const Operation& operation = getSourceModel()->getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order. Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    auto addOperands = [this](const std::vector<uint32_t>& sourceModelOperands,
                              std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);
    NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
    NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}
588
// Binds this step's model inputs and outputs to concrete memory locations or
// main-model I/O indices on |executor| before execution. Each boundary
// operand is resolved by trying, in order: a static temporary inside
// |temporaryMemory|, a dynamic temporary, a main-model input index, a
// main-model output (for step inputs: an output consumed downstream), and
// finally a constant partition-boundary reference. Failing all of these is a
// fatal error.
void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor,
        const std::vector<OutputShape>* mainModelOutputShapes, const RuntimeMemory* temporaryMemory,
        const std::map<SourceOperandIndex, StaticTemporaryLocation>&
                sourceOperandToLocationOfTemporary,
        const DynamicTemporaries& dynamicTemporaries,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            // Static temporary: fixed offset within the shared temporary pool.
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, loc.offset,
                                         loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            // Dynamic temporary: location and dimensions come from the
            // DynamicTemporaries bookkeeping.
            executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset,
                                         loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            // A main-model output that feeds this step as an input.
            executor->mapOutputToInput(it->second, stepInputIndex,
                                       mainModelOutputShapes
                                               ? &mainModelOutputShapes->at(it->second).dimensions
                                               : nullptr);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, loc.memory, loc.offset, loc.length);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, loc.offset,
                                          loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
                                          loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    // Resolve every step-model input and output in order; the pair's .first
    // is the source operand index recorded in addOperand.
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}
654
// For every main-model output that some execution step also consumes as an
// input, notifies the step that defines that output (via
// declareModelOutputIsDownstreamInput) so the step knows the output is read
// downstream.
void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
    auto declareModelOutputIsDownstreamInput =
            [this](const SourceOperandIndex& sourceOperandIndex) {
                // The operand must have a recorded defining step and a
                // main-model output index.
                const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
                CHECK(it != mOutputToDefiningExecutionStep.end());
                uint32_t stepIndex = it->second;
                CHECK_LT(stepIndex, mSteps.size());
                VLOG(COMPILATION)
                        << "ExecutionStep(" << stepIndex
                        << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
                        << toString(sourceOperandIndex) << ")";
                CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
                      mSourceOperandToOutputIndex.end());
                mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
                        mSourceOperandToOutputIndex.at(sourceOperandIndex));
            };
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& output : step->getOutputsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
                declareModelOutputIsDownstreamInput(sourceOperandIndex);
            }
        }
    }
}
680
// For every temporary consumed across a partition boundary — as a step input,
// or as the condition/input of an interpreted IF or WHILE — tells the step
// that defines the temporary to also expose it as a step-model output.
void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
    auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
        const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
        if (it == mTemporaryToDefiningExecutionStep.end()) {
            // The operand is not a temporary or is not defined by an
            // ExecutionStep (i.e. it's an output of an IF or a WHILE).
            // The latter case is handled by ExecutionPlan::makeController().
            return;
        }
        uint32_t stepIndex = it->second;
        CHECK_LT(stepIndex, mSteps.size());
        mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
    };
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& input : step->getTempsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            recordAsOutputIfTemporary(step->conditionOperandIndex);
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else {
            // The only remaining logical step kind; GOTOs reference no
            // boundary temporaries.
            CHECK(logicalStep->isGoto());
        }
    }
}
714
declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex)715 void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
716 VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
717 << mainModelOutputIndex << ")";
718 const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
719 mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
720 CHECK(it != mOutputIndexStepModelToMainModel.end());
721 const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
722 CHECK(stepModelOutputIndex < mModelOutputs.size());
723 mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
724 }
725
recordTempAsStepModelOutput(uint32_t stepOperandIndex)726 void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
727 const auto it = mOperandMap.find(stepOperandIndex);
728 CHECK(it != mOperandMap.end());
729 mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
730 }
731
getSourceModel() const732 const ModelBuilder* ExecutionStep::getSourceModel() const {
733 return mPlan->getSourceModels().getModel(mSourceModelIndex);
734 }
735
logStepModel() const736 void ExecutionStep::logStepModel() const {
737 VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
738
739 auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
740 if (!toLog.empty()) {
741 toLog += ", ";
742 }
743 toLog += toString(e.first);
744 toLog += "->";
745 toLog += toString(e.second);
746 };
747
748 auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
749 std::string toLog;
750 for (const auto& e : map) {
751 logRemapEntry(toLog, e);
752 }
753 VLOG(COMPILATION) << name << ": " << toLog;
754 };
755 auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
756 std::string toLog;
757 for (const auto& e : set) {
758 logRemapEntry(toLog, e);
759 }
760 VLOG(COMPILATION) << name << ": " << toLog;
761 };
762
763 logRemapVector("step model inputs", mStepModelInputs);
764 logRemapVector("step model outputs", mStepModelOutputs);
765 logRemapVector("model inputs", mModelInputs);
766 logRemapVector("model outputs", mModelOutputs);
767 logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
768 logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
769 logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
770 }
771
hasUnknownSize(const Operand & operand)772 static bool hasUnknownSize(const Operand& operand) {
773 if (operand.dimensions.empty()) {
774 return TypeManager::get()->isTensorType(operand.type);
775 }
776 for (const Dimension& dimension : operand.dimensions) {
777 if (dimension == 0) {
778 return true;
779 }
780 }
781 return false;
782 }
783
// Finalizes this step's model and compiles it on the assigned device:
// - flags any temp step-model output whose size is unknown at compile time
//   (sets *hasOutputOfUnknownSize to true; never resets it to false, so the
//   caller must initialize it);
// - assembles the final ordered input/output lists of the step model;
// - for steps of the main model, builds the step-index -> main-index maps;
// - identifies the step model's inputs/outputs, finishes it, and compiles.
// Returns ANEURALNETWORKS_NO_ERROR on success, otherwise an error code.
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    // Detect dynamic temporaries: temps this step outputs whose operand size
    // cannot be determined at compilation time.
    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first
                              << " of source graph) has unknown size: " << operand;
        }
    }

    // Propagate the main model's relaxed-precision setting to the step model.
    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    // Final step-model input order: model inputs first, then temps produced
    // by other steps, then main-model outputs consumed as inputs. The index
    // maps built below depend on this ordering.
    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    // Final step-model output order: model outputs first, then temps consumed
    // by other steps.
    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    // A step model with no inputs or no outputs is an invalid model. Note that we would like to
    // attempt full CPU fallback if allowed, so we return OP_FAILED here rather than BAD_DATA from
    // model validation.
    if (hasNoInputsOrNoOutputs()) {
        VLOG(COMPILATION) << "ExecutionStep::finishStepModel: finishing step model with no inputs "
                             "or no outputs";
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Only steps of the main model need mappings to main-model I/O indexes.
    if (mSourceModelIndex == kMainModelInSourceModels) {
        // Invert the main model's I/O lists: operand index -> I/O position.
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies
        // on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    // Register the assembled inputs/outputs (by step-model operand index)
    // and finish the step model before compiling it.
    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    // No deadline or metadata is forwarded here; compile caches into the
    // plan's cache info and records the prepared model and cache token.
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheInfo(),
                   &mToken, {}, &mPreparedStepModel);
}
878
dump() const879 void ExecutionStep::dump() const {
880 if (VLOG_IS_ON(COMPILATION)) {
881 VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
882 logModelToInfo(mStepModel.makeModel());
883 }
884 }
885
operator <<(std::ostream & os,const IfStep & step)886 std::ostream& operator<<(std::ostream& os, const IfStep& step) {
887 return os << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
888 << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
889 }
890
operator <<(std::ostream & os,const WhileStep & step)891 std::ostream& operator<<(std::ostream& os, const WhileStep& step) {
892 return os << "Step#" << step.index << ": while cond=" << step.condStepIndex
893 << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
894 }
895
operator <<(std::ostream & os,const GotoStep & step)896 std::ostream& operator<<(std::ostream& os, const GotoStep& step) {
897 return os << "Step#" << step.index << ": goto " << step.gotoStepIndex;
898 }
899
dump() const900 void LogicalStep::dump() const {
901 if (VLOG_IS_ON(COMPILATION)) {
902 if (const IfStep* step = tryIfStep()) {
903 VLOG(COMPILATION) << *step;
904 } else if (const WhileStep* step = tryWhileStep()) {
905 VLOG(COMPILATION) << *step;
906 } else if (const GotoStep* step = tryGotoStep()) {
907 VLOG(COMPILATION) << *step;
908 } else {
909 executionStep()->dump();
910 }
911 }
912 }
913
finish(const SourceModels * sourceModels,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)914 int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
915 int32_t executionPreference, int32_t priority,
916 const OptionalTimePoint& deadline,
917 const std::vector<TokenValuePair>& metadata,
918 int simulateFailureResultCode) {
919 CHECK(!mSuccessfulFinish);
920 CHECK(!deadline.has_value());
921 CHECK(metadata.empty());
922
923 const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);
924
925 auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
926 for (const auto& sourceOperandIndex : operands) {
927 const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
928 const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
929 if (hasUnknownSize(operand)) {
930 return true;
931 }
932 }
933 return false;
934 };
935
936 findTempsAsStepModelOutputs();
937 for (const auto& logicalStep : mSteps) {
938 if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
939 bool stepHasDynamicTemporaries = false;
940 int n = step->finishStepModel(mainModel, &stepHasDynamicTemporaries,
941 executionPreference, priority);
942 if (stepHasDynamicTemporaries) {
943 mHasDynamicTemporaries = true;
944 if (!isCompliantVersion(kHalVersionV1_2ToApi.canonical,
945 step->getDevice()->getFeatureLevel())) {
946 // Until HAL 1.2, an Operand with lifetime SUBGRAPH_OUTPUT
947 // must have fully specified dimensions either in the
948 // Operand or in the RequestArgument. In the case of a
949 // dynamic temporary, we won't be able to supply fully
950 // specified dimensions in either.
951 VLOG(COMPILATION)
952 << "ExecutionPlan::CompoundBody::finish -- step#" << step->getIndex()
953 << " defines dynamic temporaries but is scheduled on pre-1.2 device "
954 << step->getDevice()->getName();
955 if (n == ANEURALNETWORKS_NO_ERROR) {
956 n = ANEURALNETWORKS_OP_FAILED;
957 }
958 }
959 }
960 if (n != ANEURALNETWORKS_NO_ERROR) {
961 VLOG(COMPILATION)
962 << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
963 return n;
964 }
965 } else if (IfStep* step = logicalStep->tryIfStep()) {
966 // The partitioner does not support dynamic temporaries (b/132458982).
967 CHECK(!containsUnknownSize(step->outerInputOperands));
968 CHECK(!containsUnknownSize(step->outerOutputOperands));
969 // step->conditionOperandIndex has a static shape. See b/158557728.
970 CHECK(!containsUnknownSize(step->thenBranchInputOperands));
971 CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
972 CHECK(!containsUnknownSize(step->elseBranchInputOperands));
973 CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
974 } else if (WhileStep* step = logicalStep->tryWhileStep()) {
975 // The partitioner does not support dynamic temporaries (b/132458982).
976 CHECK(!containsUnknownSize(step->outerInputOperands));
977 CHECK(!containsUnknownSize(step->outerOutputOperands));
978 CHECK(!containsUnknownSize(step->condInputOperands));
979 // step->condOutputOperand has a static shape. See b/158557728.
980 CHECK(!containsUnknownSize(step->bodyInputOperands));
981 CHECK(!containsUnknownSize(step->bodyOutputOperands));
982 } else {
983 CHECK(logicalStep->isGoto());
984 }
985 }
986
987 if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
988 VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode "
989 << simulateFailureResultCode;
990 return simulateFailureResultCode;
991 }
992
993 for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
994 SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
995 mSourceOperandToInputIndex[index] = i;
996 }
997 for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
998 SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
999 mSourceOperandToOutputIndex[index] = i;
1000 }
1001
1002 findControlFlowBoundaryConstants(sourceModels);
1003 findModelOutputsThatAreDownstreamInputs();
1004 findMemoryStepRoles();
1005
1006 mSuccessfulFinish = true;
1007 LOG(INFO) << "ExecutionPlan::CompoundBody::finish: compilation finished successfully";
1008 return ANEURALNETWORKS_NO_ERROR;
1009 }
1010
findControlFlowBoundaryConstants(const SourceModels * sourceModels)1011 void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
1012 const SourceModels* sourceModels) {
1013 auto handleBoundaryConstants = [this,
1014 sourceModels](const SourceOperandIndex& sourceOperandIndex) {
1015 const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
1016 const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
1017 const DataLocation& location = operand.location;
1018 if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY) {
1019 mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
1020 .buffer = sourceModel->getPointerToOperandValue(location.offset),
1021 .length = location.length,
1022 };
1023 } else if (operand.lifetime == Operand::LifeTime::POINTER) {
1024 mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
1025 .buffer = static_cast<const uint8_t*>(std::get<const void*>(location.pointer)),
1026 .length = location.length,
1027 };
1028 } else if (operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE) {
1029 mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
1030 .memory = sourceModel->getMemories()[location.poolIndex],
1031 .offset = location.offset,
1032 .length = location.length,
1033 };
1034 }
1035 };
1036 for (const auto& logicalStep : mSteps) {
1037 if (const IfStep* step = logicalStep->tryIfStep()) {
1038 handleBoundaryConstants(step->conditionOperandIndex);
1039 for (const auto& sourceOperandIndex : step->outerInputOperands) {
1040 handleBoundaryConstants(sourceOperandIndex);
1041 }
1042 } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1043 for (const auto& sourceOperandIndex : step->outerInputOperands) {
1044 handleBoundaryConstants(sourceOperandIndex);
1045 }
1046 }
1047 }
1048 }
1049
// Computes mSourceOperandToStepRoles: for every boundary source operand, the
// set of (step, IOType, io-index) roles in which a memory bound to that
// operand may be used. ExecutionStep operands contribute direct roles;
// IF/WHILE boundary operands contribute transitively via setUsedBy, which
// propagates roles from inner-model operands to the outer operands whose
// memories they may share.
void ExecutionPlan::CompoundBody::findMemoryStepRoles() {
    mSourceOperandToStepRoles = StepRoleAnalyzer::analyze([this](StepRoleAnalyzer& analyzer) {
        for (const auto& logicalStep : mSteps) {
            if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
                // Each step model input/output is a direct role: the memory
                // bound to sourceIndex is used at position i of this step.
                const auto& stepModelInputs = step->getStepModelInputs();
                for (uint32_t i = 0; i < stepModelInputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelInputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::INPUT, i);
                }
                const auto& stepModelOutputs = step->getStepModelOutputs();
                for (uint32_t i = 0; i < stepModelOutputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelOutputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::OUTPUT, i);
                }
            } else if (const IfStep* step = logicalStep->tryIfStep()) {
                // See ExecutionPlan::nextCompound(const IfStep*, ...).
                //
                // For interpreted IF operation, the outer input memories may be directly used by
                // the SUBGRAPH_INPUTs of the then and else model.
                CHECK_EQ(step->thenBranchInputOperands.size(), step->outerInputOperands.size());
                CHECK_EQ(step->elseBranchInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->thenBranchInputOperands[i]);
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->elseBranchInputOperands[i]);
                }
                // For interpreted IF operation, the outer output memories may be directly used by
                // the SUBGRAPH_OUTPUTs of the then and else model.
                CHECK_EQ(step->thenBranchOutputOperands.size(), step->outerOutputOperands.size());
                CHECK_EQ(step->elseBranchOutputOperands.size(), step->outerOutputOperands.size());
                for (uint32_t i = 0; i < step->outerOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->thenBranchOutputOperands[i]);
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->elseBranchOutputOperands[i]);
                }
            } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
                // See ExecutionPlan::nextCompound(const WhileStep*, ...).
                //
                // For interpreted WHILE operation, the following memories are involved:
                // a. the outer input memories to the WHILE operation
                // b. the outer output memories to the WHILE operation
                // c. the output memory of the condition model
                // d. one set of output memories of the body model
                // e. another set of output memories of the body model
                //
                // The memories are used in the following ways:
                //
                // - Condition model:
                //   * In the first iteration: inputs use (a); output uses (c)
                //   * In the following iterations: inputs use (d) or (e) for input-output and
                //     state-only operands, and (a) for input-only operands; output uses (c)
                //
                // - Body model:
                //   * In all iterations: inputs are the same as the condition model; outputs use
                //     (d) or (e)
                //
                // Therefore, we configure the analyzer with the following used-by relationships:
                // - The outer input memories (a) may be directly used by the SUBGRAPH_INPUTs of
                //   the condition model for all inputs in the first iteration, as well as the
                //   input-only operands in the following iterations.
                CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i], step->condInputOperands[i]);
                }
                // - The output memories of the body model (d) and (e) may be directly used by the
                //   SUBGRAPH_INPUTs of the condition model for input-output and state-only operands
                //   after the first iteration.
                CHECK_GE(step->condInputOperands.size(), step->bodyOutputOperands.size());
                for (uint32_t i = 0; i < step->bodyOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->bodyOutputOperands[i], step->condInputOperands[i]);
                }
                // - The SUBGRAPH_INPUTs of the condition model are directly used by the
                //   SUBGRAPH_INPUTs of the body model for all inputs in all iterations.
                CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
                for (uint32_t i = 0; i < step->bodyInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->condInputOperands[i], step->bodyInputOperands[i]);
                }
            } else if (logicalStep->isGoto()) {
                // Nothing to do.
            } else {
                CHECK(false) << "Unexpected LogicalStep kind";
            }
        }
    });
}
1137
finish(const SourceModels *,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)1138 int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
1139 int32_t priority, const OptionalTimePoint& deadline,
1140 const std::vector<TokenValuePair>& metadata,
1141 int simulateFailureResultCode) {
1142 CHECK(!mSuccessfulFinish);
1143 CHECK(mDevice != nullptr);
1144 VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
1145 int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheInfo,
1146 &mToken, metadata, &mPreparedModel);
1147 if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
1148 VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
1149 << simulateFailureResultCode;
1150 n = simulateFailureResultCode;
1151 }
1152 mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
1153 if (mSuccessfulFinish) {
1154 LOG(INFO) << "ExecutionPlan::SimpleBody::finish: compilation finished successfully on "
1155 << mDevice->getName();
1156 }
1157 return n;
1158 }
1159
finish(int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)1160 int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
1161 const OptionalTimePoint& deadline,
1162 const std::vector<TokenValuePair>& metadata,
1163 int simulateFailureResultCode) {
1164 CHECK(mBody != nullptr);
1165 return mBody->finish(&getSourceModels(), executionPreference, priority, deadline, metadata,
1166 simulateFailureResultCode);
1167 }
1168
// Builds the per-execution controller: stores the operand-location maps,
// allocates one shared ashmem region for all static boundary temporaries,
// and seeds it with control flow boundary constant copies.
//
// On allocation failure, mNextStepIndex is set to kBadStepIndex so the
// execution loop fails fast.
ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
        DynamicTemporaries dynamicTemporaries)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToLocationOfTemporary(std::move(sourceOperandToLocationOfTemporary)),
      mSourceOperandToLocationOfTemporary2(std::move(sourceOperandToLocationOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mDynamicTemporaries(std::move(dynamicTemporaries)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        // Nothing to allocate or initialize.
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // Bug fix: bail out here. mTemporaries is null on failure, so the
        // constant-copy loop below would dereference a null pointer.
        return;
    }
    // Copy control flow boundary constants into their assigned slots in the
    // shared temporaries memory.
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() +
                       mSourceOperandToLocationOfTemporary[sourceOperandIndex].offset,
               location.buffer, location.length);
    }
}
1206
1207 // Attempt to create a burst object for each PreparedModel/Partition. If the
1208 // burst controller object cannot be made, return a nullptr in its place to
1209 // indicate the regular execution path should be used. This can occur either
1210 // because PreparedModel was nullptr (cpu was best choice), or because the
1211 // IPreparedModel was of insufficient version or failed to configure the burst.
makeBursts() const1212 std::vector<SharedBurst> ExecutionPlan::makeBursts() const {
1213 switch (mState) {
1214 // burst object for each partition in the compound case
1215 case COMPOUND: {
1216 std::vector<SharedBurst> bursts;
1217 bursts.reserve(compound()->mSteps.size());
1218 for (const auto& logicalStep : compound()->mSteps) {
1219 if (!logicalStep->isExecution()) {
1220 bursts.push_back(nullptr);
1221 continue;
1222 }
1223 if (const auto preparedModel =
1224 logicalStep->executionStep()->getPreparedStepModel()) {
1225 const auto maybeBurst = preparedModel->configureExecutionBurst();
1226 if (!maybeBurst.has_value()) {
1227 LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1228 << maybeBurst.error().code << ": " << maybeBurst.error().message;
1229 }
1230 bursts.push_back(maybeBurst.value_or(nullptr));
1231 } else {
1232 bursts.push_back(nullptr);
1233 }
1234 }
1235 return bursts;
1236 }
1237 // single burst object for the simple case
1238 case SIMPLE: {
1239 std::vector<SharedBurst> burst;
1240 auto simpleBody = simple();
1241 if (const auto preparedModel = simpleBody->mPreparedModel) {
1242 const auto maybeBurst = preparedModel->configureExecutionBurst();
1243 if (!maybeBurst.has_value()) {
1244 LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1245 << maybeBurst.error().code << ": " << maybeBurst.error().message;
1246 }
1247 burst.push_back(maybeBurst.value_or(nullptr));
1248 } else {
1249 burst.push_back(nullptr);
1250 }
1251 return burst;
1252 }
1253 // no burst objects made
1254 default:
1255 return {};
1256 }
1257 }
1258
makeController(ExecutionBuilder * executionBuilder,const BurstBuilder * burstBuilder) const1259 std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
1260 ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
1261 CHECK(isValid());
1262 CHECK(mState != SIMPLE);
1263 const auto* body = compound();
1264 // Create the layout for a RuntimeMemory object big enough to hold
1265 // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
1266 // - buffers required by the control flow implementation.
1267 //
1268 // TODO: Rethink this approach for managing temporaries. Some
1269 // alternatives:
1270 //
1271 // 1) Adopt a memory layout scheme analogous to stack allocation,
1272 // where objects of non-overlapping lifetime can occupy the same
1273 // storage. We would still have a single Memory object in this
1274 // case.
1275 //
1276 // 2) Do something like what CpuExecutor does, and do allocations
1277 // and deallocations on the fly (during execution) before first
1278 // reference and after last reference, respectively. This would
1279 // mean having one Memory object per TEMPORARY; or, in a more
1280 // complicated implementation, one Memory object per set of
1281 // temporaries that have the same lifetime. Note that the Android
1282 // system limits the number of shared memory objects, which are
1283 // what our Memory objects represent.
1284 //
1285 uint32_t totalSizeOfTemporaries = 0;
1286 // This function has two modes of operation:
1287 // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
1288 // TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
1289 // skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
1290 // skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
1291 // operand of another lifetime.
1292 // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
1293 // SUBGRAPH_OUTPUT source operands and panic if we see a source operand
1294 // of another lifetime.
1295 auto mapTemporary = [body, executionBuilder, &totalSizeOfTemporaries](
1296 const SourceOperandIndex& sourceOperandIndex,
1297 std::map<SourceOperandIndex, StaticTemporaryLocation>*
1298 sourceOperandToLocationOfTemporary,
1299 Operand::LifeTime lifetime =
1300 Operand::LifeTime::TEMPORARY_VARIABLE) {
1301 CHECK(lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
1302 lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT);
1303 const Operand& sourceOperand = executionBuilder->getSourceOperand(sourceOperandIndex);
1304 if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE &&
1305 sourceOperand.lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
1306 // See the caller for explanation.
1307 return;
1308 }
1309 CHECK_EQ(sourceOperand.lifetime, lifetime);
1310 const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
1311 if (size != 0u) {
1312 const auto memoryPreference =
1313 body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
1314 const auto loc = addTemporary(&totalSizeOfTemporaries, size, memoryPreference.alignment,
1315 memoryPreference.padding);
1316 auto [_, isNew] = sourceOperandToLocationOfTemporary->emplace(sourceOperandIndex, loc);
1317 CHECK(isNew);
1318 VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
1319 << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
1320 } else {
1321 // Unknown size, hence dynamic temporary. The mapping will
1322 // be established elsewhere (DynamicTemporaries::allocate()).
1323 CHECK_EQ(lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
1324 CHECK_EQ(sourceOperand.lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
1325 }
1326 };
1327 std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary;
1328 std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2;
1329 for (const auto& logicalStep : body->mSteps) {
1330 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1331 // Allocate memory for ExecutionStep temporary outputs that are
1332 // inputs to other steps, as determined by
1333 // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
1334 //
1335 // We don't allocate memory for step model output operands with
1336 // source operand lifetime SUBGRAPH_OUTPUT because they will be
1337 // - managed by the client (main model outputs),
1338 // - assigned a location of another operand (when this step model
1339 // output is a branch model output of an IF; see
1340 // ExecutionPlan::nextCompound(const IfStep*, ...)), or
1341 // - allocated by a WHILE (when this step model output
1342 // is a condition or body model output of a WHILE; see the
1343 // step->bodyOutputOperands and step->condOutputOperand handling
1344 // below).
1345 for (const auto& output : step->getTempsAsStepModelOutputs()) {
1346 mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
1347 &sourceOperandToLocationOfTemporary);
1348 }
1349 } else if (const IfStep* step = logicalStep->tryIfStep()) {
1350 // Allocate memory for all temporary outputs of an IfStep because
1351 // they are going to be written to by a branch model. We don't
1352 // perform unused output operand optimisation for referenced models.
1353 //
1354 // We don't allocate memory for branch output operands because they
1355 // use the same location as the corresponding outer output operands,
1356 // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
1357 //
1358 // We don't allocate memory for outer output operands with source
1359 // operand lifetime SUBGRAPH_OUTPUT because they will be
1360 // - managed by the client (main model outputs),
1361 // - assigned a location of another operand (when this IF outer
1362 // output is a branch model output of another IF; see
1363 // ExecutionPlan::nextCompound(const IfStep*, ...)), or
1364 // - allocated by a WHILE (when this IF outer output
1365 // is a condition or body model output of a WHILE; see the
1366 // step->bodyOutputOperands and step->condOutputOperand handling
1367 // below).
1368 for (const auto& sourceOperandIndex : step->outerOutputOperands) {
1369 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
1370 }
1371 } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1372 // Allocate memory for all temporary outputs of an WhileStep because
1373 // they are going to be written to by the WHILE loop.
1374 //
1375 // We don't allocate memory for outer output operands with source
1376 // operand lifetime SUBGRAPH_OUTPUT because they will be
1377 // - managed by the client (main model outputs),
1378 // - assigned a location of another operand (when this WHILE outer
1379 // output is a branch model output of an IF; see
1380 // ExecutionPlan::nextCompound(const IfStep*, ...)), or
1381 // - allocated by another WHILE (when this WHILE outer output
1382 // is a condition or body model output of another WHILE; see the
1383 // step->bodyOutputOperands and step->condOutputOperand handling
1384 // below).
1385 for (const auto& sourceOperandIndex : step->outerOutputOperands) {
1386 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
1387 }
1388 // Allocate memory for body model outputs. Note that we could use
1389 // the outer output operand memory instead but we currently don't do
1390 // so (b/148206073).
1391 for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
1392 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary,
1393 Operand::LifeTime::SUBGRAPH_OUTPUT);
1394 // Allocate another set of temporaries for double buffering.
1395 mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary2,
1396 Operand::LifeTime::SUBGRAPH_OUTPUT);
1397 }
1398 // Allocate memory for condition model output.
1399 // TODO: Share one condition output memory region between all loops.
1400 mapTemporary(step->condOutputOperand, &sourceOperandToLocationOfTemporary,
1401 Operand::LifeTime::SUBGRAPH_OUTPUT);
1402 } else {
1403 CHECK(logicalStep->isGoto());
1404 }
1405 }
1406 // Allocate temporary memory for boundary CONSTANT_COPY operands.
1407 for (const auto& [sourceOperandIndex, location] : body->mSourceOperandToBoundaryConstantCopy) {
1408 const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
1409 const auto loc = addTemporary(&totalSizeOfTemporaries, location.length,
1410 memoryPreference.alignment, memoryPreference.padding);
1411 sourceOperandToLocationOfTemporary.emplace(sourceOperandIndex, loc);
1412 VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
1413 << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
1414 }
1415 // Collect dynamic temporaries.
1416 // TODO(b/157236079): Move some or all of this work to compilation time?
1417 DynamicTemporaries dynamicTemporaries;
1418 const TypeManager* typeManager = TypeManager::get();
1419 forEachDynamicTemporary([body, typeManager, &dynamicTemporaries](
1420 SourceOperandIndex sourceOperandIndex,
1421 const Operand& sourceOperand, uint32_t definingStepIndex) {
1422 CHECK(typeManager->isTensorType(sourceOperand.type));
1423 const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
1424 // TODO: For now we guess an initial size equal to element
1425 // size, which is overly conservative.
1426 const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
1427 dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
1428 size, memoryPreference.alignment, memoryPreference.padding);
1429 });
1430 dynamicTemporaries.endDeclarations();
1431 dynamicTemporaries.vlogDump("finished declarations");
1432
1433 return std::shared_ptr<Controller>(new Controller(
1434 this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
1435 std::move(sourceOperandToLocationOfTemporary),
1436 std::move(sourceOperandToLocationOfTemporary2), body->mSourceOperandToInputIndex,
1437 body->mSourceOperandToOutputIndex, body->mSourceOperandToBoundaryConstantCopy,
1438 body->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
1439 }
1440
1441 // TODO: Find a better way to provide this functionality.
fallback(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1442 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
1443 std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1444 const std::vector<OutputShape>* mainModelOutputShapes) const {
1445 *executor = nullptr;
1446 if (burstController != nullptr) {
1447 *burstController = nullptr;
1448 }
1449
1450 VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
1451 << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
1452
1453 if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
1454 // We haven't called next().
1455 return ANEURALNETWORKS_OP_FAILED;
1456 }
1457
1458 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1459 // The last call to next() did not produce an executor.
1460 return ANEURALNETWORKS_OP_FAILED;
1461 }
1462
1463 controller->mNextStepIndex = controller->mFallbackNextStepIndex;
1464 return next(controller, executor, burstController, mainModelOutputShapes);
1465 }
1466
// Wraps a raw client pointer of the given size as a Buffer viewing the
// whole region (offset 0).
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}
1470
// Wraps an existing memory pool, viewing it starting at the given offset.
ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}
1473
// Returns a pointer to the start of this buffer's view into the pool.
void* ExecutionPlan::Buffer::getPointer() const {
    return mInfo.getBuffer() + mOffset;
}
1477
// Returns the number of bytes from the view's start to the end of the pool.
uint32_t ExecutionPlan::Buffer::getSize() const {
    return mInfo.getSize() - mOffset;
}
1481
// Flushes the underlying memory pool (see RunTimePoolInfo::flush).
void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}
1485
getBufferFromModelArgumentInfo(const ModelArgumentInfo & info,const ExecutionBuilder * executionBuilder) const1486 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
1487 const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
1488 switch (info.state()) {
1489 case ModelArgumentInfo::POINTER: {
1490 return Buffer(info.buffer(), info.length());
1491 } break;
1492 case ModelArgumentInfo::MEMORY: {
1493 if (std::optional<RunTimePoolInfo> poolInfo =
1494 executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
1495 return Buffer(*poolInfo, info.locationAndLength().offset);
1496 } else {
1497 LOG(ERROR) << "Unable to map operand memory pool";
1498 return std::nullopt;
1499 }
1500 } break;
1501 case ModelArgumentInfo::HAS_NO_VALUE: {
1502 LOG(ERROR) << "Attempting to read an operand that has no value";
1503 return std::nullopt;
1504 } break;
1505 default: {
1506 LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
1507 return std::nullopt;
1508 } break;
1509 }
1510 }
1511
// Locates the buffer currently backing a source operand by consulting the
// controller's operand maps in priority order: static temporaries, main
// model inputs, main model outputs, then boundary constant references.
// Returns std::nullopt if the operand is in none of the maps or its memory
// cannot be mapped.
std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
        std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
    const auto& sourceOperandToLocationOfTemporary =
            controller->mSourceOperandToLocationOfTemporary;
    const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
    const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
    const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
    if (auto it = sourceOperandToLocationOfTemporary.find(operandIndex);
        it != sourceOperandToLocationOfTemporary.end()) {
        // Static temporary: a slice of the controller's shared temporary memory.
        const uint32_t offset = it->second.offset;
        const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
        return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
    } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
               it != sourceOperandToInputIndex.end()) {
        // Main model input: resolve through the client-provided argument info.
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
               it != sourceOperandToOutputIndex.end()) {
        // Main model output: resolve through the client-provided argument info.
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
               it != sourceOperandToConstantReference.end()) {
        // Boundary constant: a view into a client memory object.
        const ConstantReferenceLocation& location = it->second;
        const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
        if (info == std::nullopt) {
            return std::nullopt;
        }
        return Buffer(info->getBuffer() + location.offset, location.length);
    }
    return std::nullopt;
}
1543
readConditionValue(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex,bool * value) const1544 int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
1545 SourceOperandIndex operandIndex, bool* value) const {
1546 std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
1547 if (buffer == std::nullopt) {
1548 LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
1549 return ANEURALNETWORKS_OP_FAILED;
1550 }
1551 CHECK_GE(buffer->getSize(), sizeof(bool8));
1552 bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
1553 *value = static_cast<bool>(value8);
1554 VLOG(EXECUTION) << "readConditionValue: " << *value;
1555 return ANEURALNETWORKS_NO_ERROR;
1556 }
1557
next(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes,int syncFdOfLastStep) const1558 int ExecutionPlan::next(std::shared_ptr<Controller> controller,
1559 std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1560 const std::vector<OutputShape>* mainModelOutputShapes,
1561 int syncFdOfLastStep) const {
1562 CHECK(mState == COMPOUND);
1563
1564 controller->mLastStepSyncFd = syncFdOfLastStep;
1565 *executor = nullptr;
1566 if (burstController != nullptr) {
1567 *burstController = nullptr;
1568 }
1569
1570 VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
1571 << "): mNextStepIndex = " << controller->mNextStepIndex;
1572
1573 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1574 return ANEURALNETWORKS_OP_FAILED;
1575 }
1576
1577 return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1578 }
1579
nextCompound(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1580 int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
1581 std::shared_ptr<StepExecutor>* executor,
1582 SharedBurst* burstController,
1583 const std::vector<OutputShape>* mainModelOutputShapes) const {
1584 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1585 return ANEURALNETWORKS_OP_FAILED;
1586 }
1587
1588 auto compoundBody = compound();
1589 if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
1590 controller->mNextStepIndex = Controller::kBadStepIndex; // end
1591 return ANEURALNETWORKS_NO_ERROR;
1592 }
1593
1594 const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
1595 if (const IfStep* step = logicalStep->tryIfStep()) {
1596 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1597 } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1598 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1599 } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
1600 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1601 } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1602 return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1603 } else {
1604 CHECK(false) << "Unknown step variant";
1605 return ANEURALNETWORKS_BAD_STATE;
1606 }
1607 }
1608
// Prepares execution of a single ExecutionStep: allocates any dynamic
// temporaries defined by the step, creates its StepExecutor with inputs and
// outputs mapped, optionally hands out the step's burst controller, and
// advances the controller past the step. The step index is recorded so
// fallback() can retry this step.
int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();

    // Dynamic temporaries must be allocated before mapInputsAndOutputs()
    // consults them below.
    NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
    controller->mDynamicTemporaries.vlogDump("finished allocating for a step");

    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                               step->getDevice(), step->getPreparedStepModel(),
                                               /*reusable=*/false, step,
                                               &controller->mDynamicTemporaries);

    step->mapInputsAndOutputs(
            *executor, mainModelOutputShapes, controller->mTemporaries.get(),
            controller->mSourceOperandToLocationOfTemporary, controller->mDynamicTemporaries,
            controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
            controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    // Remember this step for fallback() before advancing.
    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
1637
// The first argument is the "source" operand, the second operand is the "destination".
//
// Redirects the inner (referenced model) input operand to wherever the
// outer operand currently lives: the inner operand's old mapping is erased
// from all four location maps first, then a new entry is added to the same
// map that holds the outer operand. Aborts if the outer operand is found in
// none of the maps.
void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
                                         const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // An operand may appear in at most one of the four maps.
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToInputIndex.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand) +
                     mSourceOperandToConstantReference.count(innerOperand),
             1u);
#endif
    // Erase any stale mapping for the inner operand before looking up the
    // outer operand.
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToInputIndex.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    mSourceOperandToConstantReference.erase(innerOperand);
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
               it != mSourceOperandToInputIndex.end()) {
        mSourceOperandToInputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
               it != mSourceOperandToConstantReference.end()) {
        mSourceOperandToConstantReference.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1671
// The first argument is the "source" operand, the second operand is the "destination".
//
// Like setInput(), but outputs can only live in the temporary-location map
// or the main-model-output map, so only those two maps are consulted.
void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
                                          const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // An operand may appear in at most one of the two maps.
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand),
             1u);
#endif
    // Erase any stale mapping for the inner operand before looking up the
    // outer operand.
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1695
waitForLastStepSyncFence() const1696 int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1697 if (mLastStepSyncFd == -1) {
1698 return ANEURALNETWORKS_NO_ERROR;
1699 }
1700 VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1701 auto r = syncWait(mLastStepSyncFd, -1);
1702 int n = ANEURALNETWORKS_NO_ERROR;
1703 if (r != FenceState::SIGNALED) {
1704 LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1705 n = ANEURALNETWORKS_OP_FAILED;
1706 }
1707 return n;
1708 }
1709
// Invocations of Controller::setInput/setOutput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the IfStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
//
// Executes an IfStep: reads the condition operand, selects the THEN or ELSE
// branch, rebinds the branch model's inputs/outputs to the outer operands'
// current locations, and jumps to the chosen branch's first step.
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;
    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    // Pick the operand lists of the branch that will actually run.
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1752
// Invocations of Controller::setInput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the WhileStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
//
// Drives one transition of the WHILE state machine. Each call handles one
// of two stages:
// - EVALUATE_CONDITION: wire up the condition model's inputs (outer inputs
//   on iteration 0, the previous body outputs afterwards) and jump to the
//   condition step.
// - EVALUATE_BODY: read the condition result; if true, wire up the body
//   model's inputs (swapping the double-buffered temporaries after the
//   first iteration) and jump to the body step; if false, copy the final
//   values to the outer output operands and exit the loop.
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        if (state.iteration == 0) {
            // Start the loop-timeout clock on loop entry.
            state.startTime = Clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            // Inputs beyond the body output count are input-only and always
            // come from the outer inputs.
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController, mainModelOutputShapes);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    // Enforce the client-configured loop timeout before another iteration.
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = Clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        if (state.iteration != 0) {
            // Swap the two temporary buffer sets (double buffering): last
            // iteration's outputs become this iteration's inputs.
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToLocationOfTemporary[outputOperand],
                          controller->mSourceOperandToLocationOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1877
// Executes a GotoStep: an unconditional jump that simply redirects the
// controller to the target step.
int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;
    controller->mNextStepIndex = step->gotoStepIndex;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1886
// Creates a StepExecutor for a SIMPLE plan, with the model's inputs and
// outputs mapped one-to-one to the execution's inputs and outputs.
std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
        bool reusable, ExecutionBuilder* executionBuilder) const {
    auto simpleBody = simple();
    auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
                                                   simpleBody->mDevice, simpleBody->mPreparedModel,
                                                   reusable);
    executor->mapInputsAndOutputsTrivially();
    return executor;
}
1896
becomeCompoundIfEmpty()1897 void ExecutionPlan::becomeCompoundIfEmpty() {
1898 CHECK(mState != SIMPLE);
1899 if (mState == EMPTY) {
1900 mBody = new CompoundBody(this);
1901 mState = COMPOUND;
1902 }
1903 }
1904
// Appends a new ExecutionStep for the given source model and device to the
// (possibly newly created) compound plan and returns it.
ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
                                                     const std::shared_ptr<Device> device) {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
                                              compound()->mSteps.size(), sourceModelIndex, device);
    compound()->mSteps.push_back(step);
    return step->executionStep();
}
1913
// Appends a new IfStep to the (possibly newly created) compound plan and
// returns it.
IfStep* ExecutionPlan::createNewIfStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
    step->ifStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->ifStep();
}
1921
// Appends a new WhileStep to the (possibly newly created) compound plan and
// returns it.
WhileStep* ExecutionPlan::createNewWhileStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
    step->whileStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->whileStep();
}
1929
// Appends a new GotoStep to the (possibly newly created) compound plan and
// returns it.
GotoStep* ExecutionPlan::createNewGotoStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
    step->gotoStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->gotoStep();
}
1937
// Converts an EMPTY plan into a SIMPLE plan that runs the whole model on a
// single device.
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    CHECK(mState == EMPTY);
    mBody = new SimpleBody(device, model, mCacheInfo, mToken);
    mState = SIMPLE;
}
1944
// Records that stepIndex defines the given main-model output operand.
// Aborts if a different step already claimed the same operand.
void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
    auto [it, isNew] =
            compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
    CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
                 << toString(sourceOperandIndex) << " already defined by step " << it->second;
}
1951
recordTemporaryDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1952 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1953 auto [it, isNew] =
1954 compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1955 CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1956 << toString(sourceOperandIndex) << " already defined by step " << it->second;
1957 }
1958
dump() const1959 void ExecutionPlan::dump() const {
1960 if (mBody) {
1961 mBody->dump();
1962 } else {
1963 VLOG(COMPILATION) << "EMPTY";
1964 }
1965 }
1966
reset()1967 void ExecutionPlan::reset() {
1968 if (mBody) {
1969 delete mBody;
1970 mBody = nullptr;
1971 }
1972 mState = EMPTY;
1973 }
1974
isSimpleCpu() const1975 bool ExecutionPlan::isSimpleCpu() const {
1976 return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
1977 }
1978
forTest_getKind() const1979 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1980 switch (mState) {
1981 case EMPTY:
1982 return Kind::EMPTY;
1983 case SIMPLE:
1984 CHECK(mBody);
1985 return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1986 case COMPOUND:
1987 CHECK(mBody);
1988 return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1989 default:
1990 LOG(FATAL) << "unexpected state";
1991 return Kind::ERROR;
1992 }
1993 }
1994
forTest_simpleGetDevice() const1995 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
1996 return simple()->mDevice;
1997 }
1998
forTest_compoundGetSteps() const1999 const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
2000 return compound()->mSteps;
2001 }
2002
forTest_flatGetDynamicTemporaries() const2003 std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
2004 CHECK_EQ(getSourceModels().size(), size_t(1));
2005 std::set<uint32_t> ret;
2006 forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
2007 ret.insert(dynTemp.second);
2008 });
2009 return ret;
2010 }
2011
hasDynamicTemporaries() const2012 bool ExecutionPlan::hasDynamicTemporaries() const {
2013 return mBody == nullptr ? false : mBody->hasDynamicTemporaries();
2014 }
2015
forTest_hasStepModelWithNoInputsOrNoOutputs() const2016 bool ExecutionPlan::forTest_hasStepModelWithNoInputsOrNoOutputs() const {
2017 return mBody == nullptr ? false : mBody->hasStepModelWithNoInputsOrNoOutputs();
2018 }
2019
hasStepModelWithNoInputsOrNoOutputs() const2020 bool ExecutionPlan::CompoundBody::hasStepModelWithNoInputsOrNoOutputs() const {
2021 return std::any_of(mSteps.begin(), mSteps.end(), [](const auto& logicalStep) {
2022 const ExecutionStep* step = logicalStep->tryExecutionStep();
2023 return step != nullptr && step->hasNoInputsOrNoOutputs();
2024 });
2025 }
2026
forTest_simpleGetCacheToken() const2027 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
2028 return simple()->mToken.getCacheToken();
2029 }
2030
dump() const2031 void ExecutionPlan::SimpleBody::dump() const {
2032 VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
2033 }
2034
dump() const2035 void ExecutionPlan::CompoundBody::dump() const {
2036 for (const auto& step : mSteps) {
2037 step->dump();
2038 }
2039 }
2040
getInputSourceOperand(uint32_t index) const2041 SourceOperandIndex ExecutionPlan::getInputSourceOperand(uint32_t index) const {
2042 const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
2043 CHECK_LT(index, mainModel->inputCount());
2044 const auto operandIndex = mainModel->getInputOperandIndex(index);
2045 return {kMainModelInSourceModels, operandIndex};
2046 }
2047
getOutputSourceOperand(uint32_t index) const2048 SourceOperandIndex ExecutionPlan::getOutputSourceOperand(uint32_t index) const {
2049 const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
2050 CHECK_LT(index, mainModel->outputCount());
2051 const auto operandIndex = mainModel->getOutputOperandIndex(index);
2052 return {kMainModelInSourceModels, operandIndex};
2053 }
2054
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2055 void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
2056 const StepRoleCallback& callback) const {
2057 callback(mPreparedModel.get(), IOType::INPUT, index);
2058 }
2059
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2060 void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
2061 const StepRoleCallback& callback) const {
2062 callback(mPreparedModel.get(), IOType::OUTPUT, index);
2063 }
2064
2065 // Map an input role of the main model to the input/output roles in the step models.
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2066 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
2067 const StepRoleCallback& callback) const {
2068 const auto sourceOperandIndex = mPlan->getInputSourceOperand(index);
2069 forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2070 }
2071
2072 // Map an output role of the main model to the input/output roles in the step models.
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2073 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
2074 const StepRoleCallback& callback) const {
2075 const auto sourceOperandIndex = mPlan->getOutputSourceOperand(index);
2076 forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2077 }
2078
forEachStepRoleOfSourceOperand(const SourceOperandIndex & index,const StepRoleCallback & callback) const2079 void ExecutionPlan::CompoundBody::forEachStepRoleOfSourceOperand(
2080 const SourceOperandIndex& index, const StepRoleCallback& callback) const {
2081 const auto it = mSourceOperandToStepRoles.find(index);
2082 if (it == mSourceOperandToStepRoles.end()) return;
2083 for (const auto& [stepIndex, type, ioIndex] : it->second) {
2084 CHECK_LT(stepIndex, mSteps.size());
2085 const auto* step = mSteps[stepIndex]->executionStep();
2086 callback(step->getPreparedStepModel().get(), type, ioIndex);
2087 }
2088 }
2089
getMemoryPreference(IOType type,uint32_t index) const2090 MemoryPreference ExecutionPlan::getMemoryPreference(IOType type, uint32_t index) const {
2091 CHECK(mState == SIMPLE || mState == COMPOUND);
2092 if (mState == SIMPLE) {
2093 return simple()->mPreparedModel->getMemoryPreference();
2094 } else {
2095 const auto sourceOperandIndex = type == IOType::INPUT ? getInputSourceOperand(index)
2096 : getOutputSourceOperand(index);
2097 return compound()->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
2098 }
2099 }
2100
getMemoryPreferenceOfSourceOperand(const SourceOperandIndex & index) const2101 MemoryPreference ExecutionPlan::CompoundBody::getMemoryPreferenceOfSourceOperand(
2102 const SourceOperandIndex& index) const {
2103 uint32_t alignment = kMinMemoryAlignment, padding = kMinMemoryPadding;
2104 forEachStepRoleOfSourceOperand(
2105 index, [&alignment, &padding](const auto* preparedModel, IOType, uint32_t) {
2106 const auto preference = preparedModel->getMemoryPreference();
2107 alignment = std::max(alignment, preference.alignment);
2108 padding = std::max(padding, preference.padding);
2109 });
2110 return {alignment, padding};
2111 }
2112
forEachDynamicTemporary(const std::function<void (SourceOperandIndex,const Operand &,uint32_t definingStepIndex)> & fn) const2113 void ExecutionPlan::forEachDynamicTemporary(
2114 const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>&
2115 fn) const {
2116 if (mState != COMPOUND) {
2117 return;
2118 }
2119
2120 for (const auto& logicalStep : compound()->mSteps) {
2121 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
2122 const uint32_t stepIndex = step->getIndex();
2123 const uint32_t sourceModelIndex = step->getSourceModelIndex();
2124 for (const auto& entry : step->getTempsAsStepModelOutputs()) {
2125 const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
2126 const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
2127 if (hasUnknownSize(sourceOperand)) {
2128 fn(sourceOperandIndex, sourceOperand, stepIndex);
2129 }
2130 }
2131 }
2132 }
2133 }
2134
partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const OptionalTimePoint & deadline,ExecutionPlan * plan,const std::vector<TokenValuePair> & metaData,int simulateFailureResultCode) const2135 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
2136 uint32_t preference, uint32_t priority,
2137 const OptionalTimePoint& deadline, ExecutionPlan* plan,
2138 const std::vector<TokenValuePair>& metaData,
2139 int simulateFailureResultCode) const {
2140 uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
2141 NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
2142 deadline, plan));
2143 int n = plan->finish(preference, priority, deadline, metaData, simulateFailureResultCode);
2144 if (VLOG_IS_ON(COMPILATION)) {
2145 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
2146 logModelToInfo(makeModel());
2147 plan->dump();
2148 }
2149 return n;
2150 }
2151
// Heuristically partitions the model identified by sourceModelIndex across the
// given devices, appending ExecutionSteps and (for interpreted control flow)
// IfStep/WhileStep/GotoStep constructs to *plan. Referenced models of
// interpreted IF/WHILE operations are partitioned recursively into the same
// flat plan. Returns ANEURALNETWORKS_NO_ERROR on success, or the failure code
// from step construction / recursive partitioning.
int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
                                           const std::vector<std::shared_ptr<Device>>& devices,
                                           uint32_t preference, uint32_t priority,
                                           const OptionalTimePoint& deadline,
                                           ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    SourceModels* sourceModels = &plan->getSourceModels();
    const size_t deviceCount = devices.size();
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
                      << "sourceModelIndex = " << sourceModelIndex << ", "
                      << "deviceCount = " << deviceCount << ", "
                      << "operationCount = " << operationCount;

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector.
    std::vector<int> bestDeviceForOperation(operationCount);
    NN_RETURN_IF_ERROR(
            findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));

    // A special value produced by findBestDeviceForEachOperation meaning that
    // this is a control flow operation scheduled for interpreted execution
    // (see LogicalStep).
    const int kControlFlowInterpreter = deviceCount;

    // If one device will run all the operations, we don't need to split the
    // work. This shortcut does not apply when recursively partitioning
    // referenced models because our plan representation is flat.
    if (sourceModelIndex == kMainModelInSourceModels &&
        std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        // Bypass the partitioning process unless the only operation is a
        // control flow operation scheduled for interpreted execution.
        if (bestDeviceIndex != kControlFlowInterpreter) {
            VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                              << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
            plan->becomeSingleStep(devices[bestDeviceIndex], this);
            return ANEURALNETWORKS_NO_ERROR;
        }
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    // perDeviceQueue[deviceCount] is for interpreted execution of control flow
    // (see LogicalStep).
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);

    // This helper function produces a device name.
    auto deviceName = [&devices, kControlFlowInterpreter,
                       deviceCount](int deviceIndex) -> std::string {
        if (deviceIndex == kControlFlowInterpreter) {
            return "NNAPI";
        } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
            return "{unknown}";
        } else {
            return devices.at(deviceIndex)->getName();
        }
    };

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex << " (" << deviceName(deviceIndex) << ")";
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the control flow queue, and then look at the
    // devices in reverse order (i.e., starting at the end of the devices
    // vector). Earlier devices have a chance to prepare more of the inputs
    // required by other devices. This function returns -1 if all queues are
    // empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    // The tracker calls enqueueOnAppropriateDevice for each operation as soon
    // as all of that operation's inputs become available.
    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create either an execution step or
    // an interpreted control flow construct (including nested execution steps
    // and interpreted control flow constructs).
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
                          << deviceName(deviceIndex) << ")";
        if (deviceIndex < 0) {
            break;
        }

        // Assign as much as possible to this device.
        auto& queue = perDeviceQueue[deviceIndex];
        if (deviceIndex != kControlFlowInterpreter) {
            // Drain this device's queue into a single new ExecutionStep.
            ExecutionStep* step =
                    plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
            while (!queue.empty()) {
                uint32_t operationIndex = queue.front();
                queue.pop();
                int n = step->addOperation(operationIndex);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                    return n;
                }
                // Marking the operation processed may make successor
                // operations ready and enqueue them (possibly on this queue).
                tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
            }
        } else {
            // Interpreted control flow: each queued operation must be IF or WHILE.
            while (!queue.empty()) {
                uint32_t operationIndex = queue.front();
                queue.pop();
                const Operation& operation = getOperation(operationIndex);
                if (operation.type == OperationType::IF) {
                    namespace op = operation_if;
                    const Operand& thenOperand =
                            getOperand(operation.inputs[op::kThenModelOperand]);
                    const Operand& elseOperand =
                            getOperand(operation.inputs[op::kElseModelOperand]);
                    const ModelBuilder* thenModel = getReferencedModel(thenOperand);
                    const ModelBuilder* elseModel = getReferencedModel(elseOperand);
                    uint32_t thenModelIndex = sourceModels->addModel(thenModel);
                    uint32_t elseModelIndex = sourceModels->addModel(elseModel);

                    // Emits the following:
                    // Index Step
                    //     i if then=(i + 1) else=(j + 1)
                    //   ... (then model steps)
                    //     j goto k
                    //   ... (else model steps)
                    //     k (steps after the IF)
                    IfStep* ifStep = plan->createNewIfStep();
                    ifStep->conditionOperandIndex = SourceOperandIndex(
                            sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
                    ifStep->thenStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
                            thenModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterThenBranch = plan->createNewGotoStep();
                    ifStep->elseStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
                            elseModelIndex, devices, preference, priority, deadline, plan));
                    afterThenBranch->gotoStepIndex = plan->getNextStepIndex();

                    // Outer model operands.
                    for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
                        ifStep->outerInputOperands.emplace_back(sourceModelIndex,
                                                                operation.inputs[i]);
                    }
                    for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
                        ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
                                                                 operation.outputs[i]);
                    }
                    // Then model operands.
                    for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
                        ifStep->thenBranchInputOperands.emplace_back(
                                thenModelIndex, thenModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
                        ifStep->thenBranchOutputOperands.emplace_back(
                                thenModelIndex, thenModel->getOutputOperandIndex(i));
                    }
                    // Else model operands.
                    for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
                        ifStep->elseBranchInputOperands.emplace_back(
                                elseModelIndex, elseModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
                        ifStep->elseBranchOutputOperands.emplace_back(
                                elseModelIndex, elseModel->getOutputOperandIndex(i));
                    }
                } else if (operation.type == OperationType::WHILE) {
                    namespace op = operation_while;
                    const Operand& condOperand =
                            getOperand(operation.inputs[op::kCondModelOperand]);
                    const Operand& bodyOperand =
                            getOperand(operation.inputs[op::kBodyModelOperand]);
                    const ModelBuilder* condModel = getReferencedModel(condOperand);
                    const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
                    uint32_t condModelIndex = sourceModels->addModel(condModel);
                    uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);

                    // Emits the following:
                    // Index Step
                    //     i while cond=(i + 1) body=(j + 1) exit=(k + 1)
                    //   ... (cond model steps)
                    //     j goto i
                    //   ... (body model steps)
                    //     k goto i
                    //   ... (steps after the WHILE)
                    //
                    // Note that WhileStep has WhileState associated with it.
                    WhileStep* whileStep = plan->createNewWhileStep();
                    whileStep->condStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
                            condModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterCond = plan->createNewGotoStep();
                    afterCond->gotoStepIndex = whileStep->index;
                    whileStep->bodyStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
                            bodyModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterBody = plan->createNewGotoStep();
                    afterBody->gotoStepIndex = whileStep->index;
                    whileStep->exitStepIndex = plan->getNextStepIndex();

                    // Outer model operands.
                    for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
                        whileStep->outerInputOperands.emplace_back(sourceModelIndex,
                                                                   operation.inputs[i]);
                    }
                    for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
                        whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
                                                                    operation.outputs[i]);
                    }
                    // Cond model operands.
                    for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
                        whileStep->condInputOperands.emplace_back(
                                condModelIndex, condModel->getInputOperandIndex(i));
                    }
                    // The cond model produces a single boolean output.
                    whileStep->condOutputOperand =
                            SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
                    // Body model operands.
                    for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
                        whileStep->bodyInputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
                        whileStep->bodyOutputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getOutputOperandIndex(i));
                    }
                } else {
                    CHECK(false) << operation.type << " is not a control flow operation";
                }
                tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
            }
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
2397
getPerformance(uint32_t preference,const std::shared_ptr<Device> device) const2398 float ModelBuilder::getPerformance(uint32_t preference,
2399 const std::shared_ptr<Device> device) const {
2400 // Note that we will call this method multiple times per compilation with
2401 // the same arguments if there are nested control flow operations and we
2402 // decide to execute the outer operation on the ExecutionPlan::next()
2403 // interpreter.
2404 //
2405 // This is a potential compilation performance problem. To work around it,
2406 // the performance value could be cached for the duration of a compilation.
2407 float perf = 0;
2408 const size_t operationCount = mOperations.size();
2409 for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
2410 perf += getPerformance(preference, device, operationIndex);
2411 }
2412 return perf;
2413 }
2414
// Returns the estimated cost of executing the single operation at
// operationIndex on the given device under the given compilation preference.
// IF recurses into both branch models, weighting each at 0.5; WHILE sums the
// cond and body model costs once (a single-iteration estimate). All other
// operations are costed by the type of their first input operand.
float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
                                   uint32_t operationIndex) const {
    // Selects the power or latency figure from a PerformanceInfo according to
    // the compilation preference.
    auto applyPreference = [preference](const Capabilities::PerformanceInfo& perf) {
        return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
    };

    const Operation& operation = getOperation(operationIndex);

    if (operation.type == OperationType::IF) {
        namespace op = operation_if;
        const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
        const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
        const ModelBuilder* thenModel = getReferencedModel(thenOperand);
        const ModelBuilder* elseModel = getReferencedModel(elseOperand);
        // Assume each branch is equally likely to be taken.
        return applyPreference(device->getIfPerformance()) +
               0.5 * (thenModel->getPerformance(preference, device) +
                      elseModel->getPerformance(preference, device));
    }

    if (operation.type == OperationType::WHILE) {
        namespace op = operation_while;
        const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
        const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
        const ModelBuilder* condModel = getReferencedModel(condOperand);
        const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
        return applyPreference(device->getWhilePerformance()) +
               condModel->getPerformance(preference, device) +
               bodyModel->getPerformance(preference, device);
    }

    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch (operandType) {
        case OperandType::FLOAT32:
            // Relaxed fp32->fp16 computation uses dedicated performance figures.
            if (mRelaxComputationFloat32toFloat16) {
                return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
            }
            break;
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
            }
            break;
        default:
            break;
    }

    return applyPreference(device->getPerformance(operandType));
}
2466
// Returns true if the operation at operationIndex is an IF or WHILE whose
// outer operands, or whose referenced models' inputs/outputs, include an
// operand of unknown size. Returns false for any non-control-flow operation.
bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
    // True if any of the listed operands of the given model has unknown size.
    auto containsUnknownSize = [](const ModelBuilder* model,
                                  const std::vector<uint32_t>& operandIndexes) {
        for (uint32_t operandIndex : operandIndexes) {
            if (hasUnknownSize(model->getOperand(operandIndex))) {
                return true;
            }
        }
        return false;
    };

    const Operation& operation = getOperation(operationIndex);

    if (operation.type == OperationType::IF) {
        namespace op = operation_if;
        const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
        const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
        const ModelBuilder* thenModel = getReferencedModel(thenOperand);
        const ModelBuilder* elseModel = getReferencedModel(elseOperand);
        // Check the IF's own operands plus the boundary operands of both branches.
        return containsUnknownSize(this, operation.inputs) ||
               containsUnknownSize(this, operation.outputs) ||
               containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
               containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
               containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
               containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
    }

    if (operation.type == OperationType::WHILE) {
        namespace op = operation_while;
        const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
        const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
        const ModelBuilder* condModel = getReferencedModel(condOperand);
        const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
        // Check the WHILE's own operands plus the boundary operands of cond and body.
        return containsUnknownSize(this, operation.inputs) ||
               containsUnknownSize(this, operation.outputs) ||
               containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
               containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
               containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
               containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
    }

    // Not a control flow operation.
    return false;
}
2511
supportedByControlFlowInterpreter(uint32_t operationIndex) const2512 bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
2513 const Operation& operation = getOperation(operationIndex);
2514 return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
2515 // The partitioner does not support dynamic temporaries (b/132458982).
2516 !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
2517 }
2518
2519 namespace {
2520
2521 // This class determines whether a given device can execute a given operation
2522 class CanDo {
2523 public:
CanDo()2524 CanDo() {}
2525
initialize(const MetaModel & metaModel,std::shared_ptr<Device> device)2526 void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
2527 mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
2528 }
2529
check(size_t operationIndex) const2530 bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
2531
2532 private:
2533 std::vector<bool> mSupportsOperationByIndex;
2534 };
2535
2536 } // anonymous namespace
2537
// For each operation, selects the device with the lowest estimated cost among
// those that report supporting it (ties favor the CPU device). Control flow
// operations with operands of unknown size are pinned to the CPU device;
// control flow that would land on the CPU is instead redirected to interpreted
// execution, encoded as index == devices.size(). Returns
// ANEURALNETWORKS_BAD_DATA if some operation is supported by no device.
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeModel(), DeviceManager::get()->strictSlicing());

    // Query every device once, up front, for the operations it supports.
    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this operation.
        int bestChoice = -1;

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    // On an exact cost tie, prefer the CPU device.
                    const bool deviceIsPreferred = (device == DeviceManager::getCpuDevice());
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && deviceIsPreferred)) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                    }
                } else {
                    // Somewhat noisy logging, but only place where the user of NNAPI can get
                    // feedback on why an operation was not run on a specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically deviceCount is
                    // very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << operation.type << ":" << operationIndex;
                }
            }
        }

        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do operation " << operation.type;
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
            const int kControlFlowInterpreter = deviceCount;
            (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = -1 (NNAPI)";
        } else {
            (*bestDeviceForOperation)[operationIndex] = bestChoice;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = " << bestChoice << " ("
                              << devices[bestChoice]->getName() << ")";
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
2612
2613 } // namespace nn
2614 } // namespace android
2615