1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "ExecutionPlan"
18
19 #include "ExecutionPlan.h"
20
21 #include "BurstBuilder.h"
22 #include "Callbacks.h"
23 #include "CompilationBuilder.h"
24 #include "ExecutionBuilder.h"
25 #include "ExecutionBurstController.h"
26 #include "GraphDump.h"
27 #include "Manager.h"
28 #include "ModelBuilder.h"
29 #include "OperationsUtils.h"
30 #include "TokenHasher.h"
31 #include "Tracing.h"
32 #include "TypeManager.h"
33 #include "Utils.h"
34
35 #include <cutils/native_handle.h>
36 #include <fcntl.h>
37 #include <openssl/sha.h>
38 #include <sys/stat.h>
39 #include <sys/types.h>
40 #include <functional>
41 #include <map>
42 #include <mutex>
43 #include <queue>
44 #include <strstream>
45 #include <type_traits>
46 #include <unordered_set>
47 #include <utility>
48 #include <vector>
49
50 using HidlToken = hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;
51
52 namespace android {
53 namespace nn {
54
55 namespace {
56
// Opens the cache file by filename and sets the handle to the opened fd. Returns false on failure. The
58 // handle is expected to come in as empty, and is only set to a fd when the function returns true.
59 // The file descriptor is always opened with both read and write permission.
bool createCacheHandle(const std::string& cache, bool createIfNotExist, hidl_handle* handle) {
61 CHECK(handle->getNativeHandle() == nullptr);
62 int fd = open(cache.c_str(), createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR, S_IRUSR | S_IWUSR);
63 NN_RET_CHECK_GE(fd, 0);
64 native_handle_t* cacheNativeHandle = native_handle_create(1, 0);
65 if (cacheNativeHandle == nullptr) {
66 close(fd);
67 return false;
68 }
69 cacheNativeHandle->data[0] = fd;
70 handle->setTo(cacheNativeHandle, /*shouldOwn=*/true);
71 return true;
72 }
73
// Opens a list of cache files and returns the handle vector. Returns an empty vector on failure.
75 // The file descriptors are always opened with both read and write permission.
hidl_vec<hidl_handle> createCacheHandleVec(uint32_t numCacheFiles, const std::string& baseFileName,
77 bool createIfNotExist) {
78 CHECK(numCacheFiles <= static_cast<uint32_t>(Constant::MAX_NUMBER_OF_CACHE_FILES));
79 hidl_vec<hidl_handle> handles(numCacheFiles);
80 for (uint32_t i = 0; i < numCacheFiles; i++) {
81 std::string filename = baseFileName + std::to_string(i);
82 VLOG(COMPILATION) << "Cache " << i << ": " << filename;
83 if (!createCacheHandle(filename, createIfNotExist, &handles[i])) {
84 return hidl_vec<hidl_handle>();
85 }
86 }
87 return handles;
88 }
89
// Maps the token to cache file names and sets the handle vectors to the opened fds. Returns false on
// failure and leaves the vectors empty. Each vector is expected to come in as empty.
bool getCacheHandles(const std::string& cacheDir, const uint8_t* token,
93 const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist,
94 hidl_vec<hidl_handle>* modelCache, hidl_vec<hidl_handle>* dataCache) {
95 // The filename includes ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2 characters for token,
96 // and 1 character for model/data cache identifier.
97 std::string filename(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2 + 1, '0');
98 for (uint32_t i = 0; i < ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN; i++) {
99 filename[i * 2] = 'A' + (token[i] & 0x0F);
100 filename[i * 2 + 1] = 'A' + (token[i] >> 4);
101 }
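    // For example (illustrative only): a token whose first bytes are 0xAB 0x01 encodes the low
    // nibble then the high nibble of each byte as letters, yielding "LKBA..."
    // ('A' + 0x0B = 'L', 'A' + 0x0A = 'K', 'A' + 0x01 = 'B', 'A' + 0x00 = 'A').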
102 CHECK(cacheDir.empty() || cacheDir.back() == '/');
103 std::string cacheFileName = cacheDir + filename;
104
105 cacheFileName[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2] = '1';
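    // Illustrative result (assuming the device needs two model cache files and two data cache
    // files): the files opened would be "<cacheDir><encoded token>10", "<cacheDir><encoded token>11"
    // for the model cache, and "<cacheDir><encoded token>20", "<cacheDir><encoded token>21" for the
    // data cache, since createCacheHandleVec appends the file index to the base name.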
106 *modelCache = createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist);
107 if (modelCache->size() != numCacheFiles.first) {
108 return false;
109 }
110 cacheFileName[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2] = '2';
111 *dataCache = createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist);
112 if (dataCache->size() != numCacheFiles.second) {
113 modelCache->resize(0);
114 return false;
115 }
116 return true;
117 }
118
// Tries to compile directly from the cache; returns false on failure.
bool compileFromCache(const std::shared_ptr<Device>& device, const std::string& cacheDir,
121 const uint8_t* token,
122 std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
123 CHECK(token != nullptr && device != nullptr);
124 VLOG(COMPILATION) << "compileFromCache";
125 *preparedModel = nullptr;
126 HidlToken cacheToken(token);
127 hidl_vec<hidl_handle> modelCache, dataCache;
128 NN_RET_CHECK(getCacheHandles(cacheDir, token, device->getNumberOfCacheFilesNeeded(),
129 /*createIfNotExist=*/false, &modelCache, &dataCache));
130 int ret = device->prepareModelFromCache(modelCache, dataCache, cacheToken, preparedModel);
131 return ret == ANEURALNETWORKS_NO_ERROR;
132 }
133
int compileModelAndCache(const std::shared_ptr<Device>& device, const ModelBuilder* model,
135 int32_t executionPreference, const std::string& cacheDir,
136 const uint8_t* token,
137 std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
138 CHECK(device != nullptr);
139 *preparedModel = nullptr;
140 uint8_t dummyToken[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN] = {0};
141 HidlToken cacheToken(token == nullptr ? dummyToken : token);
142 hidl_vec<hidl_handle> modelCache, dataCache;
143 if (token == nullptr || !getCacheHandles(cacheDir, token, device->getNumberOfCacheFilesNeeded(),
144 /*createIfNotExist=*/true, &modelCache, &dataCache)) {
145 modelCache.resize(0);
146 dataCache.resize(0);
147 }
148 Model hidlModel;
149 model->setHidlModel(&hidlModel);
150 return device->prepareModel(hidlModel, static_cast<ExecutionPreference>(executionPreference),
151 modelCache, dataCache, cacheToken, preparedModel);
152 }
153
// Compiles the model on the device.
// If compilation caching is available then, depending on ExecutionPlan::mState, the token may have
// been initialized only from the user-provided token (SIMPLE body), or may already have been
// re-hashed with the indices of the operations to be executed (COMPOUND body). This function
// re-hashes the token further with the device name, device version string, and the execution
// preference.
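// Illustrative summary (from the code in this file): for a SIMPLE body the TokenHasher starts from
// the user-provided token only; for a COMPOUND body it has also absorbed the operation indices
// added in ExecutionStep::addOperation(). This function then mixes in getName(),
// getVersionString(), and executionPreference before finish() produces the final cache token.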
int compile(std::shared_ptr<Device> device, const ModelBuilder* model, int32_t executionPreference,
160 const std::string& cacheDir, TokenHasher* token,
161 std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
162 CHECK(device != nullptr);
163 const uint8_t* tokenData = nullptr;
164 if (device->isCachingSupported() && token->ok() && token->updateFromString(device->getName()) &&
165 token->updateFromString(device->getVersionString()) &&
166 token->update(&executionPreference, sizeof(executionPreference)) && token->finish()) {
167 tokenData = token->getCacheToken();
168 }
169 if (tokenData != nullptr && compileFromCache(device, cacheDir, tokenData, preparedModel)) {
170 return ANEURALNETWORKS_NO_ERROR;
171 }
172 return compileModelAndCache(device, model, executionPreference, cacheDir, tokenData,
173 preparedModel);
174 }
175
176 typedef std::function<void(uint32_t)> OperationReadyCallback;
177
int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
179 const Operand& fromOperand) {
180 if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
181 fromOperand.extraParams.getDiscriminator() ==
182 Operand::ExtraParams::hidl_discriminator::channelQuant) {
183 auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
184 ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
185 .channelDim = fromChannelQuant.channelDim,
186 .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
187 .scales = fromChannelQuant.scales.data(),
188 };
189 return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
190 } else if (isExtensionOperandType(fromOperand.type) &&
191 fromOperand.extraParams.getDiscriminator() ==
192 Operand::ExtraParams::hidl_discriminator::extension) {
193 hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
194 return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
195 extensionData.size());
196 } else if (fromOperand.extraParams.getDiscriminator() !=
197 Operand::ExtraParams::hidl_discriminator::none ||
198 fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
199 LOG(ERROR) << "Type " << toString(fromOperand.type)
200 << " has an unexpected extraParams discriminator: "
201 << static_cast<int>(fromOperand.extraParams.getDiscriminator());
202 return ANEURALNETWORKS_BAD_DATA;
203 } else {
204 return ANEURALNETWORKS_NO_ERROR;
205 }
206 }
207
208 // This class tracks whether we know the value of an operand as operations
209 // are processed.
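// Illustrative usage (mirroring ModelBuilder::partitionTheWork below):
//
//     OperandTracker tracker(model, enqueue);          // enqueue() fires for operations ready at the start
//     ...
//     tracker.markProcessed(opIndex, enqueue);         // enqueue() fires for operations that just became ready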
210 class OperandTracker {
211 public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away, and calls cb for each one of them.
214 OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. Now that the
    // outputs of the operation are known, new operations may become ready
    // to run. Calls cb for each one of them.
218 void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);
219
220 private:
221 const ModelBuilder* mModel;
222 std::multimap<uint32_t, uint32_t> mOperandToOperations;
223 std::vector<uint32_t> mUnknownInputCount; // For each operation
224 };
225
OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) :
227 mModel(model) {
228 const auto& operations = mModel->getOperations();
229 mUnknownInputCount.resize(operations.size());
230 for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
231 const Operation& operation = operations[operationIndex];
232 uint32_t count = 0;
233 for (uint32_t operandIndex : operation.inputs) {
234 auto lifetime = mModel->getOperand(operandIndex).lifetime;
235 if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
236 lifetime == OperandLifeTime::MODEL_OUTPUT) {
237 count++;
238 mOperandToOperations.insert(
239 std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
240 }
241 }
242 if (count == 0) {
243 cb(operationIndex);
244 }
245 mUnknownInputCount[operationIndex] = count;
246 }
247 }
248
void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
250 // Mark all its outputs as known.
251 const Operation& operation = mModel->getOperations()[operationIndex];
252 for (uint32_t operandIndex : operation.outputs) {
253 auto range = mOperandToOperations.equal_range(operandIndex);
254 for (auto i = range.first; i != range.second; i++) {
255 uint32_t& count = mUnknownInputCount[i->second];
256 if (--count == 0) {
257 cb(i->second);
258 }
259 }
260 }
261 }
262
263 } // namespace
264
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
266 std::shared_ptr<Device> device)
267 : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device), mToken(plan->getCacheToken()) {}
268
269 // Adds an operand if it has not been added already.
270 // Sets the index in the submodel for the corresponding operand.
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
272 const ModelBuilder& fromModel, OperandKind kind) {
273 // Have we added this operand already?
274 auto i = mOperandMap.find(fromOperandIndex);
275 if (i != mOperandMap.end()) {
276 nnAssert(kind == INPUT);
277 *toOperandIndex = i->second;
278 return ANEURALNETWORKS_NO_ERROR;
279 }
280
281 // First time we add this operand.
282 *toOperandIndex = mSubModel.operandCount();
283 mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));
284
285 // Add the operand to the submodel.
286 const Operand& operand = fromModel.getOperand(fromOperandIndex);
287 ANeuralNetworksOperandType type = {
288 .type = static_cast<int32_t>(operand.type),
289 .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
290 .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
291 .scale = operand.scale,
292 .zeroPoint = operand.zeroPoint,
293 };
294
295 int n = mSubModel.addOperand(type);
296 if (n != ANEURALNETWORKS_NO_ERROR) {
297 LOG(ERROR) << "Previous error occurred when partitioning the graph";
298 return n;
299 }
300
301 n = copyOperandExtraParams(mSubModel, *toOperandIndex, operand);
302 if (n != ANEURALNETWORKS_NO_ERROR) {
303 LOG(ERROR) << "Error when copying extra parameters to the operand";
304 return n;
305 }
306
307 // Sets its value.
308 switch (operand.lifetime) {
309 case OperandLifeTime::CONSTANT_COPY: {
310 const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
311 n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length);
312 if (n != ANEURALNETWORKS_NO_ERROR) {
313 LOG(ERROR) << "Previous error occurred when partitioning the graph";
314 return n;
315 }
316 } break;
317 case OperandLifeTime::CONSTANT_REFERENCE: {
318 const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
319 n = mSubModel.setOperandValueFromMemory(*toOperandIndex, memory,
320 operand.location.offset,
321 operand.location.length);
322 if (n != ANEURALNETWORKS_NO_ERROR) {
323 LOG(ERROR) << "Previous error occurred when partitioning the graph";
324 return n;
325 }
326 } break;
327 case OperandLifeTime::NO_VALUE: {
328 n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0);
329 if (n != ANEURALNETWORKS_NO_ERROR) {
330 LOG(ERROR) << "Previous error occurred when partitioning the graph";
331 return n;
332 }
333 } break;
334 case OperandLifeTime::TEMPORARY_VARIABLE: // handled similarly to MODEL_OUTPUT
335 if (kind == INPUT) {
336 // The first time we've seen this operand is as an
337 // input. That means it must be defined by a
338 // different partition, and is an input to this one.
339 mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
340 } else {
341 // The first time we've seen this operand is as an
342 // output. It may be an input to a different
343 // partition, so keep track of it.
344 mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
345 }
346 break;
347 case OperandLifeTime::MODEL_INPUT:
348 mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
349 break;
350 case OperandLifeTime::MODEL_OUTPUT: // handled similarly to TEMPORARY_VARIABLE
351 if (kind == INPUT) {
352 // The first time we've seen this operand is as an
353 // input. That means it must be defined by a
354 // different partition, and is an input to this one.
355 mOutputsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
356 } else {
357 // The first time we've seen this operand is as an
358 // output.
359 mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
360 }
361 break;
362 default:
363 nnAssert(false);
364 break;
365 }
366
367 return ANEURALNETWORKS_NO_ERROR;
368 }
369
int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
371 const Operation& operation = fromModel.getOperation(operationIndex);
372 if (mToken.ok()) {
373 mToken.update(&operationIndex, sizeof(operationIndex));
374 }
375
376 // Convert the input and output operand indexes.
377 //
378 // We expect operations to be added in topological order. Therefore:
379 //
380 // - We may not have seen an input if it is a model input, a
381 // constant, or an operand written by a different partition.
382 //
383 // - We should not have seen any outputs.
384 const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
385 const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
386 std::vector<uint32_t> inputs(inputCount);
387 std::vector<uint32_t> outputs(outputCount);
388
389 auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
390 std::vector<uint32_t>& localOperands,
391 OperandKind kind) -> int {
392 const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
393 for (uint32_t i = 0; i < operandCount; i++) {
394 uint32_t localOperand = ~0U;
395 int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
396 if (n != ANEURALNETWORKS_NO_ERROR)
397 return n;
398 localOperands[i] = localOperand;
399 }
400 return ANEURALNETWORKS_NO_ERROR;
401 };
402
403 int n;
404 if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
405 (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
406 return n;
407 }
408
409 return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
410 outputCount, outputs.data());
411 }
412
void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
414 for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
415 stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
416 }
417 for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
418 stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
419 }
420 }
421
void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
423 for (const auto& step : mSteps) {
424 for (const auto& input : step->getTempsAsSubModelInputs()) {
425 const uint32_t fromModelIndex = input.first;
426 const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
427 nnAssert(it != mTemporaryToDefiningStep.end());
428 const uint32_t stepIndex = it->second;
429 nnAssert(stepIndex < mSteps.size());
430 mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
431 }
432 }
433 }
434
void ExecutionStep::logSubModel() const {
436 VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;
437
438 auto logRemapEntry = [](std::string &toLog, const std::pair<uint32_t, uint32_t>& e) {
439 if (!toLog.empty()) {
440 toLog += ", ";
441 }
442 toLog += "(";
443 toLog += std::to_string(e.first);
444 toLog += "->";
445 toLog += std::to_string(e.second);
446 toLog += ")";
447 };
448
449 auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
450 std::string toLog;
451 for (const auto& e : map) {
452 logRemapEntry(toLog, e);
453 }
454 VLOG(COMPILATION) << name << ": " << toLog;
455 };
456 auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
457 std::string toLog;
458 for (const auto& e : set) {
459 logRemapEntry(toLog, e);
460 }
461 VLOG(COMPILATION) << name << ": " << toLog;
462 };
463
464 logRemapVector("model inputs", mModelInputs);
465 logRemapVector("model outputs", mModelOutputs);
466 logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
467 logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
468 logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
469 }
470
static void convertModelInputsOrOutputs(
472 // IN: mModel{Inputs|Outputs}
473 const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
474 // IN: fromModel->{input|output}Count()
475 uint32_t fromModelInputOrOutputCount,
476 // IN: fromModel->get{Input|Output}OperandIndex
477 std::function<uint32_t(uint32_t)> fromModelGetInputOrOutputOperandIndex,
478 // OUT: for v : mModel{Inputs|Outputs} : v.second
479 std::vector<uint32_t>* inputsOrOutputs,
480 // OUT: submodel input-or-output index to original model input-or-output index
481 std::vector<uint32_t>* inputOrOutputIndexSubModelToFromModel) {
482 std::map<uint32_t, uint32_t> fromModelIndexMap; // operand index to input-or-output index
483 for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
484 fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
485 }
486 for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
487 inputsOrOutputs->push_back(myInputOrOutput.second);
488 const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
489 inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
490 }
491 }
492
int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
494 int32_t executionPreference) {
495 nnAssert(mDevice != nullptr);
496 if (VLOG_IS_ON(COMPILATION)) {
497 logSubModel();
498 }
499
500 mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());
501
502 // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
503 // Output order: mModelOutputs, mTempsAsSubModelOutputs
504 //
505 // ExecutionPlan::next() depends on these orderings.
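    //
    // Illustrative example: a step with two model inputs, one temporary consumed from another
    // partition, one model output it reads from another partition, one model output it produces,
    // and one temporary it defines for later steps would be finished with
    //     inputs  = { modelIn0, modelIn1, tempIn0, outputAsIn0 }
    //     outputs = { modelOut0, tempOut0 }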
506
507 std::vector<uint32_t> inputs;
508 convertModelInputsOrOutputs(mModelInputs,
509 fromModel->inputCount(),
510 [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
511 &inputs,
512 &mInputIndexSubModelToFromModel);
513 for (const auto& subModelInput : mTempsAsSubModelInputs) {
514 inputs.push_back(subModelInput.second);
515 }
516 for (const auto& subModelInput : mOutputsAsSubModelInputs) {
517 inputs.push_back(subModelInput.second);
518 }
519
520 std::vector<uint32_t> outputs;
521 convertModelInputsOrOutputs(mModelOutputs,
522 fromModel->outputCount(),
523 [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
524 &outputs,
525 &mOutputIndexSubModelToFromModel);
526 for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
527 outputs.push_back(subModelOutput.second);
528 const Operand& operand = mSubModel.getOperand(subModelOutput.second);
529 if (operand.dimensions.size() == 0) {
530 *hasOutputOfUnknownSize = true;
531 } else {
532 for (uint32_t dimension : operand.dimensions) {
533 if (dimension == 0) {
534 *hasOutputOfUnknownSize = true;
535 break;
536 }
537 }
538 }
539 if (*hasOutputOfUnknownSize) {
540 VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
541 << " of original graph) has unknown size: " << toString(operand);
542 }
543 }
544
545 {
546 int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(), &outputs[0]);
547 if (n != ANEURALNETWORKS_NO_ERROR) {
548 return n;
549 }
550 n = mSubModel.finish();
551 if (n != ANEURALNETWORKS_NO_ERROR) {
552 return n;
553 }
554 }
555
556 {
557 // Compute mOutputsAsSubModelInputsIndexToFromModel.
558
559 std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
560 for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
561 fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
562 }
563
564 for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
565 const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
566 const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
567 if (it == fromModelOperandIndexToOutputIndex.end()) {
568 LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
569 << " in main model output operand list";
570 return ANEURALNETWORKS_BAD_STATE;
571 }
572 mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
573 }
574 }
575
576 // TODO: Move compilation elsewhere?
577 VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation on " << mDevice->getName();
578 return compile(mDevice, &mSubModel, executionPreference, *mPlan->getCacheDir(), &mToken,
579 &mPreparedSubModel);
580 }
581
void ExecutionStep::dump() const {
583 Model model;
584 mSubModel.setHidlModel(&model);
585 if (VLOG_IS_ON(COMPILATION)) {
586 VLOG(COMPILATION) << "ExecutionStep#" << mIndex << " for " << mDevice->getName();
587 logModelToInfo(model);
588 }
589 }
590
int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel,
592 int32_t executionPreference) {
593 findTempsAsSubModelOutputs();
594 for (const auto& step : mSteps) {
595 int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize,
596 executionPreference);
597 if (n != ANEURALNETWORKS_NO_ERROR) {
598 VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
599 return n;
600 }
601 }
602 if (mHasSubModelOutputOfUnknownSize) {
603 VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
604 return ANEURALNETWORKS_OP_FAILED;
605 }
606
607 mSuccessfulFinish = true;
608 return ANEURALNETWORKS_NO_ERROR;
609 }
610
int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel,
612 int32_t executionPreference) {
613 nnAssert(mDevice != nullptr);
614 VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
615 const int n =
616 compile(mDevice, mModel, executionPreference, *mCacheDir, &mToken, &mPreparedModel);
617 mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
618 return n;
619 }
620
int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) {
622 nnAssert(mBody != nullptr);
623 return mBody->finish(fromModel, executionPreference);
624 }
625
ExecutionPlan::Controller::Controller(
627 const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
628 const BurstBuilder* burstBuilder,
629 std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
630 uint32_t totalSizeOfTemporaries)
631 : mPlan(plan),
632 mExecutionBuilder(executionBuilder),
633 mBurstBuilder(burstBuilder),
634 mSubModelInputsAndOutputs(subModelInputsAndOutputs),
635 mNextStepIndex(0) {
636 if (totalSizeOfTemporaries) {
637 if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) {
638 LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
639 mNextStepIndex = kBadStepIndex;
640 }
641 }
642 }
643
644 // Attempt to create a burst object for each PreparedModel/Partition. If the
645 // burst controller object cannot be made, return a nullptr in its place to
// indicate the regular execution path should be used. This can occur either
// because the PreparedModel was nullptr (the CPU was the best choice), or because
// the IPreparedModel was of an insufficient version or failed to configure the burst.
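// The result is indexed to match the plan: one entry per ExecutionStep for a COMPOUND plan, or a
// single entry for a SIMPLE plan. ExecutionPlan::next() later retrieves the matching burst for a
// step via BurstBuilder::getControllerAt().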
std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts() const {
650 switch (mState) {
651 // burst object for each partition in the compound case
652 case COMPOUND: {
653 std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
654 bursts.reserve(compound()->mSteps.size());
655 for (const auto& step : compound()->mSteps) {
656 if (const auto preparedModel = step->getPreparedSubModel()) {
657 bursts.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
658 } else {
659 bursts.push_back(nullptr);
660 }
661 }
662 return bursts;
663 }
664 // single burst object for the simple case
665 case SIMPLE: {
666 std::vector<std::shared_ptr<ExecutionBurstController>> burst;
667 auto simpleBody = static_cast<const SimpleBody*>(mBody);
668 if (const auto preparedModel = simpleBody->mPreparedModel) {
669 burst.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
670 } else {
671 burst.push_back(nullptr);
672 }
673 return burst;
674 }
675 // no burst objects made
676 default:
677 return {};
678 }
679 }
680
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
682 ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
683 nnAssert(isValid());
684
    // Create the layout for a Memory object big enough to hold
686 // every TEMPORARY in the original model that is live across
687 // partition boundaries.
688 //
689 // TODO: Rethink this approach for managing temporaries. Some
690 // alternatives:
691 //
692 // 1) Adopt a memory layout scheme analogous to stack allocation,
693 // where objects of non-overlapping lifetime can occupy the same
694 // storage. We would still have a single Memory object in this
695 // case.
696 //
697 // 2) Do something like what CpuExecutor does, and do allocations
698 // and deallocations on the fly (during execution) before first
699 // reference and after last reference, respectively. This would
700 // mean having one Memory object per TEMPORARY; or, in a more
701 // complicated implementation, one Memory object per set of
702 // temporaries that have the same lifetime. Note that the Android
703 // system limits the number of shared memory objects, which are
704 // what our Memory objects represent.
705 //
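    // Illustrative sketch: each boundary-crossing temporary gets an offset into this single Memory
    // object, with any padding required by alignBytesNeeded() inserted before it;
    // subModelInputsAndOutputs then maps the original operand index to that offset, which
    // ExecutionPlan::next() uses to wire step inputs and outputs to the shared temporaries.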
706 uint32_t totalSizeOfTemporaries = 0;
707 std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
708 if (mState == COMPOUND) {
709 const ModelBuilder* fromModel = executionBuilder->getModel();
710 for (const auto& step : compound()->mSteps) {
711 for (const auto& output: step->getTempsAsSubModelOutputs()) {
712 const uint32_t fromModelOperandIndex = output.first;
713 const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
714 if (subModelInputsAndOutputs == nullptr) {
715 subModelInputsAndOutputs =
716 std::make_shared<Controller::SubModelInputsAndOutputsType>();
717 }
718 const uint32_t size = TypeManager::get()->getSizeOfData(fromModelOperand);
719 totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
720 subModelInputsAndOutputs->insert(std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
721 totalSizeOfTemporaries += size;
722 }
723 }
724 if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
725 for (const auto& io : *subModelInputsAndOutputs) {
726 VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first
727 << ", offset = " << io.second;
728 }
729 }
730 }
731
732 return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder,
733 subModelInputsAndOutputs,
734 totalSizeOfTemporaries));
735 }
736
737
738 // TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
740 std::shared_ptr<StepExecutor>* executor) const {
741 *executor = nullptr;
742
743 VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
744 << "): mNextStepIndex = " << controller->mNextStepIndex;
745
746 if (controller->mNextStepIndex == 0) {
747 // We haven't called next().
748 return ANEURALNETWORKS_OP_FAILED;
749 }
750
751 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
752 // The last call to next() did not produce an executor.
753 return ANEURALNETWORKS_OP_FAILED;
754 }
755
756 --controller->mNextStepIndex;
757 return next(controller, executor);
758 }
759
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
761 std::shared_ptr<StepExecutor>* executor,
762 std::shared_ptr<ExecutionBurstController>* burstController) const {
763 *executor = nullptr;
764 if (burstController != nullptr) {
765 *burstController = nullptr;
766 }
767
768 VLOG(EXECUTION) << "ExecutionPlan::next("
769 << SHOW_IF_DEBUG(controller << ", " << executor)
770 << "): mNextStepIndex = " << controller->mNextStepIndex;
771
772 if (controller->mNextStepIndex == Controller::kBadStepIndex) {
773 return ANEURALNETWORKS_OP_FAILED;
774 }
775
776 if (mState == EMPTY) {
777 nnAssert(controller->mNextStepIndex == 0); // end
778 controller->mNextStepIndex = Controller::kBadStepIndex;
779 return ANEURALNETWORKS_NO_ERROR;
780 }
781
782 if (mState == SIMPLE) {
783 if (controller->mNextStepIndex == 0) {
784 // First (and only) step.
785 auto simpleBody = static_cast<const SimpleBody*>(mBody);
786 *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
787 simpleBody->mModel, simpleBody->mDevice,
788 simpleBody->mPreparedModel);
789 (*executor)->mapInputsAndOutputsTrivially();
790 if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
791 *burstController = controller->mBurstBuilder->getControllerAt(0);
792 }
793 controller->mNextStepIndex = 1;
794 return ANEURALNETWORKS_NO_ERROR;
795 }
796
797 nnAssert(controller->mNextStepIndex == 1); // end
798 controller->mNextStepIndex = Controller::kBadStepIndex;
799 return ANEURALNETWORKS_NO_ERROR;
800 }
801
802 auto compoundBody = compound();
803
804 if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
805 // end
806 controller->mNextStepIndex = Controller::kBadStepIndex;
807 return ANEURALNETWORKS_NO_ERROR;
808 }
809
810 // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
811 // Output order: model outputs, temps as submodel outputs
812 //
813 // ExecutionStep::finishSubModel() establishes these orderings.
814
815 const auto step = compoundBody->mSteps[controller->mNextStepIndex];
816 *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getSubModel(),
817 step->getDevice(), step->getPreparedSubModel());
818 (*executor)->setExecutionStep(step);
819 step->mapInputsAndOutputs(*executor);
820 if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
821 *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
822 }
823 if (controller->mSubModelInputsAndOutputs != nullptr) {
824 {
825 // Tell executor about temps as submodel outputs.
826
827 const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
828 const auto& subModelOutputs = step->getTempsAsSubModelOutputs();
829
830 uint32_t idx = 0;
831 for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
832 const uint32_t fromModelOperandIndex = I->first;
833 const uint32_t offsetOfTemporary =
834 controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
835 int n = (*executor)->setOutputFromTemporaryMemory(
836 firstSubModelOutputIndex + idx,
837 &controller->mTemporaries,
838 offsetOfTemporary);
839 if (n != ANEURALNETWORKS_NO_ERROR) {
840 controller->mNextStepIndex = Controller::kBadStepIndex;
841 return n;
842 }
843 }
844 }
845 {
846 // Tell executor about temps as submodel inputs.
847
848 const size_t firstSubModelInputIndex = step->getModelInputs().size();
849 const auto& subModelInputs = step->getTempsAsSubModelInputs();
850
851 uint32_t idx = 0;
852 for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
853 const uint32_t fromModelOperandIndex = I->first;
854 const uint32_t offsetOfTemporary =
855 controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
856 int n = (*executor)->setInputFromTemporaryMemory(
857 firstSubModelInputIndex + idx,
858 &controller->mTemporaries,
859 offsetOfTemporary);
860 if (n != ANEURALNETWORKS_NO_ERROR) {
861 controller->mNextStepIndex = Controller::kBadStepIndex;
862 return n;
863 }
864 }
865 }
866 }
867 {
868 // Tell executor about outputs as submodel inputs.
869
870 const size_t firstOutputsAsSubModelInputIndex =
871 step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
872 const auto& outputsAsSubModelInputsIndexToFromModel =
873 step->getOutputsAsSubModelInputsIndexToFromModel();
874 for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
875 uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
876 (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
877 }
878 }
879
880 controller->mNextStepIndex++;
881 return ANEURALNETWORKS_NO_ERROR;
882 }
883
std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
885 nnAssert(mState != SIMPLE);
886 if (mState == EMPTY) {
887 mBody = new CompoundBody();
888 mState = COMPOUND;
889 }
890 auto& steps = compound()->mSteps;
891 auto step = std::make_shared<ExecutionStep>(this, steps.size(), device);
892 steps.push_back(step);
893 return step;
894 }
895
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
897 const ModelBuilder* model) {
898 nnAssert(mState == EMPTY);
899 mBody = new SimpleBody(device, model, mCacheDir, mToken);
900 mState = SIMPLE;
901 }
902
void ExecutionPlan::dump() const {
904 if (mBody) {
905 mBody->dump();
906 } else {
907 VLOG(COMPILATION) << "EMPTY";
908 }
909 }
910
void ExecutionPlan::reset() {
912 if (mBody) {
913 delete mBody;
914 mBody = nullptr;
915 }
916 mState = EMPTY;
917 }
918
ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
920 switch (mState) {
921 case EMPTY:
922 return Kind::EMPTY;
923 case SIMPLE:
924 nnAssert(mBody);
925 return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
926 case COMPOUND:
927 nnAssert(mBody);
928 return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
929 default:
930 nnAssert(!"unexpected state");
931 return Kind::ERROR;
932 }
933 }
934
std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
936 nnAssert(mState == SIMPLE);
937 return static_cast<const SimpleBody*>(mBody)->mDevice;
938 }
939
const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
941 return compound()->mSteps;
942 }
943
bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
945 return mBody->hasSubModelOutputsOfUnknownSize();
946 }
947
const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
949 CHECK(mState == SIMPLE)
950 << "Calling forTest_simpleGetCacheToken from execution plan with a non-SIMPLE body";
951 return static_cast<const SimpleBody*>(mBody)->mToken.getCacheToken();
952 }
953
void ExecutionPlan::SimpleBody::dump() const {
955 VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
956 }
957
void ExecutionPlan::CompoundBody::dump() const {
959 for (const auto& step : mSteps) {
960 step->dump();
961 }
962 }
963
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
965 uint32_t preference, ExecutionPlan* plan) const {
966 // This function uses a heuristic approach to partitioning the graph.
967 // It should be good enough for the first release.
968
969 const size_t deviceCount = devices.size();
970 const size_t operationCount = mOperations.size();
971
972 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
973 << ", operationCount = " << operationCount;
974
975 // Figure out where each operation will best execute.
976 // The value of the vector is the index in the devices vector.
977 std::vector<int> bestDeviceForOperation(operationCount);
978 NN_RETURN_IF_ERROR(
979 findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
980
981 // If one device will run all the operations, we don't need to split the work.
982 if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
983 std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
984 const int bestDeviceIndex = bestDeviceForOperation[0];
985 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
986 << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
987 plan->becomeSingleStep(devices[bestDeviceIndex], this);
988 return plan->finish(this, preference);
989 }
990
991 // No easy solution, we need to split the work.
992
993 // We keep track of the operations that are ready to run for each device.
994 std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);
995
996 // This helper function enqueues the operation on the appropriate queue.
997 auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
998 int deviceIndex = bestDeviceForOperation[operationIndex];
999 perDeviceQueue[deviceIndex].push(operationIndex);
1000 VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
1001 << deviceIndex;
1002 };
1003
1004 // This helper function finds a device that has operations ready to process.
1005 // We start by looking at the CPU. We do this to try to maximize the
1006 // size of the graph we'll send to non-CPU devices. If the CPU runs first,
1007 // it will have the chance to prepare more of the inputs required by the
1008 // other devices. This function returns -1 if all queues are empty.
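    // (This assumes the CPU device appears last in the devices vector, which is why the scan
    // below starts from the back.)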
1009 auto findNextDeviceToProcess = [&]() -> int {
1010 for (int i = deviceCount - 1; i >= 0; i--) {
1011 if (!perDeviceQueue[i].empty()) {
1012 return i;
1013 }
1014 }
1015 return -1;
1016 };
1017
1018 OperandTracker tracker(this, enqueueOnAppropriateDevice);
1019 // For each iteration of this loop, we'll create an execution step.
1020 while (true) {
1021 // Find the device we'll do this step for.
1022 int deviceIndex = findNextDeviceToProcess();
1023 VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
1024 if (deviceIndex < 0) {
1025 break;
1026 }
1027
1028 // Assign as much as possible to this device.
1029 std::shared_ptr<ExecutionStep> step = plan->createNewStep(devices[deviceIndex]);
1030 auto& queue = perDeviceQueue[deviceIndex];
1031 while (!queue.empty()) {
1032 uint32_t operationIndex = queue.front();
1033 queue.pop();
1034 int n = step->addOperation(operationIndex, *this);
1035 if (n != ANEURALNETWORKS_NO_ERROR) {
1036 LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
1037 return n;
1038 }
1039 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
1040 }
1041 }
1042
1043 int n = plan->finish(this, preference);
1044 if (VLOG_IS_ON(COMPILATION)) {
1045 Model model;
1046 setHidlModel(&model);
1047 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
1048 logModelToInfo(model);
1049 plan->dump();
1050 }
1051 return n;
1052 }
1053
PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
1055 uint32_t operationIndex) const {
1056 const Operation& operation = getOperation(operationIndex);
1057 // TODO This assumes that the type is dictated by the first operand. This is
1058 // currently the case but is not a safe assumption to make in the long term.
1059 const uint32_t operandIndex = operation.inputs[0];
1060 const OperandType operandType = mOperands[operandIndex].type;
1061 switch(operandType) {
1062 case OperandType::FLOAT32:
1063 if (mRelaxComputationFloat32toFloat16) {
1064 return device->getRelaxedFloat32toFloat16PerformanceScalar();
1065 }
1066 break;
1067 case OperandType::TENSOR_FLOAT32:
1068 if (mRelaxComputationFloat32toFloat16) {
1069 return device->getRelaxedFloat32toFloat16PerformanceTensor();
1070 }
1071 break;
1072 default:
1073 break;
1074 }
1075
1076 return device->getPerformance(operandType);
1077 }
1078
1079 namespace {
1080
1081 // Add an element to the end of the vector and return a pair consisting of the
1082 // index of the new element and a pointer to the new element.
1083 template <class T>
std::pair<uint32_t, T*> extend(hidl_vec<T>* vec) {
1085 size_t nextIndex = vec->size();
1086 vec->resize(nextIndex + 1);
1087 return {nextIndex, &(*vec)[nextIndex]};
1088 }
1089
1090 // Add an element to the end of the vector, set it to the specified value, and
1091 // return a pair consisting of the index of the new element and a pointer to the
1092 // new element.
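// Illustrative use (mirroring initializeSlice() below):
//     const uint32_t slicedOperandIndex =
//             extend(&slicedOperands, convertTo<SlicedOperand>(origOperand)).first;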
1093 template <class T>
std::pair<uint32_t, T*> extend(hidl_vec<T>* vec, const T& val) {
1095 auto extended = extend(vec);
1096 *extended.second = val;
1097 return extended;
1098 }
1099
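// This lexicographic operator< lets hidl_vec fields (e.g. Operand::dimensions and the per-channel
// quantization scales) be ordered by the Compare functor used for the operand "type" map in
// initializeSlice() below.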
1100 template <typename T>
bool operator<(const hidl_vec<T>& a, const hidl_vec<T>& b) {
1102 return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
1103 }
1104
1105 // Compile-time mapping from a particular Model type to a name for that type.
1106 template <class T_Model>
1107 struct ModelVersion;
1108 template <>
1109 struct ModelVersion<V1_0::Model> {
1110 static constexpr char name[] = "V1_0";
1111 };
1112 template <>
1113 struct ModelVersion<V1_1::Model> {
1114 static constexpr char name[] = "V1_1";
1115 };
1116 template <>
1117 struct ModelVersion<V1_2::Model> {
1118 static constexpr char name[] = "V1_2";
1119 };
1120
1121 // Dispatcher mechanism for calling an appropriate uncheckedConvertToV1_*
1122 // given the desired return type.
1123 template <typename T_ReturnType>
1124 T_ReturnType uncheckedConvertTo(OperationType type);
1125 template <>
V1_0::OperationType uncheckedConvertTo<V1_0::OperationType>(OperationType type) {
1127 return uncheckedConvertToV1_0(type);
1128 }
1129 template <>
V1_1::OperationType uncheckedConvertTo<V1_1::OperationType>(OperationType type) {
1131 return uncheckedConvertToV1_1(type);
1132 }
1133
1134 // Dispatcher mechanism for calling an appropriate convertToV1_* given the
1135 // desired return type. Note that there is no V1_1::Operand type.
1136 template <typename T_ReturnType>
1137 T_ReturnType convertTo(Operand operand);
1138 template <>
V1_0::Operand convertTo<V1_0::Operand>(Operand operand) {
1140 return convertToV1_0(operand);
1141 }
1142
1143 // Dispatcher mechanism for calling an appropriate compliantWithV1_* given the
1144 // desired target model type.
1145 template <typename T_SlicedModel>
1146 void getNoncompliantOperations(const V1_2::Model& model,
1147 std::set<uint32_t>* noncompliantOperations);
1148 template <>
void getNoncompliantOperations<V1_0::Model>(const V1_2::Model& model,
1150 std::set<uint32_t>* noncompliantOperations) {
1151 compliantWithV1_0(model, noncompliantOperations);
1152 }
1153 template <>
void getNoncompliantOperations<V1_1::Model>(const V1_2::Model& model,
1155 std::set<uint32_t>* noncompliantOperations) {
1156 compliantWithV1_1(model, noncompliantOperations);
1157 }
1158
1159 class PlanModelSlicer : public IModelSlicer {
1160 public:
1161 PlanModelSlicer(const ModelBuilder* model);
1162
    std::optional<std::pair<V1_0::Model, std::function<uint32_t(uint32_t)>>> getSliceV1_0()
1164 override {
1165 return getSlice(&mSliceV1_0);
1166 }
    std::optional<std::pair<V1_1::Model, std::function<uint32_t(uint32_t)>>> getSliceV1_1()
1168 override {
1169 return getSlice(&mSliceV1_1);
1170 }
1171
    const Model& getModel() const { return mHidlModel; }
1173
1174 private:
1175 template <class T_SlicedModel>
1176 static bool invalid(const T_SlicedModel& model);
1177
1178 enum class SliceState { UNINITIALIZED, INVALID, NORMAL };
1179 template <class T_SlicedModel>
1180 struct Slice {
1181 SliceState mState = SliceState::UNINITIALIZED;
1182 T_SlicedModel mHidlModel;
1183 std::vector<uint32_t> mSlicedOperationIndexToOrigIndex;
1184 };
1185 Slice<V1_0::Model> mSliceV1_0;
1186 Slice<V1_1::Model> mSliceV1_1;
1187
1188 template <class T_SlicedModel>
1189 void initializeSlice(Slice<T_SlicedModel>* slice);
1190
1191 template <class T_SlicedModel>
    std::optional<std::pair<T_SlicedModel, std::function<uint32_t(uint32_t)>>> getSlice(
1193 Slice<T_SlicedModel>* slice) {
1194 CHECK(slice != nullptr);
1195 if (slice->mState == SliceState::UNINITIALIZED) {
1196 initializeSlice(slice);
1197 }
1198 if (slice->mState == SliceState::INVALID) {
1199 return {};
1200 }
1201 return std::pair<T_SlicedModel, std::function<uint32_t(uint32_t)>>(
1202 slice->mHidlModel, [slice](uint32_t slicedOperationIndex) {
1203 return slice->mSlicedOperationIndexToOrigIndex.at(slicedOperationIndex);
1204 });
1205 }
1206
1207 Model mHidlModel;
1208 };
1209
1210 template <class T_SlicedModel>
bool PlanModelSlicer::invalid(const T_SlicedModel& model) {
1212 // A model must have at least one operation. However, it's possible that a
1213 // slice has no operations (because no operations from the original model
1214 // are compliant with the sliced model type). In this case, the sliced
1215 // model would be invalid.
1216 const bool looksEmpty = (model.operations.size() == 0);
1217 if (DeviceManager::get()->strictSlicing()) {
1218 CHECK_EQ(looksEmpty, (model.operands.size() == 0));
1219 }
1220 if (looksEmpty) return true;
1221
1222 // A model must have at least one output. However, it's possible for a
1223 // model to contain dead operations (i.e., outputs on which no model outputs
1224 // are data dependent). A slice might contain only dead operations, and
1225 // hence have no model outputs. In this case, the sliced model would be
1226 // invalid.
1227 if (model.outputIndexes.size() == 0) return true;
1228
1229 // We shouldn't have to check whether the model is valid.
1230 // However, it could be invalid if:
1231 // - there is an error in the slicing algorithm; or
1232 // - there is an error in compliantWith (see http://b/131845106)
1233 if (!validateModel(model)) {
1234 LOG(WARNING) << "Sliced model fails validateModel()";
1235 CHECK(!DeviceManager::get()->strictSlicing());
1236 return true;
1237 }
1238
1239 return false;
1240 }
1241
PlanModelSlicer::PlanModelSlicer(const ModelBuilder* model) {
1243 model->setHidlModel(&mHidlModel);
1244 }
1245
1246 template <class T_SlicedModel>
void PlanModelSlicer::initializeSlice(Slice<T_SlicedModel>* slice) {
1248 using SlicedOperand = std::remove_pointer_t<decltype(slice->mHidlModel.operands.data())>;
1249 using SlicedOperation = std::remove_pointer_t<decltype(slice->mHidlModel.operations.data())>;
1250 using SlicedOperationType = decltype(SlicedOperation::type);
1251
1252 CHECK(slice->mState == SliceState::UNINITIALIZED);
1253
1254 const auto& origOperands = mHidlModel.operands;
1255 const auto& origOperations = mHidlModel.operations;
1256 auto& slicedOperands = slice->mHidlModel.operands;
1257 auto& slicedOperations = slice->mHidlModel.operations;
1258
1259 // Indexes of elements of noncompliant origOperations
1260 std::set<uint32_t> noncompliantOperations;
1261 getNoncompliantOperations<T_SlicedModel>(mHidlModel, &noncompliantOperations);
1262
1263 // Map from an operand index in origOperands to the corresponding operand index in
1264 // slicedOperands
1265 std::map<uint32_t, uint32_t> origOperandIndexToSlicedIndex;
1266
1267 // Collect the operand indexes of every operand that is an input to a
1268 // compliant operation. If the operand is a CONSTANT_* or a NO_VALUE, copy
1269 // it to the sliced model and update origOperandIndexToSlicedIndex
1270 // accordingly. Otherwise, we'll deal with the operand in the subsequent
1271 // "Main loop", where we process operation outputs (intermediates and model
1272 // outputs).
1273 std::set<uint32_t> inputOperandIndexesOfCompliantOperations;
1274 for (uint32_t origOperationIndex = 0; origOperationIndex < origOperations.size();
1275 ++origOperationIndex) {
1276 if (noncompliantOperations.count(origOperationIndex)) {
1277 continue;
1278 }
1279 for (uint32_t input : origOperations[origOperationIndex].inputs) {
1280 if (inputOperandIndexesOfCompliantOperations.insert(input).second) {
1281 const Operand& origOperand = origOperands[input];
1282 switch (origOperand.lifetime) {
1283 case OperandLifeTime::CONSTANT_COPY:
1284 case OperandLifeTime::CONSTANT_REFERENCE:
1285 case OperandLifeTime::NO_VALUE: {
1286 const uint32_t slicedOperandIndex =
1287 extend(&slicedOperands, convertTo<SlicedOperand>(origOperand))
1288 .first;
1289 slicedOperands[slicedOperandIndex].numberOfConsumers = 0;
1290 origOperandIndexToSlicedIndex[input] = slicedOperandIndex;
1291 VLOG(COMPILATION) << "origOperandIndexToSlicedIndex initialization created "
1292 << input << " -> " << slicedOperandIndex << ": "
1293 << toString(slicedOperands[slicedOperandIndex]);
1294 break;
1295 }
1296 default:
1297 break;
1298 }
1299 }
1300 }
1301 }
1302
1303 // For each output operand of a noncompliant operation that is the input
1304 // operand of at least one compliant operation, we will ensure that there is
1305 // a sliced model input whose "type" is that of the output operand. This is
1306 // a map from output operand "type" (in the original model) to model input
1307 // operand index (in the sliced model). Unfortunately, there is no
1308 // representation of operand "type" defined in the HAL that we can use
1309 // naively here -- we want (OperandType, dimensions, scale, zeroPoint,
1310 // extraParams), but these fields exist in Operand along with other fields
1311 // that need to be excluded from the map key (numberOfConsumers, lifetime,
1312 // location). There are several choices:
1313 // - Don't have a map -- each output identified above gets its own sliced
1314 // model input (no sharing of sliced model inputs).
1315 // - Create an operand "type" representation solely for use as a map key.
1316 // - Write a tailored comparison function that ignores the excluded fields.
1317 // We choose to write a tailored comparison function. If Treble were to
1318 // generate a comparison function for us (http://b/130567619) then it might
1319 // be better to instead reset the excluded fields to canonical values --
1320 // then we could use the Treble provided comparison function, and the
1321 // solution would be robust (in a correctness sense, not a sharing sense) if
1322 // more fields are added and we neglect to canonicalize them.
1323 //
1324 // We also use this map for model input operands of the original model that
1325 // become input operands of the sliced model. This means that an original
1326 // model input operand might be coalesced with other original model input
1327 // operands and/or with original model temporary operands.
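// As a rough illustration of the "canonical values" alternative described
// above (a hypothetical helper, not used by this implementation), the
// excluded fields would be reset before the operand is used as a map key:
//
//     Operand canonicalKey(Operand operand) {
//         operand.numberOfConsumers = 0;
//         operand.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
//         operand.location = {};
//         return operand;
//     }
//
// A Treble-generated comparison (or hash) could then be applied to the
// canonicalized keys directly.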
1328 class OrigOperandToSlicedInputOperandIndex {
1329 public:
1330 OrigOperandToSlicedInputOperandIndex(hidl_vec<SlicedOperand>* slicedOperands,
1331 hidl_vec<uint32_t>* slicedInputIndexes)
1332 : mSlicedOperands(*slicedOperands), mSlicedInputIndexes(*slicedInputIndexes) {}
1333
1334 // Given an operand from the original model, return the index of the
1335 // corresponding model input operand from the sliced model. Creates a
1336 // new operand in the sliced model if necessary.
1337 uint32_t getIndex(Operand operand) {
1338 // Lookup
1339 auto it = mMap.find(operand);
1340 if (it != mMap.end()) {
1341 VLOG(COMPILATION) << "OrigOperandToSlicedInputOperandIndex::getIndex looked for "
1342 << toString(operand) << " and found " << it->second << ": "
1343 << toString(it->first);
1344 return it->second;
1345 }
1346
1347 // Create
1348 operand.numberOfConsumers = 0;
1349 operand.lifetime = OperandLifeTime::MODEL_INPUT;
1350 operand.location = {};
1351 uint32_t slicedOperandIndex =
1352 extend(&mSlicedOperands, convertTo<SlicedOperand>(operand)).first;
1353 mMap[operand] = slicedOperandIndex;
1354 extend(&mSlicedInputIndexes, slicedOperandIndex);
1355 VLOG(COMPILATION) << "OrigOperandToSlicedInputOperandIndex::getIndex created "
1356 << slicedOperandIndex << ": " << toString(operand);
1357 return slicedOperandIndex;
1358 }
1359
1360 private:
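// Tailored comparator (see the discussion above): provides a strict weak
// ordering over only the operand "type" fields -- type, dimensions, scale,
// zeroPoint, extraParams -- so that operands differing solely in
// numberOfConsumers, lifetime, or location map to the same entry in mMap.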
1361 class Compare {
1362 public:
1363 bool operator()(const Operand& a, const Operand& b) const {
1364 if (a.type != b.type) {
1365 return a.type < b.type;
1366 }
1367 if (a.dimensions != b.dimensions) {
1368 return a.dimensions < b.dimensions;
1369 }
1370 if (a.scale != b.scale) {
1371 return a.scale < b.scale;
1372 }
1373 if (a.zeroPoint != b.zeroPoint) {
1374 return a.zeroPoint < b.zeroPoint;
1375 }
1376 return compare(a.extraParams, b.extraParams);
1377 }
1378
1379 private:
1380 static bool compare(const SymmPerChannelQuantParams& a,
1381 const SymmPerChannelQuantParams& b) {
1382 if (a.scales != b.scales) {
1383 return a.scales < b.scales;
1384 }
1385 return a.channelDim < b.channelDim;
1386 }
1387
1388 static bool compare(const Operand::ExtraParams& a, const Operand::ExtraParams& b) {
1389 if (a.getDiscriminator() != b.getDiscriminator()) {
1390 return a.getDiscriminator() < b.getDiscriminator();
1391 }
1392
1393 switch (a.getDiscriminator()) {
1394 default:
1395 CHECK(false) << "Unexpected";
1396 FALLTHROUGH_INTENDED;
1397 case Operand::ExtraParams::hidl_discriminator::none:
1398 return false;
1399
1400 case Operand::ExtraParams::hidl_discriminator::channelQuant:
1401 return compare(a.channelQuant(), b.channelQuant());
1402
1403 case Operand::ExtraParams::hidl_discriminator::extension:
1404 return a.extension() < b.extension();
1405 }
1406 }
1407 };
1408 std::map<Operand, uint32_t, Compare> mMap;
1409 hidl_vec<SlicedOperand>& mSlicedOperands;
1410 hidl_vec<uint32_t>& mSlicedInputIndexes;
1411 } origOperandToSlicedInputOperandIndex(&slicedOperands, &slice->mHidlModel.inputIndexes);
1412
1413 // An input of the original model is an input of the sliced model if and
1414 // only if it is consumed by at least one compliant operation. Note that in
1415 // the sliced model we share all model inputs of the same "type"; and that
1416 // we may later add model inputs to the sliced model.
1417 for (uint32_t origInputIndex : mHidlModel.inputIndexes) {
1418 if (inputOperandIndexesOfCompliantOperations.count(origInputIndex)) {
1419 const uint32_t slicedIndex =
1420 origOperandToSlicedInputOperandIndex.getIndex(origOperands[origInputIndex]);
1421 origOperandIndexToSlicedIndex[origInputIndex] = slicedIndex;
1422 VLOG(COMPILATION) << "origOperandIndexToSlicedIndex inputIndexes processing created "
1423 << origInputIndex << " -> " << slicedIndex << ": "
1424 << toString(slicedOperands[slicedIndex]);
1425 }
1426 }
1427
1428 // Main loop: Process each operation of the original model.
1429 for (uint32_t origOperationIndex = 0; origOperationIndex < origOperations.size();
1430 ++origOperationIndex) {
1431 const Operation& origOperation = origOperations[origOperationIndex];
1432
1433 if (noncompliantOperations.count(origOperationIndex)) {
1434 for (uint32_t output : origOperation.outputs) {
1435 if (!inputOperandIndexesOfCompliantOperations.count(output)) {
1436 continue;
1437 }
1438 const uint32_t slicedIndex =
1439 origOperandToSlicedInputOperandIndex.getIndex(origOperands[output]);
1440 origOperandIndexToSlicedIndex[output] = slicedIndex;
1441 VLOG(COMPILATION)
1442 << "origOperandIndexToSlicedIndex noncompliant output processing created "
1443 << output << " -> " << slicedIndex << ": "
1444 << toString(slicedOperands[slicedIndex]);
1445 }
1446 } else {
1447 slice->mSlicedOperationIndexToOrigIndex.push_back(origOperationIndex);
1448 SlicedOperation& slicedOperation = *extend(&slicedOperations).second;
1449 CHECK(slice->mSlicedOperationIndexToOrigIndex.size() == slicedOperations.size());
1450
1451 slicedOperation.type = uncheckedConvertTo<SlicedOperationType>(origOperation.type);
1452
1453 // Model is topologically sorted, so all inputs must be present in
1454 // origOperandIndexToSlicedIndex, and no outputs may be.
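// (Each input is either a CONSTANT_*/NO_VALUE operand handled during
// initialization, a model input or noncompliant-operation output handled
// above, or the output of an earlier compliant operation; and each operand
// is written by exactly one operation, so its output mapping is created
// only here.)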
1455
1456 // Operation inputs
1457 // - Fill in slicedOperation.inputs
1458 // - Update number of consumers for each input operand
1459 slicedOperation.inputs.resize(origOperation.inputs.size());
1460 std::transform(
1461 origOperation.inputs.begin(), origOperation.inputs.end(),
1462 slicedOperation.inputs.begin(),
1463 [&origOperandIndexToSlicedIndex, &slicedOperands](uint32_t origOperandIndex) {
1464 uint32_t slicedOperandIndex =
1465 origOperandIndexToSlicedIndex.at(origOperandIndex);
1466 slicedOperands[slicedOperandIndex].numberOfConsumers++;
1467 VLOG(COMPILATION) << "origOperandIndexToSlicedIndex compliant input "
1468 "processing created "
1469 << origOperandIndex << " -> " << slicedOperandIndex
1470 << ": " << toString(slicedOperands[slicedOperandIndex]);
1471 return slicedOperandIndex;
1472 });
1473
1474 // Operation outputs
1475 // - Add new operands to slicedOperands
1476 // - Update origOperandIndexToSlicedIndex
1477 // - Fill in slicedOperation.outputs
1478 // - Record as a model output, if necessary
1479 const uint32_t firstOutputSlicedOperandIndex = slicedOperands.size();
1480 slicedOperands.resize(firstOutputSlicedOperandIndex + origOperation.outputs.size());
1481 slicedOperation.outputs.resize(origOperation.outputs.size());
1482 for (uint32_t outputNum = 0; outputNum < slicedOperation.outputs.size(); ++outputNum) {
1483 uint32_t origOperandIndex = origOperation.outputs[outputNum];
1484 uint32_t slicedOperandIndex = firstOutputSlicedOperandIndex + outputNum;
1485 auto& slicedOperand = slicedOperands[slicedOperandIndex];
1486 const auto& origOperand = origOperands[origOperandIndex];
1487 slicedOperand = convertTo<SlicedOperand>(origOperand);
1488 slicedOperand.numberOfConsumers = 0;
1489
1490 CHECK(origOperandIndexToSlicedIndex.count(origOperandIndex) == 0);
1491 origOperandIndexToSlicedIndex[origOperandIndex] = slicedOperandIndex;
1492 slicedOperation.outputs[outputNum] = slicedOperandIndex;
1493
1494 if (!inputOperandIndexesOfCompliantOperations.count(origOperandIndex) &&
1495 origOperand.numberOfConsumers) {
1496 // Was consumed only by noncompliant operations; convert to
1497 // an output of the sliced model.
1498 slicedOperand.lifetime = OperandLifeTime::MODEL_OUTPUT;
1499 }
1500
1501 VLOG(COMPILATION) << "origOperandIndexToSlicedIndex compliant output created "
1502 << origOperandIndex << " -> " << slicedOperandIndex << ": "
1503 << toString(slicedOperand);
1504
1505 if (slicedOperand.lifetime == OperandLifeTime::MODEL_OUTPUT) {
1506 extend(&slice->mHidlModel.outputIndexes, slicedOperandIndex);
1507 }
1508 }
1509 }
1510 }
1511
1512 // To keep things simple, we copy over these fields as-is. We could instead
1513 // opt to regenerate them based on the operands present in the sliced model.
1514 // This would be more complex and probably take more computation time, but
1515 // it would reduce the size of the sliced model, and hence the time spent
1516 // copying it around and passing it across the HAL interface.
1517 slice->mHidlModel.operandValues = mHidlModel.operandValues;
1518 slice->mHidlModel.pools = mHidlModel.pools;
1519
1520 if (VLOG_IS_ON(COMPILATION)) {
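// Note: std::ostrstream::str() freezes the stream's internal buffer;
// freeze(false) releases it again so the destructor can deallocate it.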
1521 {
1522 std::ostrstream fromName;
1523 fromName << "Slice: From " << ModelVersion<decltype(mHidlModel)>::name << std::ends;
1524 graphDump(fromName.str(), mHidlModel);
1525 fromName.freeze(false);
1526 }
1527 {
1528 std::ostrstream toName;
1529 toName << "Slice: To " << ModelVersion<decltype(slice->mHidlModel)>::name << std::ends;
1530 graphDump(toName.str(), convertToV1_2(slice->mHidlModel));
1531 toName.freeze(false);
1532 }
1533 }
1534
1535 slice->mState = invalid(slice->mHidlModel) ? SliceState::INVALID : SliceState::NORMAL;
1536 }
1537
1538 // This class determines whether a given device can execute a given operation
1539 class CanDo {
1540 public:
1541 CanDo() {}
1542
1543 void initialize(PlanModelSlicer* slicer, std::shared_ptr<Device> device) {
1544 device->getSupportedOperations(slicer->getModel(), slicer, &mSupportsOperationByIndex);
1545 }
1546
1547 bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
1548
1549 private:
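// Per-operation support flags as reported by the device, indexed by
// operation index in the original (unsliced) model.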
1550 hidl_vec<bool> mSupportsOperationByIndex;
1551 };
1552
1553 }  // anonymous namespace
1554
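// Worked example (illustrative numbers only): with
// ANEURALNETWORKS_PREFER_LOW_POWER and two capable devices for an operation,
// deviceA {execTime = 1.0, powerUsage = 4.0} and deviceB {execTime = 2.0,
// powerUsage = 3.0}, the comparison below uses powerUsage, so deviceB wins
// (3.0 < 4.0); under any other preference it uses execTime, so deviceA wins
// (1.0 < 2.0). On an exact tie, the CPU device is preferred.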
1555 int ModelBuilder::findBestDeviceForEachOperation(
1556 uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
1557 std::vector<int>* bestDeviceForOperation) const {
1558 PlanModelSlicer slicer(this);
1559 const size_t deviceCount = devices.size();
1560 std::vector<CanDo> canDo(deviceCount);
1561 for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
1562 canDo[deviceIndex].initialize(&slicer, devices[deviceIndex]);
1563 }
1564
1565 // Figure out the best driver for each operation.
1566 const size_t operationCount = mOperations.size();
1567 for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
1568 // Find which device, including CPU fallback, gives the best performance for this operation.
1569 int bestChoice = -1;
1570 float bestPerfVal = 0.0; // Do not check bestPerfVal if bestChoice < 0.
1571 for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
1572 const auto& device = devices[deviceIndex];
1573 if (canDo[deviceIndex].check(operationIndex)) {
1574 const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
1575 const float perfVal =
1576 (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
1577 : perf.execTime);
1578 if (bestChoice < 0 || perfVal < bestPerfVal ||
1579 (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
1580 bestChoice = deviceIndex;
1581 bestPerfVal = perfVal;
1582 }
1583 } else {
1584 // Somewhat noisy logging, but this is the only place where the user of
1585 // NNAPI can get feedback on why an operation was not run on a
1586 // specific device.
1587 // Logs O(operationCount * deviceCount) times, but
1588 // typically deviceCount is very small.
1589 VLOG(COMPILATION) << "Device " << device->getName()
1590 << " can't do operation "
1591 << toString(getOperation(operationIndex).type);
1592 }
1593 }
1594 if (bestChoice < 0) {
1595 LOG(ERROR) << "No driver can do the op";
1596 return ANEURALNETWORKS_BAD_DATA;
1597 }
1598
1599 (*bestDeviceForOperation)[operationIndex] = bestChoice;
1600 VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
1601 << toString(getOperation(operationIndex).type) << ") = " << bestChoice
1602 << " (" << devices[bestChoice]->getName() << ")";
1603 }
1604 return ANEURALNETWORKS_NO_ERROR;
1605 }
1606
1607 } // namespace nn
1608 } // namespace android
1609